changeset 9590:4211c5fe1e0c

Merge with AVX2 optimization branch
author Steve Borho <steve@borho.org>
date Thu, 26 Feb 2015 11:25:55 -0600
parents 2e25084cd441 (current diff) 04861917b7b3 (diff)
children a183003fb969
files
diffstat 21 files changed, 10736 insertions(+), 640 deletions(-)
--- a/source/common/dct.cpp	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/dct.cpp	Thu Feb 26 11:25:55 2015 -0600
@@ -709,14 +709,12 @@ uint32_t nquant_c(const int16_t* coef, c
 
     return numSig;
 }
-
-int  count_nonzero_c(const int16_t* quantCoeff, int numCoeff)
+template<int trSize>
+int  count_nonzero_c(const int16_t* quantCoeff)
 {
     X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
-    X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
-
     int count = 0;
-
+    int numCoeff = trSize * trSize;
     for (int i = 0; i < numCoeff; i++)
     {
         count += quantCoeff[i] != 0;
@@ -775,8 +773,11 @@ void setupDCTPrimitives_c(EncoderPrimiti
     p.cu[BLOCK_8x8].idct   = idct8_c;
     p.cu[BLOCK_16x16].idct = idct16_c;
     p.cu[BLOCK_32x32].idct = idct32_c;
-    p.count_nonzero = count_nonzero_c;
     p.denoiseDct = denoiseDct_c;
+    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
+    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
+    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
+    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
 
     p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
     p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
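
For reference, templating count_nonzero_c on trSize (instead of passing numCoeff at run time) makes the coefficient count a compile-time constant, so each BLOCK_NxN entry gets its own specialization that the compiler can unroll and vectorize. A minimal standalone sketch of the pattern (the names count_nonzero_sketch and countNonZero below are illustrative, not part of the patch):

#include <cstdint>

// Sketch: trSize is a template parameter, so trSize * trSize is known
// to the optimizer and the loop bound is fixed per specialization.
template<int trSize>
int count_nonzero_sketch(const int16_t* quantCoeff)
{
    int count = 0;
    for (int i = 0; i < trSize * trSize; i++)
        count += quantCoeff[i] != 0;
    return count;
}

// Per-block-size dispatch, mirroring the p.cu[BLOCK_NxN].count_nonzero
// assignments above.
typedef int (*count_nonzero_fn)(const int16_t*);
static const count_nonzero_fn countNonZero[4] =
{
    count_nonzero_sketch<4>,    // BLOCK_4x4
    count_nonzero_sketch<8>,    // BLOCK_8x8
    count_nonzero_sketch<16>,   // BLOCK_16x16
    count_nonzero_sketch<32>    // BLOCK_32x32
};
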
--- a/source/common/primitives.h	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/primitives.h	Thu Feb 26 11:25:55 2015 -0600
@@ -136,8 +136,7 @@ typedef uint32_t (*quant_t)(const int16_
 typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
 typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
-typedef int  (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
-
+typedef int (*count_nonzero_t)(const int16_t* quantCoeff);
 typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
@@ -227,7 +226,7 @@ struct EncoderPrimitives
         pixel_add_ps_t  add_ps;
         blockfill_s_t   blockfill_s;   // block fill, for DC transforms
         copy_cnt_t      copy_cnt;      // copy coeff while counting non-zero
-
+        count_nonzero_t count_nonzero;
         cpy2Dto1D_shl_t cpy2Dto1D_shl;
         cpy2Dto1D_shr_t cpy2Dto1D_shr;
         cpy1Dto2D_shl_t cpy1Dto2D_shl;
@@ -262,9 +261,7 @@ struct EncoderPrimitives
     nquant_t              nquant;
     dequant_scaling_t     dequant_scaling;
     dequant_normal_t      dequant_normal;
-    count_nonzero_t       count_nonzero;
     denoiseDct_t          denoiseDct;
-
     scale_t               scale1D_128to64;
     scale_t               scale2D_64to32;
 
--- a/source/common/quant.cpp	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/quant.cpp	Thu Feb 26 11:25:55 2015 -0600
@@ -488,9 +488,7 @@ void Quant::invtransformNxN(int16_t* res
     else
     {
         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
-
-        X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
-
+        X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n");
         // DC only
         if (numSig == 1 && coeff[0] != 0 && !useDST)
         {
@@ -527,13 +525,10 @@ uint32_t Quant::rdoQuant(const CUData& c
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 
     int numCoeff = 1 << (log2TrSize * 2);
-
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
-
-    X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n");
+    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
     if (!numSig)
         return 0;
-
     uint32_t trSize = 1 << log2TrSize;
     int64_t lambda2 = m_qpParam[ttype].lambda2;
     int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda);
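
The cu[log2TrSize - 2] index used in the two checks above maps the transform sizes 4, 8, 16 and 32 (log2TrSize = 2..5) onto BLOCK_4x4 through BLOCK_32x32. A small hypothetical helper spelling out that mapping:

#include <cstdint>

// Illustrative only; the encoder indexes primitives.cu[] directly.
//   4x4  : log2TrSize = 2 -> 0 (BLOCK_4x4)
//   8x8  : log2TrSize = 3 -> 1 (BLOCK_8x8)
//   16x16: log2TrSize = 4 -> 2 (BLOCK_16x16)
//   32x32: log2TrSize = 5 -> 3 (BLOCK_32x32)
static inline int cuBlockIndex(uint32_t log2TrSize)
{
    return (int)log2TrSize - 2;
}
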
--- a/source/common/x86/asm-primitives.cpp	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Feb 26 11:25:55 2015 -0600
@@ -179,7 +179,6 @@ extern "C" {
     p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef x265_ ## fname ## _8x32_ ## cpu
 #define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
 
-
 #define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim   = fncdef x265_ ## fname ## _4x8_ ## cpu; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim  = fncdef x265_ ## fname ## _8x16_ ## cpu; \
@@ -895,7 +894,10 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.dst4x4 = x265_dst4_ssse3;
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
-        p.count_nonzero = x265_count_nonzero_ssse3;
+        p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3;
+        p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_ssse3;
+        p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_ssse3;
+        p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_ssse3;
         p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
@@ -932,6 +934,9 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
 
+        // TODO: check POPCNT flag!
+        ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
+
 #if X86_64
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
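
Regarding the TODO above: the SSE4 copy_cnt kernels presumably rely on the POPCNT instruction, which has its own CPUID bit, so the assignment would ideally be gated on that flag as well. A hedged sketch of such a guard (the X265_CPU_POPCNT flag name is an assumption here, not confirmed by this patch):

// Hypothetical guard; only assign the popcount-based kernels when the
// POPCNT feature bit is reported alongside SSE4.
if ((cpuMask & X265_CPU_SSE4) && (cpuMask & X265_CPU_POPCNT))
{
    ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
}
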
@@ -940,11 +945,22 @@ void setupAssemblyPrimitives(EncoderPrim
     if (cpuMask & X265_CPU_AVX)
     {
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx; fails tests
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = x265_pixel_satd_16x24_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = x265_pixel_satd_32x48_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = x265_pixel_satd_24x64_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = x265_pixel_satd_8x64_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = x265_pixel_satd_8x12_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = x265_pixel_satd_12x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = x265_pixel_satd_4x32_avx;
+
         ALL_LUMA_PU(satd, pixel_satd, avx);
         ASSIGN_SA8D(avx);
         LUMA_VAR(avx);
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
         p.ssim_end_4 = x265_pixel_ssim_end4_avx;
+
+        // copy_pp primitives
+        // 16 x N
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
         p.pu[LUMA_16x4].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x4_avx;
         p.pu[LUMA_16x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x8_avx;
@@ -964,6 +980,72 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x16_avx;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x24_avx;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x32_avx;
+
+        // 24 X N
+        p.pu[LUMA_24x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x32_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x64_avx;
+
+        // 32 x N
+        p.pu[LUMA_32x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x8_avx;
+        p.pu[LUMA_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx;
+        p.pu[LUMA_32x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x24_avx;
+        p.pu[LUMA_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx;
+        p.pu[LUMA_32x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x64_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x8_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x24_avx;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x48_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x64_avx;
+
+        // 48 X 64
+        p.pu[LUMA_48x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_48x64_avx;
+
+        // copy_ss primitives
+        // 16 X N
+        p.cu[BLOCK_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
+
+        // 32 X N
+        p.cu[BLOCK_32x32].copy_ss = x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = x265_blockcopy_ss_32x64_avx;
+
+        // 64 X N
+        p.cu[BLOCK_64x64].copy_ss = x265_blockcopy_ss_64x64_avx;
+
+        // copy_ps primitives
+        // 16 X N
+        p.cu[BLOCK_16x16].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x32_avx;
+
+        // 32 X N
+        p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x64_avx;
+
+        // 64 X N
+        p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)x265_blockcopy_ss_64x64_avx;
+
+        // copy_sp primitives
+        // 16 X N
+        p.cu[BLOCK_16x16].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x16_avx;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x32_avx;
+
+        // 32 X N
+        p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x32_avx;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x64_avx;
+
+        // 64 X N
+        p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)x265_blockcopy_ss_64x64_avx;
+
         p.frameInitLowres = x265_frame_init_lowres_core_avx;
     }
     if (cpuMask & X265_CPU_XOP)
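
Note on the copy_pp/copy_ps/copy_sp assignments above: they reuse the blockcopy_ss kernels through function-pointer casts, which is only sound in this HIGH_BIT_DEPTH branch of setupAssemblyPrimitives, where pixel is a 16-bit type and every copy variant therefore reduces to the same strided copy of 16-bit elements. A plain C reference of that shared operation (the helper name is illustrative):

#include <cstdint>
#include <cstring>

// What each cast copy primitive performs when sizeof(pixel) == sizeof(int16_t):
// a row-by-row copy between strided 16-bit buffers.
static void blockcopy_ss_ref(int16_t* dst, intptr_t dstStride,
                             const int16_t* src, intptr_t srcStride,
                             int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        memcpy(dst, src, width * sizeof(int16_t));
        dst += dstStride;
        src += srcStride;
    }
}
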
@@ -979,12 +1061,28 @@ void setupAssemblyPrimitives(EncoderPrim
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal  = x265_dequant_normal_avx2;
+        p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_avx2;
+        p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_avx2;
+        p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_avx2;
+        p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
         // p.weight_pp = x265_weight_pp_avx2; fails tests
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
 
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
 
+        p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_avx2;
+        p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_avx2;
+        p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_avx2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_8_avx2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_avx2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_avx2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_avx2;
+
 #if X86_64
         ALL_LUMA_TU_S(dct, dct, avx2);
         ALL_LUMA_TU_S(idct, idct, avx2);
@@ -993,6 +1091,114 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2;
         p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
         p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
+
+        p.pu[LUMA_8x8].luma_vpp = x265_interp_8tap_vert_pp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2;
+
+        p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2;
+
+        p.pu[LUMA_16x8].luma_vpp = x265_interp_8tap_vert_pp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vpp = x265_interp_8tap_vert_pp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vpp = x265_interp_8tap_vert_pp_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vpp = x265_interp_8tap_vert_pp_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vpp = x265_interp_8tap_vert_pp_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vpp = x265_interp_8tap_vert_pp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vpp = x265_interp_8tap_vert_pp_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vpp = x265_interp_8tap_vert_pp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vpp = x265_interp_8tap_vert_pp_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vpp = x265_interp_8tap_vert_pp_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vpp = x265_interp_8tap_vert_pp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vpp = x265_interp_8tap_vert_pp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vpp = x265_interp_8tap_vert_pp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vpp = x265_interp_8tap_vert_pp_64x64_avx2;
+
+        p.pu[LUMA_8x8].luma_vps = x265_interp_8tap_vert_ps_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vps = x265_interp_8tap_vert_ps_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vps = x265_interp_8tap_vert_ps_8x32_avx2;
+
+        p.pu[LUMA_12x16].luma_vps = x265_interp_8tap_vert_ps_12x16_avx2;
+
+        p.pu[LUMA_16x8].luma_vps = x265_interp_8tap_vert_ps_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vps = x265_interp_8tap_vert_ps_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vps = x265_interp_8tap_vert_ps_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vps = x265_interp_8tap_vert_ps_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vps = x265_interp_8tap_vert_ps_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vps = x265_interp_8tap_vert_ps_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vps = x265_interp_8tap_vert_ps_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vps = x265_interp_8tap_vert_ps_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vps = x265_interp_8tap_vert_ps_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vps = x265_interp_8tap_vert_ps_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vps = x265_interp_8tap_vert_ps_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vps = x265_interp_8tap_vert_ps_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vps = x265_interp_8tap_vert_ps_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vps = x265_interp_8tap_vert_ps_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vps = x265_interp_8tap_vert_ps_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vps = x265_interp_8tap_vert_ps_64x64_avx2;
+
+        p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vsp = x265_interp_8tap_vert_sp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
+
+        p.pu[LUMA_12x16].luma_vsp = x265_interp_8tap_vert_sp_12x16_avx2;
+
+        p.pu[LUMA_16x8].luma_vsp = x265_interp_8tap_vert_sp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vsp = x265_interp_8tap_vert_sp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vsp = x265_interp_8tap_vert_sp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vsp = x265_interp_8tap_vert_sp_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vsp = x265_interp_8tap_vert_sp_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vsp = x265_interp_8tap_vert_sp_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vsp = x265_interp_8tap_vert_sp_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vsp = x265_interp_8tap_vert_sp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vsp = x265_interp_8tap_vert_sp_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vsp = x265_interp_8tap_vert_sp_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vsp = x265_interp_8tap_vert_sp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vsp = x265_interp_8tap_vert_sp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vsp = x265_interp_8tap_vert_sp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vsp = x265_interp_8tap_vert_sp_64x64_avx2;
+
+        p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vss = x265_interp_8tap_vert_ss_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
+
+        p.pu[LUMA_12x16].luma_vss = x265_interp_8tap_vert_ss_12x16_avx2;
+
+        p.pu[LUMA_16x8].luma_vss = x265_interp_8tap_vert_ss_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vss = x265_interp_8tap_vert_ss_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vss = x265_interp_8tap_vert_ss_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vss = x265_interp_8tap_vert_ss_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vss = x265_interp_8tap_vert_ss_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vss = x265_interp_8tap_vert_ss_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vss = x265_interp_8tap_vert_ss_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vss = x265_interp_8tap_vert_ss_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vss = x265_interp_8tap_vert_ss_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vss = x265_interp_8tap_vert_ss_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vss = x265_interp_8tap_vert_ss_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vss = x265_interp_8tap_vert_ss_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vss = x265_interp_8tap_vert_ss_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vss = x265_interp_8tap_vert_ss_64x64_avx2;
 #else
         p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
 #endif
@@ -1000,12 +1206,38 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x32_avx;
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x48_avx;
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
-
         p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
         p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
+
+        // Blockfill_s primitives
+        p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_avx2;
+        p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_avx2;
+
+        p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vpp = x265_interp_8tap_vert_pp_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vpp = x265_interp_8tap_vert_pp_4x16_avx2;
+        p.pu[LUMA_8x4].luma_vpp = x265_interp_8tap_vert_pp_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vpp = x265_interp_8tap_vert_pp_16x4_avx2;
+
+        p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vps = x265_interp_8tap_vert_ps_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vps = x265_interp_8tap_vert_ps_4x16_avx2;
+        p.pu[LUMA_8x4].luma_vps = x265_interp_8tap_vert_ps_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vps = x265_interp_8tap_vert_ps_16x4_avx2;
+
+        p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vsp = x265_interp_8tap_vert_sp_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vsp = x265_interp_8tap_vert_sp_4x16_avx2;
+        p.pu[LUMA_8x4].luma_vsp = x265_interp_8tap_vert_sp_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vsp = x265_interp_8tap_vert_sp_16x4_avx2;
+
+        p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vss = x265_interp_8tap_vert_ss_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vss = x265_interp_8tap_vert_ss_4x16_avx2;
+        p.pu[LUMA_8x4].luma_vss = x265_interp_8tap_vert_ss_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vss = x265_interp_8tap_vert_ss_16x4_avx2;
     }
 }
-
 #else // if HIGH_BIT_DEPTH
 
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp
@@ -1117,15 +1349,37 @@ void setupAssemblyPrimitives(EncoderPrim
         ASSIGN_SSE_PP(ssse3);
         p.cu[BLOCK_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3;
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = x265_pixel_ssd_4x8_ssse3;
+        p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_ssse3;
+        p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_ssse3;
+        p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_ssse3;
+        p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_ssse3;
+        p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_ssse3;
+        p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_ssse3;
+        p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_ssse3;
+        p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_ssse3;
+        p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_ssse3;
+        p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_ssse3;
+        p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_ssse3;
+        p.pu[LUMA_16x32].filter_p2s = x265_pixelToShort_16x32_ssse3;
+        p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_ssse3;
+        p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_ssse3;
+        p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_ssse3;
+        p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_ssse3;
+        p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_ssse3;
+        p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_ssse3;
+        p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_ssse3;
+        p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3;
+        p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3;
+        p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3;
 
-        p.luma_p2s = x265_luma_p2s_ssse3;
         p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
         p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
-
         p.dst4x4 = x265_dst4_ssse3;
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
-        p.count_nonzero = x265_count_nonzero_ssse3;
-
+        p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3;
+        p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_ssse3;
+        p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_ssse3;
+        p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_ssse3;
         p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
@@ -1204,6 +1458,13 @@ void setupAssemblyPrimitives(EncoderPrim
     if (cpuMask & X265_CPU_AVX)
     {
         p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = x265_pixel_satd_16x24_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = x265_pixel_satd_32x48_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = x265_pixel_satd_24x64_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = x265_pixel_satd_8x64_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = x265_pixel_satd_8x12_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = x265_pixel_satd_12x32_avx;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = x265_pixel_satd_4x32_avx;
         ALL_LUMA_PU(satd, pixel_satd, avx);
         ASSIGN_SA8D(avx);
         ASSIGN_SSE_PP(avx);
@@ -1241,6 +1502,13 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = x265_blockcopy_pp_32x64_avx;
         p.pu[LUMA_32x64].copy_pp = x265_blockcopy_pp_32x64_avx;
 
+        p.pu[LUMA_64x16].copy_pp = x265_blockcopy_pp_64x16_avx;
+        p.pu[LUMA_64x32].copy_pp = x265_blockcopy_pp_64x32_avx;
+        p.pu[LUMA_64x48].copy_pp = x265_blockcopy_pp_64x48_avx;
+        p.pu[LUMA_64x64].copy_pp = x265_blockcopy_pp_64x64_avx;
+
+        p.pu[LUMA_48x64].copy_pp = x265_blockcopy_pp_48x64_avx;
+
         p.frameInitLowres = x265_frame_init_lowres_core_avx;
     }
     if (cpuMask & X265_CPU_XOP)
@@ -1255,6 +1523,28 @@ void setupAssemblyPrimitives(EncoderPrim
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
+        p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+        p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
+
+        p.pu[LUMA_16x4].pixelavg_pp = x265_pixel_avg_16x4_avx2;
+        p.pu[LUMA_16x8].pixelavg_pp = x265_pixel_avg_16x8_avx2;
+        p.pu[LUMA_16x12].pixelavg_pp = x265_pixel_avg_16x12_avx2;
+        p.pu[LUMA_16x16].pixelavg_pp = x265_pixel_avg_16x16_avx2;
+        p.pu[LUMA_16x32].pixelavg_pp = x265_pixel_avg_16x32_avx2;
+        p.pu[LUMA_16x64].pixelavg_pp = x265_pixel_avg_16x64_avx2;
+
+        p.pu[LUMA_32x64].pixelavg_pp = x265_pixel_avg_32x64_avx2;
+        p.pu[LUMA_32x32].pixelavg_pp = x265_pixel_avg_32x32_avx2;
+        p.pu[LUMA_32x24].pixelavg_pp = x265_pixel_avg_32x24_avx2;
+        p.pu[LUMA_32x16].pixelavg_pp = x265_pixel_avg_32x16_avx2;
+        p.pu[LUMA_32x8].pixelavg_pp = x265_pixel_avg_32x8_avx2;
+
+        p.pu[LUMA_64x64].pixelavg_pp = x265_pixel_avg_64x64_avx2;
+        p.pu[LUMA_64x48].pixelavg_pp = x265_pixel_avg_64x48_avx2;
+        p.pu[LUMA_64x32].pixelavg_pp = x265_pixel_avg_64x32_avx2;
+        p.pu[LUMA_64x16].pixelavg_pp = x265_pixel_avg_64x16_avx2;
+
         p.pu[LUMA_16x16].satd = x265_pixel_satd_16x16_avx2;
         p.pu[LUMA_16x8].satd  = x265_pixel_satd_16x8_avx2;
         p.pu[LUMA_8x16].satd  = x265_pixel_satd_8x16_avx2;
@@ -1278,17 +1568,51 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
 
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_avx2;
+
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
+        p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_avx2;
+        p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_avx2;
+        p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_avx2;
+        p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_avx2;
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
 
         p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
         p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        // copy_sp primitives
+        p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = x265_blockcopy_sp_16x32_avx2;
+
+        // 32 X N
+        p.cu[BLOCK_32x32].copy_sp = x265_blockcopy_sp_32x32_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = x265_blockcopy_sp_32x32_avx2;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = x265_blockcopy_sp_32x64_avx2;
+
+        // 64 X N
+        p.cu[BLOCK_64x64].copy_sp = x265_blockcopy_sp_64x64_avx2;
+        // copy_ps primitives
+        // 16 X N
+        p.cu[BLOCK_16x16].copy_ps = x265_blockcopy_ps_16x16_avx2;
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ps = x265_blockcopy_ps_16x16_avx2;
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ps = x265_blockcopy_ps_16x32_avx2;
+
+        // 32 X N
+        p.cu[BLOCK_32x32].copy_ps = x265_blockcopy_ps_32x32_avx2;
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = x265_blockcopy_ps_32x32_avx2;
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = x265_blockcopy_ps_32x64_avx2;
+
+        // 64 x N
+        p.cu[BLOCK_64x64].copy_ps = x265_blockcopy_ps_64x64_avx2;
 
         p.weight_pp = x265_weight_pp_avx2;
-
         p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
 #if X86_64
         p.cu[BLOCK_8x8].dct    = x265_dct8_avx2;
@@ -1305,6 +1629,9 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2;
         p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2;
 
+        p.pu[LUMA_4x8].luma_vpp = x265_interp_8tap_vert_pp_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vpp = x265_interp_8tap_vert_pp_4x16_avx2;
+
         p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2;
 
         p.pu[LUMA_16x4].luma_vpp  = x265_interp_8tap_vert_pp_16x4_avx2;
@@ -1328,6 +1655,109 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x32].luma_vpp = x265_interp_8tap_vert_pp_64x32_avx2;
         p.pu[LUMA_64x48].luma_vpp = x265_interp_8tap_vert_pp_64x48_avx2;
         p.pu[LUMA_64x64].luma_vpp = x265_interp_8tap_vert_pp_64x64_avx2;
+
+        p.pu[LUMA_4x8].luma_vps = x265_interp_8tap_vert_ps_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vps = x265_interp_8tap_vert_ps_4x16_avx2;
+
+        p.pu[LUMA_12x16].luma_vps = x265_interp_8tap_vert_ps_12x16_avx2;
+
+        p.pu[LUMA_16x4].luma_vps = x265_interp_8tap_vert_ps_16x4_avx2;
+        p.pu[LUMA_16x8].luma_vps = x265_interp_8tap_vert_ps_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vps = x265_interp_8tap_vert_ps_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vps = x265_interp_8tap_vert_ps_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vps = x265_interp_8tap_vert_ps_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vps = x265_interp_8tap_vert_ps_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vps = x265_interp_8tap_vert_ps_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vps = x265_interp_8tap_vert_ps_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vps = x265_interp_8tap_vert_ps_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vps = x265_interp_8tap_vert_ps_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vps = x265_interp_8tap_vert_ps_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vps = x265_interp_8tap_vert_ps_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vps = x265_interp_8tap_vert_ps_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vps = x265_interp_8tap_vert_ps_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vps = x265_interp_8tap_vert_ps_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vps = x265_interp_8tap_vert_ps_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vps = x265_interp_8tap_vert_ps_64x64_avx2;
+
+        p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vsp = x265_interp_8tap_vert_sp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
+
+        p.pu[LUMA_12x16].luma_vsp = x265_interp_8tap_vert_sp_12x16_avx2;
+
+        p.pu[LUMA_16x8].luma_vsp = x265_interp_8tap_vert_sp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vsp = x265_interp_8tap_vert_sp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vsp = x265_interp_8tap_vert_sp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vsp = x265_interp_8tap_vert_sp_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vsp = x265_interp_8tap_vert_sp_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vsp = x265_interp_8tap_vert_sp_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vsp = x265_interp_8tap_vert_sp_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vsp = x265_interp_8tap_vert_sp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vsp = x265_interp_8tap_vert_sp_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vsp = x265_interp_8tap_vert_sp_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vsp = x265_interp_8tap_vert_sp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vsp = x265_interp_8tap_vert_sp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vsp = x265_interp_8tap_vert_sp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vsp = x265_interp_8tap_vert_sp_64x64_avx2;
+
+        p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vss = x265_interp_8tap_vert_ss_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
+
+        p.pu[LUMA_12x16].luma_vss = x265_interp_8tap_vert_ss_12x16_avx2;
+
+        p.pu[LUMA_16x8].luma_vss = x265_interp_8tap_vert_ss_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vss = x265_interp_8tap_vert_ss_16x12_avx2;
+        p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vss = x265_interp_8tap_vert_ss_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vss = x265_interp_8tap_vert_ss_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vss = x265_interp_8tap_vert_ss_24x32_avx2;
+
+        p.pu[LUMA_32x8].luma_vss = x265_interp_8tap_vert_ss_32x8_avx2;
+        p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vss = x265_interp_8tap_vert_ss_32x24_avx2;
+        p.pu[LUMA_32x32].luma_vss = x265_interp_8tap_vert_ss_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vss = x265_interp_8tap_vert_ss_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vss = x265_interp_8tap_vert_ss_48x64_avx2;
+
+        p.pu[LUMA_64x16].luma_vss = x265_interp_8tap_vert_ss_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vss = x265_interp_8tap_vert_ss_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vss = x265_interp_8tap_vert_ss_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vss = x265_interp_8tap_vert_ss_64x64_avx2;
+
+        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_avx2;
+        p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
+        p.pu[LUMA_4x16].luma_hps = x265_interp_8tap_horiz_ps_4x16_avx2;
+
+        p.pu[LUMA_8x4].luma_hps = x265_interp_8tap_horiz_ps_8x4_avx2;
+        p.pu[LUMA_8x8].luma_hps = x265_interp_8tap_horiz_ps_8x8_avx2;
+        p.pu[LUMA_8x16].luma_hps = x265_interp_8tap_horiz_ps_8x16_avx2;
+        p.pu[LUMA_8x32].luma_hps = x265_interp_8tap_horiz_ps_8x32_avx2;
+
+        p.pu[LUMA_16x8].luma_hps = x265_interp_8tap_horiz_ps_16x8_avx2;
+        p.pu[LUMA_16x16].luma_hps = x265_interp_8tap_horiz_ps_16x16_avx2;
+        p.pu[LUMA_16x12].luma_hps = x265_interp_8tap_horiz_ps_16x12_avx2;
+        p.pu[LUMA_16x4].luma_hps = x265_interp_8tap_horiz_ps_16x4_avx2;
+        p.pu[LUMA_16x32].luma_hps = x265_interp_8tap_horiz_ps_16x32_avx2;
+        p.pu[LUMA_16x64].luma_hps = x265_interp_8tap_horiz_ps_16x64_avx2;
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2;
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2;
 #endif
         p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2;
 
@@ -1362,23 +1792,65 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
 
         p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_avx2;
+
         p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_avx2;
+
         p.pu[LUMA_8x4].luma_vpp = x265_interp_8tap_vert_pp_8x4_avx2;
         p.pu[LUMA_8x8].luma_vpp = x265_interp_8tap_vert_pp_8x8_avx2;
         p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2;
         p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2;
 
+        p.pu[LUMA_8x4].luma_vps = x265_interp_8tap_vert_ps_8x4_avx2;
+        p.pu[LUMA_8x8].luma_vps = x265_interp_8tap_vert_ps_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vps = x265_interp_8tap_vert_ps_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vps = x265_interp_8tap_vert_ps_8x32_avx2;
+
+        p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vsp = x265_interp_8tap_vert_sp_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vsp = x265_interp_8tap_vert_sp_4x16_avx2;
+        p.pu[LUMA_8x4].luma_vsp = x265_interp_8tap_vert_sp_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vsp = x265_interp_8tap_vert_sp_16x4_avx2;
+
+        p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vss = x265_interp_8tap_vert_ss_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vss = x265_interp_8tap_vert_ss_4x16_avx2;
+        p.pu[LUMA_8x4].luma_vss = x265_interp_8tap_vert_ss_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vss = x265_interp_8tap_vert_ss_16x4_avx2;
+
         // color space i420
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2;
-
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = x265_interp_4tap_vert_pp_4x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
         // color space i422
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
+        p.cu[BLOCK_8x8].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_8_avx2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_avx2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_avx2;
 
-#if X86_64
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2;
-#endif
+        // intra_pred functions
+        p.cu[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2;
+        p.cu[BLOCK_8x8].intra_pred[33] = x265_intra_pred_ang8_33_avx2;
+        p.cu[BLOCK_8x8].intra_pred[4] = x265_intra_pred_ang8_4_avx2;
+        p.cu[BLOCK_8x8].intra_pred[32] = x265_intra_pred_ang8_32_avx2;
+        p.cu[BLOCK_8x8].intra_pred[5] = x265_intra_pred_ang8_5_avx2;
+        p.cu[BLOCK_8x8].intra_pred[31] = x265_intra_pred_ang8_31_avx2;
+        p.cu[BLOCK_8x8].intra_pred[30] = x265_intra_pred_ang8_30_avx2;
+        p.cu[BLOCK_8x8].intra_pred[6] = x265_intra_pred_ang8_6_avx2;
+        p.cu[BLOCK_8x8].intra_pred[7] = x265_intra_pred_ang8_7_avx2;
+        p.cu[BLOCK_8x8].intra_pred[29] = x265_intra_pred_ang8_29_avx2;
+        p.cu[BLOCK_8x8].intra_pred[8] = x265_intra_pred_ang8_8_avx2;
+        p.cu[BLOCK_8x8].intra_pred[28] = x265_intra_pred_ang8_28_avx2;
     }
 }
 #endif // if HIGH_BIT_DEPTH
--- a/source/common/x86/blockcopy8.asm	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/blockcopy8.asm	Thu Feb 26 11:25:55 2015 -0600
@@ -963,6 +963,46 @@ BLOCKCOPY_PP_W48_H2 48, 64
 ;-----------------------------------------------------------------------------
 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W48_H4_avx 2
+INIT_YMM avx
+cglobal blockcopy_pp_%1x%2, 4, 5, 4
+    mov    r4d,    %2/4
+
+.loop:
+    movu    m0,    [r2]
+    movu    xm1,   [r2 + 32]
+    movu    m2,    [r2 + r3]
+    movu    xm3,   [r2 + r3 + 32]
+    lea     r2,    [r2 + 2 * r3]
+
+    movu    [r0],              m0
+    movu    [r0 + 32],         xm1
+    movu    [r0 + r1],         m2
+    movu    [r0 + r1 + 32],    xm3
+    lea     r0,                [r0 + 2 * r1]
+
+    movu    m0,    [r2]
+    movu    xm1,   [r2 + 32]
+    movu    m2,    [r2 + r3]
+    movu    xm3,   [r2 + r3 + 32]
+
+    movu    [r0],              m0
+    movu    [r0 + 32],         xm1
+    movu    [r0 + r1],         m2
+    movu    [r0 + r1 + 32],    xm3
+
+    dec    r4d
+    lea    r0,    [r0 + 2 * r1]
+    lea    r2,    [r2 + 2 * r3]
+    jnz    .loop
+    RET
+%endmacro
+
+BLOCKCOPY_PP_W48_H4_avx 48, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W64_H4 2
 INIT_XMM sse2
 cglobal blockcopy_pp_%1x%2, 4, 5, 6
@@ -1022,6 +1062,49 @@ BLOCKCOPY_PP_W64_H4 64, 48
 BLOCKCOPY_PP_W64_H4 64, 64
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W64_H4_avx 2
+INIT_YMM avx
+cglobal blockcopy_pp_%1x%2, 4, 7, 6
+    lea    r4,  [3 * r1]
+    lea    r5,  [3 * r3]
+    mov    r6d, %2/4
+
+.loop:
+    movu    m0, [r2]
+    movu    m1, [r2 + 32]
+    movu    m2, [r2 + r3]
+    movu    m3, [r2 + r3 + 32]
+    movu    m4, [r2 + 2 * r3]
+    movu    m5, [r2 + 2 * r3 + 32]
+
+    movu    [r0], m0
+    movu    [r0 + 32], m1
+    movu    [r0 + r1], m2
+    movu    [r0 + r1 + 32], m3
+    movu    [r0 + 2 * r1], m4
+    movu    [r0 + 2 * r1 + 32], m5
+
+    movu    m0, [r2 + r5]
+    movu    m1, [r2 + r5 + 32]
+
+    movu    [r0 + r4], m0
+    movu    [r0 + r4 + 32], m1
+
+    lea     r2, [r2 + 4 * r3]
+    lea     r0, [r0 + 4 * r1]
+    dec     r6d
+    jnz     .loop
+    RET
+%endmacro
+
+BLOCKCOPY_PP_W64_H4_avx 64, 16
+BLOCKCOPY_PP_W64_H4_avx 64, 32
+BLOCKCOPY_PP_W64_H4_avx 64, 48
+BLOCKCOPY_PP_W64_H4_avx 64, 64
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
@@ -1675,9 +1758,69 @@ BLOCKCOPY_SP_W16_H4 16, 12
 BLOCKCOPY_SP_W16_H4 16, 16
 BLOCKCOPY_SP_W16_H4 16, 32
 BLOCKCOPY_SP_W16_H4 16, 64
-
 BLOCKCOPY_SP_W16_H4 16, 24
-
+;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W16_H8_avx2 2
+INIT_YMM avx2
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
+    mov    r4d, %2/8
+    add    r3,  r3
+    lea    r5,  [3 * r3]
+    lea    r6,  [3 * r1]
+
+.loop:
+    movu    m0, [r2]
+    movu    m1, [r2 + r3]
+    movu    m2, [r2 + 2 * r3]
+    movu    m3, [r2 + r5]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    vextracti128 xm1, m0, 1
+    vextracti128 xm3, m2, 1
+
+    movu    [r0],          xm0
+    movu    [r0 + r1],     xm1
+    movu    [r0 + 2 * r1], xm2
+    movu    [r0 + r6],     xm3
+
+    lea     r2, [r2 + 4 * r3]
+    movu    m0, [r2]
+    movu    m1, [r2 + r3]
+    movu    m2, [r2 + 2 * r3]
+    movu    m3, [r2 + r5]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    vextracti128 xm1, m0, 1
+    vextracti128 xm3, m2, 1
+
+    lea     r0,            [r0 + 4 * r1]
+    movu    [r0],          xm0
+    movu    [r0 + r1],     xm1
+    movu    [r0 + 2 * r1], xm2
+    movu    [r0 + r6],     xm3
+
+    lea    r0, [r0 + 4 * r1]
+    lea    r2, [r2 + 4 * r3]
+
+    dec    r4d
+    jnz    .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SP_W16_H8_avx2 16, 16
+BLOCKCOPY_SP_W16_H8_avx2 16, 32
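
The sp (short-to-pixel) copies above rely on packuswb, i.e. each int16_t coefficient is saturated into the 8-bit pixel range rather than truncated. A scalar reference of that behaviour (sketch only, illustrative name):

#include <cstdint>

// blockcopy_sp reference: clamp each 16-bit value to [0, 255] on store.
static void blockcopy_sp_ref(uint8_t* dst, intptr_t dstStride,
                             const int16_t* src, intptr_t srcStride,
                             int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int v = src[x];
            dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst += dstStride;
        src += srcStride;
    }
}
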
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
@@ -1770,6 +1913,57 @@ BLOCKCOPY_SP_W32_H2 32, 48
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W32_H4_avx2 2
+INIT_YMM avx2
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
+    mov    r4d, %2/4
+    add    r3,  r3
+    lea    r5,  [3 * r3]
+    lea    r6,  [3 * r1]
+
+.loop:
+    movu       m0, [r2]
+    movu       m1, [r2 + 32]
+    movu       m2, [r2 + r3]
+    movu       m3, [r2 + r3 + 32]
+
+    packuswb   m0, m1
+    packuswb   m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    movu       [r0],      m0
+    movu       [r0 + r1], m2
+
+    movu       m0, [r2 + 2 * r3]
+    movu       m1, [r2 + 2 * r3 + 32]
+    movu       m2, [r2 + r5]
+    movu       m3, [r2 + r5 + 32]
+
+    packuswb   m0, m1
+    packuswb   m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    movu       [r0 + 2 * r1], m0
+    movu       [r0 + r6],     m2
+
+    lea        r0, [r0 + 4 * r1]
+    lea        r2, [r2 + 4 * r3]
+
+    dec        r4d
+    jnz        .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SP_W32_H4_avx2 32, 32
+BLOCKCOPY_SP_W32_H4_avx2 32, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W48_H2 2
 INIT_XMM sse2
 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
@@ -1851,6 +2045,84 @@ BLOCKCOPY_SP_W64_H1 64, 48
 BLOCKCOPY_SP_W64_H1 64, 64
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SP_W64_H4_avx2 2
+INIT_YMM avx2
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
+    mov    r4d, %2/4
+    add    r3,  r3
+    lea    r5,  [3 * r3]
+    lea    r6,  [3 * r1]
+
+.loop:
+    movu    m0, [r2]
+    movu    m1, [r2 + 32]
+    movu    m2, [r2 + 64]
+    movu    m3, [r2 + 96]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    movu    [r0],      m0
+    movu    [r0 + 32], m2
+
+    movu    m0, [r2 + r3]
+    movu    m1, [r2 + r3 + 32]
+    movu    m2, [r2 + r3 + 64]
+    movu    m3, [r2 + r3 + 96]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    movu    [r0 + r1],      m0
+    movu    [r0 + r1 + 32], m2
+
+    movu    m0, [r2 + 2 * r3]
+    movu    m1, [r2 + 2 * r3 + 32]
+    movu    m2, [r2 + 2 * r3 + 64]
+    movu    m3, [r2 + 2 * r3 + 96]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    movu    [r0 + 2 * r1],      m0
+    movu    [r0 + 2 * r1 + 32], m2
+
+    movu    m0, [r2 + r5]
+    movu    m1, [r2 + r5 + 32]
+    movu    m2, [r2 + r5 + 64]
+    movu    m3, [r2 + r5 + 96]
+
+    packuswb    m0, m1
+    packuswb    m2, m3
+
+    vpermq    m0, m0, 11011000b
+    vpermq    m2, m2, 11011000b
+
+    movu    [r0 + r6],      m0
+    movu    [r0 + r6 + 32], m2
+
+    lea    r0, [r0 + 4 * r1]
+    lea    r2, [r2 + 4 * r3]
+
+    dec    r4d
+    jnz    .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SP_W64_H4_avx2 64, 64
+
+;-----------------------------------------------------------------------------
 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
@@ -2631,9 +2903,44 @@ BLOCKCOPY_PS_W16_H4 16, 12
 BLOCKCOPY_PS_W16_H4 16, 16
 BLOCKCOPY_PS_W16_H4 16, 32
 BLOCKCOPY_PS_W16_H4 16, 64
-
 BLOCKCOPY_PS_W16_H4 16, 24
-
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PS_W16_H4_avx2 2
+INIT_YMM avx2
+cglobal blockcopy_ps_%1x%2, 4, 7, 3
+
+    add     r1, r1
+    mov     r4d, %2/4
+    lea     r5, [3 * r3]
+    lea     r6, [3 * r1]
+    pxor    m0, m0
+
+.loop:
+    movu        xm1, [r2]
+    pmovzxbw    m2, xm1
+    movu        [r0], m2
+    movu        xm1, [r2 + r3]
+    pmovzxbw    m2, xm1
+    movu        [r0 + r1], m2
+    movu        xm1, [r2 + 2 * r3]
+    pmovzxbw    m2, xm1
+    movu        [r0 + 2 * r1], m2
+    movu        xm1, [r2 + r5]
+    pmovzxbw    m2, xm1
+    movu        [r0 + r6], m2
+
+    lea         r0, [r0 + 4 * r1]
+    lea         r2, [r2 + 4 * r3]
+
+    dec         r4d
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_PS_W16_H4_avx2 16, 16
+BLOCKCOPY_PS_W16_H4_avx2 16, 32
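
The ps (pixel-to-short) direction is a pure zero-extension, which is what the pmovzxbw above implements row by row. A scalar reference (sketch only, illustrative name):

#include <cstdint>

// blockcopy_ps reference: widen each 8-bit pixel to int16_t.
static void blockcopy_ps_ref(int16_t* dst, intptr_t dstStride,
                             const uint8_t* src, intptr_t srcStride,
                             int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
            dst[x] = (int16_t)src[x];
        dst += dstStride;
        src += srcStride;
    }
}
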
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -2731,6 +3038,57 @@ BLOCKCOPY_PS_W32_H2 32, 32
 BLOCKCOPY_PS_W32_H2 32, 64
 
 BLOCKCOPY_PS_W32_H2 32, 48
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PS_W32_H4_avx2 2
+INIT_YMM avx2
+cglobal blockcopy_ps_%1x%2, 4, 7, 3
+    add     r1, r1
+    mov     r4d, %2/4
+    lea     r5, [3 * r3]
+    lea     r6, [3 * r1]
+    pxor    m0, m0
+
+.loop:
+    movu          m1, [r2]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0], m3
+    movu          [r0 + 32], m2
+    movu          m1, [r2 + r3]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + r1], m3
+    movu          [r0 + r1 + 32], m2
+    movu          m1, [r2 + 2 * r3]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + 2 * r1], m3
+    movu          [r0 + 2 * r1 + 32], m2
+    movu          m1, [r2 + r5]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + r6], m3
+    movu          [r0 + r6 + 32], m2
+
+    lea           r0, [r0 + 4 * r1]
+    lea           r2, [r2 + 4 * r3]
+    dec           r4d
+    jnz           .loop
+    RET
+%endmacro
+
+BLOCKCOPY_PS_W32_H4_avx2 32, 32
+BLOCKCOPY_PS_W32_H4_avx2 32, 64
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
@@ -2864,6 +3222,80 @@ BLOCKCOPY_PS_W64_H2 64, 16
 BLOCKCOPY_PS_W64_H2 64, 32
 BLOCKCOPY_PS_W64_H2 64, 48
 BLOCKCOPY_PS_W64_H2 64, 64
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal blockcopy_ps_64x64, 4, 7, 4
+    add     r1, r1
+    mov     r4d, 64/4
+    lea     r5, [3 * r3]
+    lea     r6, [3 * r1]
+    pxor    m0, m0
+
+.loop:
+    movu          m1, [r2]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0], m3
+    movu          [r0 + 32], m2
+    movu          m1, [r2 + 32]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + 64], m3
+    movu          [r0 + 96], m2
+    movu          m1, [r2 + r3]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + r1], m3
+    movu          [r0 + r1 + 32], m2
+    movu          m1, [r2 + r3 + 32]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + r1 + 64], m3
+    movu          [r0 + r1 + 96], m2
+    movu          m1, [r2 + 2 * r3]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + 2 * r1], m3
+    movu          [r0 + 2 * r1 + 32], m2
+    movu          m1, [r2 + 2 * r3 + 32]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + 2 * r1 + 64], m3
+    movu          [r0 + 2 * r1 + 96], m2
+    movu          m1, [r2 + r5]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + r6], m3
+    movu          [r0 + r6 + 32], m2
+    movu          m1, [r2 + r5 + 32]
+    punpcklbw     m2, m1, m0
+    punpckhbw     m1, m1, m0
+    vperm2i128    m3, m2, m1, 00100000b
+    vperm2i128    m2, m2, m1, 00110001b
+    movu          [r0 + r6 + 64], m3
+    movu          [r0 + r6 + 96], m2
+
+    lea           r0, [r0 + 4 * r1]
+    lea           r2, [r2 + 4 * r3]
+    dec           r4d
+    jnz           .loop
+    RET
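
The 32-wide and 64-wide ps variants widen with punpcklbw/punpckhbw against a zero register instead of pmovzxbw; because those unpacks operate within each 128-bit lane, the following vperm2i128 pair is needed to restore linear ordering. An intrinsics sketch of that three-step pattern (illustrative, assuming AVX2 is available):

#include <immintrin.h>
#include <cstdint>

// Widen 32 bytes to 32 int16_t. unpacklo/hi interleave per 128-bit lane,
// so permute2x128 recombines the lane halves into sequential order.
static inline void widen32_u8_to_s16(const uint8_t* src, int16_t* dst)
{
    __m256i zero = _mm256_setzero_si256();
    __m256i v    = _mm256_loadu_si256((const __m256i*)src);
    __m256i lo   = _mm256_unpacklo_epi8(v, zero);            // bytes 0-7, 16-23
    __m256i hi   = _mm256_unpackhi_epi8(v, zero);            // bytes 8-15, 24-31
    _mm256_storeu_si256((__m256i*)dst,
                        _mm256_permute2x128_si256(lo, hi, 0x20)); // bytes 0-15
    _mm256_storeu_si256((__m256i*)(dst + 16),
                        _mm256_permute2x128_si256(lo, hi, 0x31)); // bytes 16-31
}
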
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3495,6 +3927,45 @@ cglobal blockcopy_ss_%1x%2, 4, 5, 6
 BLOCKCOPY_SS_W24_H4 24, 32
 
 BLOCKCOPY_SS_W24_H4 24, 64
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W24_H4_avx 2
+INIT_YMM avx
+cglobal blockcopy_ss_%1x%2, 4, 7, 2
+
+    mov    r4d, %2/4
+    add    r1, r1
+    add    r3, r3
+    lea    r5, [3 * r3]
+    lea    r6, [3 * r1]
+
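+    ; 24 int16 per row = 48 bytes: one full ymm plus one xmm load/store per row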
+.loop:
+    movu    m0, [r2]
+    movu    xm1, [r2 + 32]
+    movu    [r0], m0
+    movu    [r0 + 32], xm1
+    movu    m0, [r2 + r3]
+    movu    xm1, [r2 + r3 + 32]
+    movu    [r0 + r1], m0
+    movu    [r0 + r1 + 32], xm1
+    movu    m0, [r2 + 2 * r3]
+    movu    xm1, [r2 + 2 * r3 + 32]
+    movu    [r0 + 2 * r1], m0
+    movu    [r0 + 2 * r1 + 32], xm1
+    movu    m0, [r2 + r5]
+    movu    xm1, [r2 + r5 + 32]
+    movu    [r0 + r6], m0
+    movu    [r0 + r6 + 32], xm1
+    dec     r4d
+    lea     r2, [r2 + 4 * r3]
+    lea     r0, [r0 + 4 * r1]
+    jnz     .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SS_W24_H4_avx 24, 32
+BLOCKCOPY_SS_W24_H4_avx 24, 64
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3563,6 +4034,57 @@ BLOCKCOPY_SS_W32_H4 32, 32
 BLOCKCOPY_SS_W32_H4 32, 64
 
 BLOCKCOPY_SS_W32_H4 32, 48
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W32_H4_avx 2
+INIT_YMM avx
+cglobal blockcopy_ss_%1x%2, 4, 7, 4
+
+    mov    r4d, %2/4
+    add    r1, r1
+    add    r3, r3
+    lea    r5, [3 * r1]
+    lea    r6, [3 * r3]
+
+.loop:
+    movu    m0, [r2]
+    movu    m1, [r2 + 32]
+
+    movu    [r0], m0
+    movu    [r0 + 32], m1
+
+    movu    m0, [r2 + r3]
+    movu    m1, [r2 + r3 + 32]
+
+    movu    [r0 + r1], m0
+    movu    [r0 + r1 + 32], m1
+
+    movu    m0, [r2 + 2 * r3]
+    movu    m1, [r2 + 2 * r3 + 32]
+
+    movu    [r0 + 2 * r1], m0
+    movu    [r0 + 2 * r1 + 32], m1
+
+    movu    m0, [r2 + r6]
+    movu    m1, [r2 + r6 + 32]
+
+    movu    [r0 + r5], m0
+    movu    [r0 + r5 + 32], m1
+
+    dec     r4d
+    lea     r2, [r2 + 4 * r3]
+    lea     r0, [r0 + 4 * r1]
+    jnz     .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SS_W32_H4_avx 32,  8
+BLOCKCOPY_SS_W32_H4_avx 32, 16
+BLOCKCOPY_SS_W32_H4_avx 32, 24
+BLOCKCOPY_SS_W32_H4_avx 32, 32
+BLOCKCOPY_SS_W32_H4_avx 32, 48
+BLOCKCOPY_SS_W32_H4_avx 32, 64
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3641,6 +4163,56 @@ RET
 %endmacro
 
 BLOCKCOPY_SS_W48_H2 48, 64
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_48x64(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx
+cglobal blockcopy_ss_48x64, 4, 7, 6
+
+    mov    r4d, 64/4
+    add    r1, r1
+    add    r3, r3
+    lea    r5, [3 * r3]
+    lea    r6, [3 * r1]
+
+.loop:
+    movu    m0, [r2]
+    movu    m1, [r2 + 32]
+    movu    m2, [r2 + 64]
+
+    movu    [r0], m0
+    movu    [r0 + 32], m1
+    movu    [r0 + 64], m2
+
+    movu    m0, [r2 + r3]
+    movu    m1, [r2 + r3 + 32]
+    movu    m2, [r2 + r3 + 64]
+
+    movu    [r0 + r1], m0
+    movu    [r0 + r1 + 32], m1
+    movu    [r0 + r1 + 64], m2
+
+    movu    m0, [r2 + 2 * r3]
+    movu    m1, [r2 + 2 * r3 + 32]
+    movu    m2, [r2 + 2 * r3 + 64]
+
+    movu    [r0 + 2 * r1], m0
+    movu    [r0 + 2 * r1 + 32], m1
+    movu    [r0 + 2 * r1 + 64], m2
+
+    movu    m0, [r2 + r5]
+    movu    m1, [r2 + r5 + 32]
+    movu    m2, [r2 + r5 + 64]
+
+    movu    [r0 + r6], m0
+    movu    [r0 + r6 + 32], m1
+    movu    [r0 + r6 + 64], m2
+
+    dec     r4d
+    lea     r2, [r2 + 4 * r3]
+    lea     r0, [r0 + 4 * r1]
+    jnz     .loop
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3819,7 +4391,7 @@ INIT_XMM sse2
 cglobal cpy2Dto1D_shr_4, 3, 4, 4
     add             r2d, r2d
     movd            m0, r3m
-    pcmpeqw	    m1, m1
+    pcmpeqw         m1, m1
     psllw           m1, m0
     psraw           m1, 1
 
@@ -3852,7 +4424,7 @@ INIT_XMM sse2
 cglobal cpy2Dto1D_shr_8, 3, 5, 4
     add             r2d, r2d
     movd            m0, r3m
-    pcmpeqw	    m1, m1
+    pcmpeqw         m1, m1
     psllw           m1, m0
     psraw           m1, 1
     mov             r3d, 8/4
@@ -3894,6 +4466,41 @@ cglobal cpy2Dto1D_shr_8, 3, 5, 4
     jnz            .loop
     RET
 
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_8, 3, 4, 4
+    add        r2d, r2d
+    movd       xm0, r3m
+    pcmpeqw    m1, m1
+    psllw      m1, xm0
+    psraw      m1, 1
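+    ; m1 = -(1 << (shift - 1)); the psubw below therefore adds the rounding
+    ; term before the arithmetic right shift by 'shift'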
+    lea        r3, [r2 * 3]
+
+    ; Row 0-3
+    movu           xm2, [r1]
+    vinserti128    m2, m2, [r1 + r2], 1
+    movu           xm3, [r1 + 2 * r2]
+    vinserti128    m3, m3, [r1 + r3], 1
+    psubw          m2, m1
+    psraw          m2, xm0
+    psubw          m3, m1
+    psraw          m3, xm0
+    movu           [r0], m2
+    movu           [r0 + 32], m3
+
+    ; Row 4-7
+    lea            r1, [r1 + 4 * r2]
+    movu           xm2, [r1]
+    vinserti128    m2, m2, [r1 + r2], 1
+    movu           xm3, [r1 + 2 * r2]
+    vinserti128    m3, m3, [r1 + r3], 1
+    psubw          m2, m1
+    psraw          m2, xm0
+    psubw          m3, m1
+    psraw          m3, xm0
+    movu           [r0 + 64], m2
+    movu           [r0 + 96], m3
+    RET
+
 
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -3902,7 +4509,7 @@ INIT_XMM sse2
 cglobal cpy2Dto1D_shr_16, 3, 4, 4
     add             r2d, r2d
     movd            m0, r3m
-    pcmpeqw	    m1, m1
+    pcmpeqw         m1, m1
     psllw           m1, m0
     psraw           m1, 1
     mov             r3d, 16/2
@@ -3942,6 +4549,64 @@ cglobal cpy2Dto1D_shr_16, 3, 4, 4
     jnz            .loop
     RET
 
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_16, 4, 5, 4
+    add        r2d, r2d
+    movd       xm0, r3d
+    pcmpeqw    m1, m1
+    psllw      m1, xm0
+    psraw      m1, 1
+    lea        r3, [r2 * 3]
+    mov        r4d, 16/8
+
+.loop:
+    ; Row 0-1
+    movu       m2, [r1]
+    movu       m3, [r1 + r2]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 0 * mmsize], m2
+    movu       [r0 + 1 * mmsize], m3
+
+    ; Row 2-3
+    movu       m2, [r1 + 2 * r2]
+    movu       m3, [r1 + r3]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 2 * mmsize], m2
+    movu       [r0 + 3 * mmsize], m3
+
+    ; Row 4-5
+    lea        r1, [r1 + 4 * r2]
+    movu       m2, [r1]
+    movu       m3, [r1 + r2]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 4 * mmsize], m2
+    movu       [r0 + 5 * mmsize], m3
+
+    ; Row 6-7
+    movu       m2, [r1 + 2 * r2]
+    movu       m3, [r1 + r3]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 6 * mmsize], m2
+    movu       [r0 + 7 * mmsize], m3
+
+    add        r0, 8 * mmsize
+    lea        r1, [r1 + 4 * r2]
+    dec        r4d
+    jnz        .loop
+    RET
+
 
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -3950,7 +4615,7 @@ INIT_XMM sse2
 cglobal cpy2Dto1D_shr_32, 3, 4, 6
     add             r2d, r2d
     movd            m0, r3m
-    pcmpeqw	    m1, m1
+    pcmpeqw         m1, m1
     psllw           m1, m0
     psraw           m1, 1
     mov             r3d, 32/1
@@ -3988,6 +4653,62 @@ cglobal cpy2Dto1D_shr_32, 3, 4, 6
     jnz            .loop
     RET
 
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_32, 4, 5, 4
+    add        r2d, r2d
+    movd       xm0, r3d
+    pcmpeqw    m1, m1
+    psllw      m1, xm0
+    psraw      m1, 1
+    lea        r3, [r2 * 3]
+    mov        r4d, 32/4
+
+.loop:
+    ; Row 0
+    movu       m2, [r1]
+    movu       m3, [r1 + 32]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 0 * mmsize], m2
+    movu       [r0 + 1 * mmsize], m3
+
+    ; Row 1
+    movu       m2, [r1 + r2]
+    movu       m3, [r1 + r2 + 32]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 2 * mmsize], m2
+    movu       [r0 + 3 * mmsize], m3
+
+    ; Row 2
+    movu       m2, [r1 + 2 * r2]
+    movu       m3, [r1 + 2 * r2 + 32]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 4 * mmsize], m2
+    movu       [r0 + 5 * mmsize], m3
+
+    ; Row 3
+    movu       m2, [r1 + r3]
+    movu       m3, [r1 + r3 + 32]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 6 * mmsize], m2
+    movu       [r0 + 7 * mmsize], m3
+
+    add        r0, 8 * mmsize
+    lea        r1, [r1 + 4 * r2]
+    dec        r4d
+    jnz        .loop
+    RET
 
 ;--------------------------------------------------------------------------------------
 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
@@ -4678,6 +5399,42 @@ cglobal cpy2Dto1D_shl_8, 4, 5, 4
     jnz            .loop
     RET
 
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl_8(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cpy2Dto1D_shl_8, 4, 5, 2
+    add     r2d, r2d
+    movd    xm0, r3d
+    lea     r4, [3 * r2]
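+    ; vinserti128 packs two 8-sample (16-byte) rows per ymm, so the whole 8x8
+    ; block is shifted and stored without a loop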
+
+    ; Row 0, 1
+    movu           xm1, [r1]
+    vinserti128    m1, m1, [r1 + r2], 1
+    psllw          m1, xm0
+    movu           [r0], m1
+
+    ; Row 2, 3
+    movu           xm1, [r1 + 2 * r2]
+    vinserti128    m1, m1, [r1 + r4], 1
+    psllw          m1, xm0
+    movu           [r0 + 32], m1
+
+    lea            r1, [r1 + 4 * r2]
+
+    ; Row 4, 5
+    movu           xm1, [r1]
+    vinserti128    m1, m1, [r1 + r2], 1
+    psllw          m1, xm0
+    movu           [r0 + 64], m1
+
+    ; Row 6, 7
+    movu           xm1, [r1 + 2 * r2]
+    vinserti128    m1, m1, [r1 + r4], 1
+    psllw          m1, xm0
+    movu           [r0 + 96], m1
+    RET
+
 
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -4718,6 +5475,38 @@ cglobal cpy2Dto1D_shl_16, 4, 4, 4
     jnz            .loop
     RET
 
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl_16(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cpy2Dto1D_shl_16, 3, 5, 3
+    add    r2d, r2d
+    movd   xm0, r3m
+    mov    r3d, 16/4
+    lea     r4, [r2 * 3]
+
+.loop:
+    ; Row 0-1
+    movu     m1, [r1]
+    movu     m2, [r1 + r2]
+    psllw    m1, xm0
+    psllw    m2, xm0
+    movu     [r0 + 0 * mmsize], m1
+    movu     [r0 + 1 * mmsize], m2
+
+    ; Row 2-3
+    movu     m1, [r1 + 2 * r2]
+    movu     m2, [r1 + r4]
+    psllw    m1, xm0
+    psllw    m2, xm0
+    movu     [r0 + 2 * mmsize], m1
+    movu     [r0 + 3 * mmsize], m2
+
+    add      r0, 4 * mmsize
+    lea      r1, [r1 + r2 * 4]
+    dec      r3d
+    jnz      .loop
+    RET
 
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -4756,6 +5545,52 @@ cglobal cpy2Dto1D_shl_32, 4, 4, 6
     jnz            .loop
     RET
 
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl_32(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cpy2Dto1D_shl_32, 3, 5, 5
+    add     r2d, r2d
+    movd    xm0, r3m
+    mov     r3d, 32/4
+    lea     r4, [3 * r2]
+
+.loop:
+    ; Row 0-1
+    movu     m1, [r1]
+    movu     m2, [r1 + 32]
+    movu     m3, [r1 + r2]
+    movu     m4, [r1 + r2 + 32]
+
+    psllw    m1, xm0
+    psllw    m2, xm0
+    psllw    m3, xm0
+    psllw    m4, xm0
+    movu     [r0], m1
+    movu     [r0 + mmsize], m2
+    movu     [r0 + 2 * mmsize], m3
+    movu     [r0 + 3 * mmsize], m4
+
+    ; Row 2-3
+    movu     m1, [r1 + 2 * r2]
+    movu     m2, [r1 + 2 * r2 + 32]
+    movu     m3, [r1 + r4]
+    movu     m4, [r1 + r4 + 32]
+
+    psllw    m1, xm0
+    psllw    m2, xm0
+    psllw    m3, xm0
+    psllw    m4, xm0
+    movu     [r0 + 4 * mmsize], m1
+    movu     [r0 + 5 * mmsize], m2
+    movu     [r0 + 6 * mmsize], m3
+    movu     [r0 + 7 * mmsize], m4
+
+    add      r0, 8 * mmsize
+    lea      r1, [r1 + r2 * 4]
+    dec      r3d
+    jnz      .loop
+    RET
 
 ;--------------------------------------------------------------------------------------
 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
--- a/source/common/x86/blockcopy8.h	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/blockcopy8.h	Thu Feb 26 11:25:55 2015 -0600
@@ -48,6 +48,12 @@ void x265_cpy1Dto2D_shr_4_sse2(int16_t* 
 void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
 void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
 void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
@@ -198,6 +204,15 @@ void x265_blockcopy_ss_64x16_avx(int16_t
 void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_32x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_32x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_32x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_32x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_32x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_32x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_48x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_24x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_24x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 
 void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
@@ -205,9 +220,36 @@ void x265_blockcopy_pp_32x24_avx(pixel* 
 void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_64x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_64x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_64x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_64x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_pp_48x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 
 void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val);
 void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val);
+// copy_sp primitives
+// 16 x N
+void x265_blockcopy_sp_16x16_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_16x32_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+// 32 x N
+void x265_blockcopy_sp_32x32_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_32x64_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+// 64 x N
+void x265_blockcopy_sp_64x64_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+// copy_ps primitives
+// 16 x N
+void x265_blockcopy_ps_16x16_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_16x32_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+// 32 x N
+void x265_blockcopy_ps_32x32_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_32x64_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+// 64 x N
+void x265_blockcopy_ps_64x64_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 
 #undef BLOCKCOPY_COMMON
 #undef BLOCKCOPY_SS_PP
--- a/source/common/x86/const-a.asm	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/const-a.asm	Thu Feb 26 11:25:55 2015 -0600
@@ -6,7 +6,7 @@
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
-;*
+;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
 ;* the Free Software Foundation; either version 2 of the License, or
@@ -40,9 +40,11 @@ const pw_256,      times 16 dw 256
 const pw_257,      times 16 dw 257
 const pw_512,      times 16 dw 512
 const pw_1023,     times 8  dw 1023
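+; keep the constants below 32-byte aligned so AVX2 code can use aligned loads (mova)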
+ALIGN 32
 const pw_1024,     times 16 dw 1024
 const pw_4096,     times 16 dw 4096
 const pw_00ff,     times 16 dw 0x00ff
+ALIGN 32
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
@@ -51,16 +53,16 @@ const pb_unpackwq1, db 0,1,0,1,0,1,0,1,2
 const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
 const pw_swap,      times 2 db 6,7,4,5,2,3,0,1
 
-const pb_2,        times 16 db 2
-const pb_4,        times 16 db 4
-const pb_16,       times 16 db 16
-const pb_64,       times 16 db 64
+const pb_2,        times 32 db 2
+const pb_4,        times 32 db 4
+const pb_16,       times 32 db 16
+const pb_64,       times 32 db 64
 const pb_01,       times  8 db 0,1
 const pb_0,        times 16 db 0
 const pb_a1,       times 16 db 0xa1
 const pb_3,        times 16 db 3
-const pb_8,        times 16 db 8
-const pb_32,       times 16 db 32
+const pb_8,        times 32 db 8
+const pb_32,       times 32 db 32
 const pb_128,      times 16 db 128
 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
 
@@ -73,7 +75,7 @@ const pw_64,       times 8 dw 64
 const pw_256,      times 8 dw 256
 const pw_32_0,     times 4 dw 32,
                    times 4 dw 0
-const pw_2000,     times 8 dw 0x2000
+const pw_2000,     times 16 dw 0x2000
 const pw_8000,     times 8 dw 0x8000
 const pw_3fff,     times 8 dw 0x3fff
 const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
--- a/source/common/x86/intrapred.h	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/intrapred.h	Thu Feb 26 11:25:55 2015 -0600
@@ -158,6 +158,18 @@ DECL_ANG(32, 32, sse4);
 DECL_ANG(32, 33, sse4);
 
 #undef DECL_ANG
+void x265_intra_pred_ang8_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_4_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_5_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_6_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_7_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang8_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
--- a/source/common/x86/intrapred8.asm	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/intrapred8.asm	Thu Feb 26 11:25:55 2015 -0600
@@ -54,6 +54,47 @@ c_mode16_17:          db  4,  2,  1,  0,
 c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
 tab_S2:         db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
 
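+; Tables for the AVX2 angular 8x8 predictors below:
+;   c_ang8_srcA_B_C_D - pshufb patterns gathering interleaved reference pairs
+;                       ref[A..B] in the low lane and ref[C..D] in the high lane
+;   c_ang8_A_B        - matching pmaddubsw weight pairs (32 - frac, frac) for the
+;                       two rows held in the low/high lanes; pmulhrsw with pw_1024
+;                       then applies the (+16) >> 5 rounding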
+c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+c_ang8_26_20:         db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+c_ang8_src3_11_4_12:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+c_ang8_14_8:          db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+c_ang8_src5_13_5_13:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+c_ang8_2_28:          db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+c_ang8_src6_14_7_15:  db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+c_ang8_22_16:         db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+
+c_ang8_21_10:         db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+c_ang8_src2_10_3_11:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+c_ang8_31_20:         db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+c_ang8_src4_12_4_12:  times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+c_ang8_9_30:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+c_ang8_src5_13_6_14:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
+c_ang8_19_8:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+
+c_ang8_17_2:          db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
+c_ang8_19_4:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+c_ang8_21_6:          db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+c_ang8_23_8:          db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+c_ang8_src4_12_5_13:  db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+
+c_ang8_13_26:         db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+c_ang8_7_20:          db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+c_ang8_1_14:          db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+c_ang8_27_8:          db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+c_ang8_src1_9_1_9:    db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+c_ang8_src2_10_2_10:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+c_ang8_src3_11_3_11:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+
+c_ang8_31_8:          db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+c_ang8_13_22:         db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+c_ang8_27_4:          db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+c_ang8_9_18:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+
+c_ang8_5_10:          db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+c_ang8_15_20:         db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+c_ang8_25_30:         db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+c_ang8_3_8:           db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+
 ;; (blkSize - 1 - x)
 pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
 pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
@@ -65,6 +106,8 @@ pw_planar32_1:        dw 31, 31, 31, 31,
 pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
 pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
 
+trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+
 const ang_table
 %assign x 0
 %rep 32
@@ -32102,3 +32145,492 @@ cglobal all_angs_pred_32x32, 3,7,8, 0-4
     palignr    m4,              m2,       m1,    14
     movu       [r0 + 2111 * 16],   m4
     RET
+
+;-----------------------------------------------------------------------------------------
+; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
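+; Modes 3-8 interpolate from the left reference samples (srcPix + 17) and so
+; transpose the packed 8x8 result (punpckl/hbw, punpckl/hwd, vpermd with
+; trans8_shuf) before storing; the mirrored modes 28-33 read the top reference
+; (srcPix + 1) and store their rows directly.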
+INIT_YMM avx2
+cglobal intra_pred_ang8_3, 3,4,5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 17]
+
+    pshufb            m1, m0, [c_ang8_src1_9_2_10]
+    pshufb            m2, m0, [c_ang8_src3_11_4_12]
+    pshufb            m4, m0, [c_ang8_src5_13_5_13]
+    pshufb            m0,     [c_ang8_src6_14_7_15]
+
+    pmaddubsw         m1, [c_ang8_26_20]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_14_8]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_2_28]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_22_16]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    vperm2i128        m2, m1, m4, 00100000b
+    vperm2i128        m1, m1, m4, 00110001b
+    punpcklbw         m4, m2, m1
+    punpckhbw         m2, m1
+    punpcklwd         m1, m4, m2
+    punpckhwd         m4, m2
+    mova              m0, [trans8_shuf]
+    vpermd            m1, m0, m1
+    vpermd            m4, m0, m4
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    movhps            [r0 + r1], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    movhps            [r0 + r1], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_33, 3,4,5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 1]
+
+    pshufb            m1, m0, [c_ang8_src1_9_2_10]
+    pshufb            m2, m0, [c_ang8_src3_11_4_12]
+    pshufb            m4, m0, [c_ang8_src5_13_5_13]
+    pshufb            m0,     [c_ang8_src6_14_7_15]
+
+    pmaddubsw         m1, [c_ang8_26_20]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_14_8]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_2_28]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_22_16]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm1
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_4, 3,4,5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 17]
+
+    pshufb            m1, m0, [c_ang8_src1_9_2_10]
+    pshufb            m2, m0, [c_ang8_src2_10_3_11]
+    pshufb            m4, m0, [c_ang8_src4_12_4_12]
+    pshufb            m0,     [c_ang8_src5_13_6_14]
+
+    pmaddubsw         m1, [c_ang8_21_10]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_31_20]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_9_30]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_19_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    vperm2i128        m2, m1, m4, 00100000b
+    vperm2i128        m1, m1, m4, 00110001b
+    punpcklbw         m4, m2, m1
+    punpckhbw         m2, m1
+    punpcklwd         m1, m4, m2
+    punpckhwd         m4, m2
+    mova              m0, [trans8_shuf]
+    vpermd            m1, m0, m1
+    vpermd            m4, m0, m4
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    movhps            [r0 + r1], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    movhps            [r0 + r1], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_32, 3,4,5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 1]
+
+    pshufb            m1, m0, [c_ang8_src1_9_2_10]
+    pshufb            m2, m0, [c_ang8_src2_10_3_11]
+    pshufb            m4, m0, [c_ang8_src4_12_4_12]
+    pshufb            m0,     [c_ang8_src5_13_6_14]
+
+    pmaddubsw         m1, [c_ang8_21_10]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_31_20]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_9_30]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_19_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm1
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm2
+    RET
+
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_5, 3, 4, 5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 17]
+
+    pshufb            m1, m0, [c_ang8_src1_9_2_10]
+    pshufb            m2, m0, [c_ang8_src2_10_3_11]
+    pshufb            m4, m0, [c_ang8_src3_11_4_12]
+    pshufb            m0,     [c_ang8_src4_12_5_13]
+
+    pmaddubsw         m1, [c_ang8_17_2]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_19_4]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_21_6]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_23_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    vperm2i128        m2, m1, m4, 00100000b
+    vperm2i128        m1, m1, m4, 00110001b
+    punpcklbw         m4, m2, m1
+    punpckhbw         m2, m1
+    punpcklwd         m1, m4, m2
+    punpckhwd         m4, m2
+    mova              m0, [trans8_shuf]
+    vpermd            m1, m0, m1
+    vpermd            m4, m0, m4
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    movhps            [r0 + r1], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    movhps            [r0 + r1], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_31, 3, 4, 5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 1]
+
+    pshufb            m1, m0, [c_ang8_src1_9_2_10]
+    pshufb            m2, m0, [c_ang8_src2_10_3_11]
+    pshufb            m4, m0, [c_ang8_src3_11_4_12]
+    pshufb            m0,     [c_ang8_src4_12_5_13]
+
+    pmaddubsw         m1, [c_ang8_17_2]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_19_4]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_21_6]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_23_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm1
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm2
+    RET
+
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_6, 3, 4, 5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 17]
+
+    pshufb            m1, m0, [c_ang8_src1_9_1_9]
+    pshufb            m2, m0, [c_ang8_src2_10_2_10]
+    pshufb            m4, m0, [c_ang8_src3_11_3_11]
+    pshufb            m0,     [c_ang8_src3_11_4_12]
+
+    pmaddubsw         m1, [c_ang8_13_26]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_7_20]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_1_14]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_27_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    vperm2i128        m2, m1, m4, 00100000b
+    vperm2i128        m1, m1, m4, 00110001b
+    punpcklbw         m4, m2, m1
+    punpckhbw         m2, m1
+    punpcklwd         m1, m4, m2
+    punpckhwd         m4, m2
+    mova              m0, [trans8_shuf]
+    vpermd            m1, m0, m1
+    vpermd            m4, m0, m4
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    movhps            [r0 + r1], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    movhps            [r0 + r1], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_30, 3, 4, 5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 1]
+
+    pshufb            m1, m0, [c_ang8_src1_9_1_9]
+    pshufb            m2, m0, [c_ang8_src2_10_2_10]
+    pshufb            m4, m0, [c_ang8_src3_11_3_11]
+    pshufb            m0,     [c_ang8_src3_11_4_12]
+
+    pmaddubsw         m1, [c_ang8_13_26]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_7_20]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_1_14]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_27_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm1
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm2
+    RET
+
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_7, 3, 4, 5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 17]
+
+    pshufb            m1, m0, [c_ang8_src1_9_1_9]
+    pshufb            m2, m0, [c_ang8_src1_9_2_10]
+    pshufb            m4, m0, [c_ang8_src2_10_2_10]
+    pshufb            m0,     [c_ang8_src2_10_3_11]
+
+    pmaddubsw         m1, [c_ang8_9_18]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_27_4]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_13_22]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_31_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    vperm2i128        m2, m1, m4, 00100000b
+    vperm2i128        m1, m1, m4, 00110001b
+    punpcklbw         m4, m2, m1
+    punpckhbw         m2, m1
+    punpcklwd         m1, m4, m2
+    punpckhwd         m4, m2
+    mova              m0, [trans8_shuf]
+    vpermd            m1, m0, m1
+    vpermd            m4, m0, m4
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    movhps            [r0 + r1], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    movhps            [r0 + r1], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_29, 3, 4, 5
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 1]
+
+    pshufb            m1, m0, [c_ang8_src1_9_1_9]
+    pshufb            m2, m0, [c_ang8_src1_9_2_10]
+    pshufb            m4, m0, [c_ang8_src2_10_2_10]
+    pshufb            m0,     [c_ang8_src2_10_3_11]
+
+    pmaddubsw         m1, [c_ang8_9_18]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_27_4]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_13_22]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_31_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm1
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm2
+    RET
+
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_8, 3, 4, 6
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 17]
+    movu              m5, [c_ang8_src1_9_1_9]
+
+    pshufb            m1, m0, m5
+    pshufb            m2, m0, m5
+    pshufb            m4, m0, m5
+    pshufb            m0,     [c_ang8_src2_10_2_10]
+
+    pmaddubsw         m1, [c_ang8_5_10]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_15_20]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_25_30]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_3_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    vperm2i128        m2, m1, m4, 00100000b
+    vperm2i128        m1, m1, m4, 00110001b
+    punpcklbw         m4, m2, m1
+    punpckhbw         m2, m1
+    punpcklwd         m1, m4, m2
+    punpckhwd         m4, m2
+    mova              m0, [trans8_shuf]
+    vpermd            m1, m0, m1
+    vpermd            m4, m0, m4
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    movhps            [r0 + r1], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    movhps            [r0 + r1], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + 2 * r1], xm2
+    movhps            [r0 + r3], xm2
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_28, 3, 4, 6
+    mova              m3, [pw_1024]
+    vbroadcasti128    m0, [r2 + 1]
+    movu              m5, [c_ang8_src1_9_1_9]
+
+    pshufb            m1, m0, m5
+    pshufb            m2, m0, m5
+    pshufb            m4, m0, m5
+    pshufb            m0,     [c_ang8_src2_10_2_10]
+
+    pmaddubsw         m1, [c_ang8_5_10]
+    pmulhrsw          m1, m3
+    pmaddubsw         m2, [c_ang8_15_20]
+    pmulhrsw          m2, m3
+    pmaddubsw         m4, [c_ang8_25_30]
+    pmulhrsw          m4, m3
+    pmaddubsw         m0, [c_ang8_3_8]
+    pmulhrsw          m0, m3
+    packuswb          m1, m2
+    packuswb          m4, m0
+
+    lea               r3, [3 * r1]
+    movq              [r0], xm1
+    vextracti128      xm2, m1, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm1
+    movhps            [r0 + r3], xm2
+    lea               r0, [r0 + 4 * r1]
+    movq              [r0], xm4
+    vextracti128      xm2, m4, 1
+    movq              [r0 + r1], xm2
+    movhps            [r0 + 2 * r1], xm4
+    movhps            [r0 + r3], xm2
+    RET
--- a/source/common/x86/ipfilter16.asm	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/ipfilter16.asm	Thu Feb 26 11:25:55 2015 -0600
@@ -31,6 +31,7 @@ tab_c_32:         times 4 dd 32
 tab_c_n32768:     times 4 dd -32768
 tab_c_524800:     times 4 dd 524800
 tab_c_n8192:      times 8 dw -8192
+pd_524800:        times 8 dd 524800
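+; pd_524800 = (8192 << 6) + (1 << 9), i.e. the IF_INTERNAL_OFFS bias plus the
+; rounding term for the sp filter paths below, which shift their sums right by 10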
 
 tab_Tm16:         db 0, 1, 2, 3, 4,  5,  6, 7, 2, 3, 4,  5, 6, 7, 8, 9
 
@@ -91,9 +92,28 @@ tab_LumaCoeffV:   times 4 dw 0, 0
                   times 4 dw -5, 17
                   times 4 dw 58, -10
                   times 4 dw 4, -1
+ALIGN 32
+tab_LumaCoeffVer: times 8 dw 0, 0
+                  times 8 dw 0, 64
+                  times 8 dw 0, 0
+                  times 8 dw 0, 0
+
+                  times 8 dw -1, 4
+                  times 8 dw -10, 58
+                  times 8 dw 17, -5
+                  times 8 dw 1, 0
+
+                  times 8 dw -1, 4
+                  times 8 dw -11, 40
+                  times 8 dw 40, -11
+                  times 8 dw 4, -1
+
+                  times 8 dw 0, 1
+                  times 8 dw -5, 17
+                  times 8 dw 58, -10
+                  times 8 dw 4, -1
 
 SECTION .text
-
 cextern pd_32
 cextern pw_pixel_max
 cextern pd_n32768
@@ -2562,6 +2582,2681 @@ cglobal interp_8tap_vert_pp_%1x%2, 5, 7,
     FILTER_VER_LUMA_PP 64, 16
     FILTER_VER_LUMA_PP 16, 64
 
+%macro FILTER_VER_LUMA_AVX2_4x4 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
+    mov             r4d, r4m
+    add             r1d, r1d
+    add             r3d, r3d
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,pp
+    vbroadcasti128  m6, [pd_32]
+%elifidn %1, sp
+    mova            m6, [pd_524800]
+%else
+    vbroadcasti128  m6, [pd_n32768]
+%endif
+
+    movq            xm0, [r0]
+    movq            xm1, [r0 + r1]
+    punpcklwd       xm0, xm1
+    movq            xm2, [r0 + r1 * 2]
+    punpcklwd       xm1, xm2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
+    pmaddwd         m0, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm2, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + r1 * 2]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
+    pmaddwd         m4, [r5 + 1 * mmsize]
+    paddd           m0, m5
+    paddd           m2, m4
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm1, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
+    pmaddwd         m1, [r5 + 2 * mmsize]
+    paddd           m0, m5
+    paddd           m2, m1
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + 2 * r1]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [A 9 9 8]
+    pmaddwd         m4, [r5 + 3 * mmsize]
+    paddd           m2, m4
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m2, 6
+%else
+    paddd           m0, m6
+    paddd           m2, m6
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m2, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m2, 10
+%else
+    psrad           m0, 2
+    psrad           m2, 2
+%endif
+%endif
+
+    packssdw        m0, m2
+    pxor            m1, m1
+%ifidn %1,pp
+    CLIPW           m0, m1, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m0, m1, [pw_pixel_max]
+%endif
+
+    vextracti128    xm2, m0, 1
+    lea             r4, [r3 * 3]
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r4], xm2
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_4x4 pp
+FILTER_VER_LUMA_AVX2_4x4 ps
+FILTER_VER_LUMA_AVX2_4x4 sp
+FILTER_VER_LUMA_AVX2_4x4 ss
+
+%macro FILTER_VER_LUMA_AVX2_8x8 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_8x8, 4, 6, 12
+    mov             r4d, r4m
+    add             r1d, r1d
+    add             r3d, r3d
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,pp
+    vbroadcasti128  m11, [pd_32]
+%elifidn %1, sp
+    mova            m11, [pd_524800]
+%else
+    vbroadcasti128  m11, [pd_n32768]
+%endif
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    pmaddwd         m3, [r5]
+    paddd           m1, m5
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    pmaddwd         m5, [r5]
+    paddd           m3, m7
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    pmaddwd         m6, [r5]
+    paddd           m4, m8
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    pmaddwd         m7, [r5]
+    paddd           m5, m9
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    pmaddwd         m8, [r5 + 1 * mmsize]
+    paddd           m4, m10
+    paddd           m6, m8
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm8, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm8, 1
+    pmaddwd         m8, m9, [r5 + 3 * mmsize]
+    paddd           m3, m8
+    pmaddwd         m8, m9, [r5 + 2 * mmsize]
+    pmaddwd         m9, [r5 + 1 * mmsize]
+    paddd           m5, m8
+    paddd           m7, m9
+    movu            xm8, [r0 + r4]                  ; m8 = row 11
+    punpckhwd       xm9, xm10, xm8
+    punpcklwd       xm10, xm8
+    vinserti128     m10, m10, xm9, 1
+    pmaddwd         m9, m10, [r5 + 3 * mmsize]
+    pmaddwd         m10, [r5 + 2 * mmsize]
+    paddd           m4, m9
+    paddd           m6, m10
+
+    lea             r4, [r3 * 3]
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%else
+    paddd           m0, m11
+    paddd           m1, m11
+    paddd           m2, m11
+    paddd           m3, m11
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+%else
+    psrad           m0, 2
+    psrad           m1, 2
+    psrad           m2, 2
+    psrad           m3, 2
+%endif
+%endif
+
+    packssdw        m0, m1
+    packssdw        m2, m3
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    pxor            m10, m10
+    mova            m9, [pw_pixel_max]
+%ifidn %1,pp
+    CLIPW           m0, m10, m9
+    CLIPW           m2, m10, m9
+%elifidn %1, sp
+    CLIPW           m0, m10, m9
+    CLIPW           m2, m10, m9
+%endif
+
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm3
+
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 12
+    punpckhwd       xm3, xm8, xm2
+    punpcklwd       xm8, xm2
+    vinserti128     m8, m8, xm3, 1
+    pmaddwd         m3, m8, [r5 + 3 * mmsize]
+    pmaddwd         m8, [r5 + 2 * mmsize]
+    paddd           m5, m3
+    paddd           m7, m8
+    movu            xm3, [r0 + r1]                  ; m3 = row 13
+    punpckhwd       xm0, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm0, 1
+    pmaddwd         m2, [r5 + 3 * mmsize]
+    paddd           m6, m2
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm1, xm3, xm0
+    punpcklwd       xm3, xm0
+    vinserti128     m3, m3, xm1, 1
+    pmaddwd         m3, [r5 + 3 * mmsize]
+    paddd           m7, m3
+
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%else
+    paddd           m4, m11
+    paddd           m5, m11
+    paddd           m6, m11
+    paddd           m7, m11
+%ifidn %1,pp
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m5, 10
+    psrad           m6, 10
+    psrad           m7, 10
+%else
+    psrad           m4, 2
+    psrad           m5, 2
+    psrad           m6, 2
+    psrad           m7, 2
+%endif
+%endif
+
+    packssdw        m4, m5
+    packssdw        m6, m7
+    vpermq          m4, m4, 11011000b
+    vpermq          m6, m6, 11011000b
+%ifidn %1,pp
+    CLIPW           m4, m10, m9
+    CLIPW           m6, m10, m9
+%elifidn %1, sp
+    CLIPW           m4, m10, m9
+    CLIPW           m6, m10, m9
+%endif
+    vextracti128    xm5, m4, 1
+    vextracti128    xm7, m6, 1
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r4], xm7
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_8x8 pp
+FILTER_VER_LUMA_AVX2_8x8 ps
+FILTER_VER_LUMA_AVX2_8x8 sp
+FILTER_VER_LUMA_AVX2_8x8 ss
+
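+; Filters a 16-row, 8-wide column: each loaded row is interleaved with the next
+; one (punpcklwd in the low lane, punpckhwd in the high lane via vinserti128),
+; so a single pmaddwd against one of the four paired-coefficient rows
+; accumulates two taps for all eight columns at once.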
+%macro PROCESS_LUMA_AVX2_W8_16R 1
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r7, [r0 + r1 * 4]
+    movu            xm4, [r7]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r7 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    paddd           m3, m7
+    pmaddwd         m5, [r5]
+    movu            xm7, [r7 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    paddd           m4, m8
+    pmaddwd         m6, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm8, [r7]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    paddd           m5, m9
+    pmaddwd         m7, [r5]
+    movu            xm9, [r7 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    paddd           m4, m10
+    pmaddwd         m10, m8, [r5 + 1 * mmsize]
+    paddd           m6, m10
+    pmaddwd         m8, [r5]
+    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm11, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddwd         m11, m9, [r5 + 3 * mmsize]
+    paddd           m3, m11
+    pmaddwd         m11, m9, [r5 + 2 * mmsize]
+    paddd           m5, m11
+    pmaddwd         m11, m9, [r5 + 1 * mmsize]
+    paddd           m7, m11
+    pmaddwd         m9, [r5]
+    movu            xm11, [r7 + r4]                 ; m11 = row 11
+    punpckhwd       xm12, xm10, xm11
+    punpcklwd       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddwd         m12, m10, [r5 + 3 * mmsize]
+    paddd           m4, m12
+    pmaddwd         m12, m10, [r5 + 2 * mmsize]
+    paddd           m6, m12
+    pmaddwd         m12, m10, [r5 + 1 * mmsize]
+    paddd           m8, m12
+    pmaddwd         m10, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm12, [r7]                      ; m12 = row 12
+    punpckhwd       xm13, xm11, xm12
+    punpcklwd       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddwd         m13, m11, [r5 + 3 * mmsize]
+    paddd           m5, m13
+    pmaddwd         m13, m11, [r5 + 2 * mmsize]
+    paddd           m7, m13
+    pmaddwd         m13, m11, [r5 + 1 * mmsize]
+    paddd           m9, m13
+    pmaddwd         m11, [r5]
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%else
+    paddd           m0, m14
+    paddd           m1, m14
+    paddd           m2, m14
+    paddd           m3, m14
+    paddd           m4, m14
+    paddd           m5, m14
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+    psrad           m4, 10
+    psrad           m5, 10
+%else
+    psrad           m0, 2
+    psrad           m1, 2
+    psrad           m2, 2
+    psrad           m3, 2
+    psrad           m4, 2
+    psrad           m5, 2
+%endif
+%endif
+
+    packssdw        m0, m1
+    packssdw        m2, m3
+    packssdw        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    pxor            m5, m5
+    mova            m3, [pw_pixel_max]
+%ifidn %1,pp
+    CLIPW           m0, m5, m3
+    CLIPW           m2, m5, m3
+    CLIPW           m4, m5, m3
+%elifidn %1, sp
+    CLIPW           m0, m5, m3
+    CLIPW           m2, m5, m3
+    CLIPW           m4, m5, m3
+%endif
+
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    vextracti128    xm1, m2, 1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm1
+    lea             r8, [r2 + r3 * 4]
+    vextracti128    xm1, m4, 1
+    movu            [r8], xm4
+    movu            [r8 + r3], xm1
+
+    movu            xm13, [r7 + r1]                 ; m13 = row 13
+    punpckhwd       xm0, xm12, xm13
+    punpcklwd       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddwd         m0, m12, [r5 + 3 * mmsize]
+    paddd           m6, m0
+    pmaddwd         m0, m12, [r5 + 2 * mmsize]
+    paddd           m8, m0
+    pmaddwd         m0, m12, [r5 + 1 * mmsize]
+    paddd           m10, m0
+    pmaddwd         m12, [r5]
+    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm1, xm13, xm0
+    punpcklwd       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddwd         m1, m13, [r5 + 3 * mmsize]
+    paddd           m7, m1
+    pmaddwd         m1, m13, [r5 + 2 * mmsize]
+    paddd           m9, m1
+    pmaddwd         m1, m13, [r5 + 1 * mmsize]
+    paddd           m11, m1
+    pmaddwd         m13, [r5]
+
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m7, 6
+%else
+    paddd           m6, m14
+    paddd           m7, m14
+%ifidn %1,pp
+    psrad           m6, 6
+    psrad           m7, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m7, 10
+%else
+    psrad           m6, 2
+    psrad           m7, 2
+%endif
+%endif
+
+    packssdw        m6, m7
+    vpermq          m6, m6, 11011000b
+%ifidn %1,pp
+    CLIPW           m6, m5, m3
+%elifidn %1, sp
+    CLIPW           m6, m5, m3
+%endif
+    vextracti128    xm7, m6, 1
+    movu            [r8 + r3 * 2], xm6
+    movu            [r8 + r6], xm7
+
+    movu            xm1, [r7 + r4]                  ; m1 = row 15
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m2, m0, [r5 + 3 * mmsize]
+    paddd           m8, m2
+    pmaddwd         m2, m0, [r5 + 2 * mmsize]
+    paddd           m10, m2
+    pmaddwd         m2, m0, [r5 + 1 * mmsize]
+    paddd           m12, m2
+    pmaddwd         m0, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm2, [r7]                       ; m2 = row 16
+    punpckhwd       xm6, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm6, 1
+    pmaddwd         m6, m1, [r5 + 3 * mmsize]
+    paddd           m9, m6
+    pmaddwd         m6, m1, [r5 + 2 * mmsize]
+    paddd           m11, m6
+    pmaddwd         m6, m1, [r5 + 1 * mmsize]
+    paddd           m13, m6
+    pmaddwd         m1, [r5]
+    movu            xm6, [r7 + r1]                  ; m6 = row 17
+    punpckhwd       xm4, xm2, xm6
+    punpcklwd       xm2, xm6
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 3 * mmsize]
+    paddd           m10, m4
+    pmaddwd         m4, m2, [r5 + 2 * mmsize]
+    paddd           m12, m4
+    pmaddwd         m2, [r5 + 1 * mmsize]
+    paddd           m0, m2
+    movu            xm4, [r7 + r1 * 2]              ; m4 = row 18
+    punpckhwd       xm2, xm6, xm4
+    punpcklwd       xm6, xm4
+    vinserti128     m6, m6, xm2, 1
+    pmaddwd         m2, m6, [r5 + 3 * mmsize]
+    paddd           m11, m2
+    pmaddwd         m2, m6, [r5 + 2 * mmsize]
+    paddd           m13, m2
+    pmaddwd         m6, [r5 + 1 * mmsize]
+    paddd           m1, m6
+    movu            xm2, [r7 + r4]                  ; m2 = row 19
+    punpckhwd       xm6, xm4, xm2
+    punpcklwd       xm4, xm2
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 3 * mmsize]
+    paddd           m12, m6
+    pmaddwd         m4, [r5 + 2 * mmsize]
+    paddd           m0, m4
+    lea             r7, [r7 + r1 * 4]
+    movu            xm6, [r7]                       ; m6 = row 20
+    punpckhwd       xm7, xm2, xm6
+    punpcklwd       xm2, xm6
+    vinserti128     m2, m2, xm7, 1
+    pmaddwd         m7, m2, [r5 + 3 * mmsize]
+    paddd           m13, m7
+    pmaddwd         m2, [r5 + 2 * mmsize]
+    paddd           m1, m2
+    movu            xm7, [r7 + r1]                  ; m7 = row 21
+    punpckhwd       xm2, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddwd         m6, [r5 + 3 * mmsize]
+    paddd           m0, m6
+    movu            xm2, [r7 + r1 * 2]              ; m2 = row 22
+    punpckhwd       xm6, xm7, xm2
+    punpcklwd       xm7, xm2
+    vinserti128     m7, m7, xm6, 1
+    pmaddwd         m7, [r5 + 3 * mmsize]
+    paddd           m1, m7
+
+%ifidn %1,ss
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+    psrad           m12, 6
+    psrad           m13, 6
+    psrad           m0, 6
+    psrad           m1, 6
+%else
+    paddd           m8, m14
+    paddd           m9, m14
+    paddd           m10, m14
+    paddd           m11, m14
+    paddd           m12, m14
+    paddd           m13, m14
+    paddd           m0, m14
+    paddd           m1, m14
+%ifidn %1,pp
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+    psrad           m12, 6
+    psrad           m13, 6
+    psrad           m0, 6
+    psrad           m1, 6
+%elifidn %1, sp
+    psrad           m8, 10
+    psrad           m9, 10
+    psrad           m10, 10
+    psrad           m11, 10
+    psrad           m12, 10
+    psrad           m13, 10
+    psrad           m0, 10
+    psrad           m1, 10
+%else
+    psrad           m8, 2
+    psrad           m9, 2
+    psrad           m10, 2
+    psrad           m11, 2
+    psrad           m12, 2
+    psrad           m13, 2
+    psrad           m0, 2
+    psrad           m1, 2
+%endif
+%endif
+
+    packssdw        m8, m9
+    packssdw        m10, m11
+    packssdw        m12, m13
+    packssdw        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+%ifidn %1,pp
+    CLIPW           m8, m5, m3
+    CLIPW           m10, m5, m3
+    CLIPW           m12, m5, m3
+    CLIPW           m0, m5, m3
+%elifidn %1, sp
+    CLIPW           m8, m5, m3
+    CLIPW           m10, m5, m3
+    CLIPW           m12, m5, m3
+    CLIPW           m0, m5, m3
+%endif
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], xm8
+    movu            [r8 + r3], xm9
+    movu            [r8 + r3 * 2], xm10
+    movu            [r8 + r6], xm11
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], xm12
+    movu            [r8 + r3], xm13
+    movu            [r8 + r3 * 2], xm0
+    movu            [r8 + r6], xm1
+%endmacro
+
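+; Emit interp_8tap_vert_%1_%2x16 (%2 = 16/32/64): loop PROCESS_LUMA_AVX2_W8_16R
+; over %2 / 8 eight-pixel columns. x86-64 only, since 15 SIMD registers and
+; r7-r9 are needed.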
+%macro FILTER_VER_LUMA_AVX2_Nx16 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
+%else
+    vbroadcasti128  m14, [pd_n32768]
+%endif
+    lea             r6, [r3 * 3]
+    mov             r9d, %2 / 8
+.loopW:
+    PROCESS_LUMA_AVX2_W8_16R %1
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_Nx16 pp, 16
+FILTER_VER_LUMA_AVX2_Nx16 pp, 32
+FILTER_VER_LUMA_AVX2_Nx16 pp, 64
+FILTER_VER_LUMA_AVX2_Nx16 ps, 16
+FILTER_VER_LUMA_AVX2_Nx16 ps, 32
+FILTER_VER_LUMA_AVX2_Nx16 ps, 64
+FILTER_VER_LUMA_AVX2_Nx16 sp, 16
+FILTER_VER_LUMA_AVX2_Nx16 sp, 32
+FILTER_VER_LUMA_AVX2_Nx16 sp, 64
+FILTER_VER_LUMA_AVX2_Nx16 ss, 16
+FILTER_VER_LUMA_AVX2_Nx16 ss, 32
+FILTER_VER_LUMA_AVX2_Nx16 ss, 64
+
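+; Generic %1 x %2 version built from 16-row bands: the inner loop (r10d = width / 8)
+; walks the eight-pixel columns with PROCESS_LUMA_AVX2_W8_16R, then the outer
+; loop (r9d = height / 16) rewinds the horizontal advance and steps the source
+; and destination pointers down 16 rows. x86-64 only.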
+%macro FILTER_VER_LUMA_AVX2_NxN 3
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %3,pp
+    vbroadcasti128  m14, [pd_32]
+%elifidn %3, sp
+    mova            m14, [pd_524800]
+%else
+    vbroadcasti128  m14, [pd_n32768]
+%endif
+
+    lea             r6, [r3 * 3]
+    lea             r11, [r1 * 4]
+    mov             r9d, %2 / 16
+.loopH:
+    mov             r10d, %1 / 8
+.loopW:
+    PROCESS_LUMA_AVX2_W8_16R %3
+    add             r2, 16
+    add             r0, 16
+    dec             r10d
+    jnz             .loopW
+    sub             r7, r11
+    lea             r0, [r7 - 2 * %1 + 16]
+    lea             r2, [r8 + r3 * 4 - 2 * %1 + 16]
+    dec             r9d
+    jnz             .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_NxN 16, 32, pp
+FILTER_VER_LUMA_AVX2_NxN 16, 64, pp
+FILTER_VER_LUMA_AVX2_NxN 24, 32, pp
+FILTER_VER_LUMA_AVX2_NxN 32, 32, pp
+FILTER_VER_LUMA_AVX2_NxN 32, 64, pp
+FILTER_VER_LUMA_AVX2_NxN 48, 64, pp
+FILTER_VER_LUMA_AVX2_NxN 64, 32, pp
+FILTER_VER_LUMA_AVX2_NxN 64, 48, pp
+FILTER_VER_LUMA_AVX2_NxN 64, 64, pp
+FILTER_VER_LUMA_AVX2_NxN 16, 32, ps
+FILTER_VER_LUMA_AVX2_NxN 16, 64, ps
+FILTER_VER_LUMA_AVX2_NxN 24, 32, ps
+FILTER_VER_LUMA_AVX2_NxN 32, 32, ps
+FILTER_VER_LUMA_AVX2_NxN 32, 64, ps
+FILTER_VER_LUMA_AVX2_NxN 48, 64, ps
+FILTER_VER_LUMA_AVX2_NxN 64, 32, ps
+FILTER_VER_LUMA_AVX2_NxN 64, 48, ps
+FILTER_VER_LUMA_AVX2_NxN 64, 64, ps
+FILTER_VER_LUMA_AVX2_NxN 16, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 16, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 24, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 32, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 32, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 48, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 48, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 16, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 16, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 24, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 32, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 32, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 64, ss
+
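+; 8 x %2 (%2 = 16/32) version: the 16-row filter body is kept inline and
+; repeated r8d = %2 / 16 times, advancing r0/r2 by 16 source/destination rows
+; per pass.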
+%macro FILTER_VER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
+%else
+    vbroadcasti128  m14, [pd_n32768]
+%endif
+    lea             r6, [r3 * 3]
+    lea             r7, [r1 * 4]
+    mov             r8d, %2 / 16
+.loopH:
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    paddd           m3, m7
+    pmaddwd         m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    paddd           m4, m8
+    pmaddwd         m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    paddd           m5, m9
+    pmaddwd         m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    paddd           m4, m10
+    pmaddwd         m10, m8, [r5 + 1 * mmsize]
+    paddd           m6, m10
+    pmaddwd         m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm11, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddwd         m11, m9, [r5 + 3 * mmsize]
+    paddd           m3, m11
+    pmaddwd         m11, m9, [r5 + 2 * mmsize]
+    paddd           m5, m11
+    pmaddwd         m11, m9, [r5 + 1 * mmsize]
+    paddd           m7, m11
+    pmaddwd         m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhwd       xm12, xm10, xm11
+    punpcklwd       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddwd         m12, m10, [r5 + 3 * mmsize]
+    paddd           m4, m12
+    pmaddwd         m12, m10, [r5 + 2 * mmsize]
+    paddd           m6, m12
+    pmaddwd         m12, m10, [r5 + 1 * mmsize]
+    paddd           m8, m12
+    pmaddwd         m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhwd       xm13, xm11, xm12
+    punpcklwd       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddwd         m13, m11, [r5 + 3 * mmsize]
+    paddd           m5, m13
+    pmaddwd         m13, m11, [r5 + 2 * mmsize]
+    paddd           m7, m13
+    pmaddwd         m13, m11, [r5 + 1 * mmsize]
+    paddd           m9, m13
+    pmaddwd         m11, [r5]
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%else
+    paddd           m0, m14
+    paddd           m1, m14
+    paddd           m2, m14
+    paddd           m3, m14
+    paddd           m4, m14
+    paddd           m5, m14
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+    psrad           m4, 10
+    psrad           m5, 10
+%else
+    psrad           m0, 2
+    psrad           m1, 2
+    psrad           m2, 2
+    psrad           m3, 2
+    psrad           m4, 2
+    psrad           m5, 2
+%endif
+%endif
+
+    packssdw        m0, m1
+    packssdw        m2, m3
+    packssdw        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    pxor            m5, m5
+    mova            m3, [pw_pixel_max]
+%ifidn %1,pp
+    CLIPW           m0, m5, m3
+    CLIPW           m2, m5, m3
+    CLIPW           m4, m5, m3
+%elifidn %1, sp
+    CLIPW           m0, m5, m3
+    CLIPW           m2, m5, m3
+    CLIPW           m4, m5, m3
+%endif
+
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    vextracti128    xm1, m2, 1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm1
+    lea             r2, [r2 + r3 * 4]
+    vextracti128    xm1, m4, 1
+    movu            [r2], xm4
+    movu            [r2 + r3], xm1
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhwd       xm0, xm12, xm13
+    punpcklwd       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddwd         m0, m12, [r5 + 3 * mmsize]
+    paddd           m6, m0
+    pmaddwd         m0, m12, [r5 + 2 * mmsize]
+    paddd           m8, m0
+    pmaddwd         m0, m12, [r5 + 1 * mmsize]
+    paddd           m10, m0
+    pmaddwd         m12, [r5]
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm1, xm13, xm0
+    punpcklwd       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddwd         m1, m13, [r5 + 3 * mmsize]
+    paddd           m7, m1
+    pmaddwd         m1, m13, [r5 + 2 * mmsize]
+    paddd           m9, m1
+    pmaddwd         m1, m13, [r5 + 1 * mmsize]
+    paddd           m11, m1
+    pmaddwd         m13, [r5]
+
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m7, 6
+%else
+    paddd           m6, m14
+    paddd           m7, m14
+%ifidn %1,pp
+    psrad           m6, 6
+    psrad           m7, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m7, 10
+%else
+    psrad           m6, 2
+    psrad           m7, 2
+%endif
+%endif
+
+    packssdw        m6, m7
+    vpermq          m6, m6, 11011000b
+%ifidn %1,pp
+    CLIPW           m6, m5, m3
+%elifidn %1, sp
+    CLIPW           m6, m5, m3
+%endif
+    vextracti128    xm7, m6, 1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m2, m0, [r5 + 3 * mmsize]
+    paddd           m8, m2
+    pmaddwd         m2, m0, [r5 + 2 * mmsize]
+    paddd           m10, m2
+    pmaddwd         m2, m0, [r5 + 1 * mmsize]
+    paddd           m12, m2
+    pmaddwd         m0, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhwd       xm6, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm6, 1
+    pmaddwd         m6, m1, [r5 + 3 * mmsize]
+    paddd           m9, m6
+    pmaddwd         m6, m1, [r5 + 2 * mmsize]
+    paddd           m11, m6
+    pmaddwd         m6, m1, [r5 + 1 * mmsize]
+    paddd           m13, m6
+    pmaddwd         m1, [r5]
+    movu            xm6, [r0 + r1]                  ; m6 = row 17
+    punpckhwd       xm4, xm2, xm6
+    punpcklwd       xm2, xm6
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 3 * mmsize]
+    paddd           m10, m4
+    pmaddwd         m4, m2, [r5 + 2 * mmsize]
+    paddd           m12, m4
+    pmaddwd         m2, [r5 + 1 * mmsize]
+    paddd           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhwd       xm2, xm6, xm4
+    punpcklwd       xm6, xm4
+    vinserti128     m6, m6, xm2, 1
+    pmaddwd         m2, m6, [r5 + 3 * mmsize]
+    paddd           m11, m2
+    pmaddwd         m2, m6, [r5 + 2 * mmsize]
+    paddd           m13, m2
+    pmaddwd         m6, [r5 + 1 * mmsize]
+    paddd           m1, m6
+    movu            xm2, [r0 + r4]                  ; m2 = row 19
+    punpckhwd       xm6, xm4, xm2
+    punpcklwd       xm4, xm2
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 3 * mmsize]
+    paddd           m12, m6
+    pmaddwd         m4, [r5 + 2 * mmsize]
+    paddd           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm6, [r0]                       ; m6 = row 20
+    punpckhwd       xm7, xm2, xm6
+    punpcklwd       xm2, xm6
+    vinserti128     m2, m2, xm7, 1
+    pmaddwd         m7, m2, [r5 + 3 * mmsize]
+    paddd           m13, m7
+    pmaddwd         m2, [r5 + 2 * mmsize]
+    paddd           m1, m2
+    movu            xm7, [r0 + r1]                  ; m7 = row 21
+    punpckhwd       xm2, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddwd         m6, [r5 + 3 * mmsize]
+    paddd           m0, m6
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
+    punpckhwd       xm6, xm7, xm2
+    punpcklwd       xm7, xm2
+    vinserti128     m7, m7, xm6, 1
+    pmaddwd         m7, [r5 + 3 * mmsize]
+    paddd           m1, m7
+
+%ifidn %1,ss
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+    psrad           m12, 6
+    psrad           m13, 6
+    psrad           m0, 6
+    psrad           m1, 6
+%else
+    paddd           m8, m14
+    paddd           m9, m14
+    paddd           m10, m14
+    paddd           m11, m14
+    paddd           m12, m14
+    paddd           m13, m14
+    paddd           m0, m14
+    paddd           m1, m14
+%ifidn %1,pp
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+    psrad           m12, 6
+    psrad           m13, 6
+    psrad           m0, 6
+    psrad           m1, 6
+%elifidn %1, sp
+    psrad           m8, 10
+    psrad           m9, 10
+    psrad           m10, 10
+    psrad           m11, 10
+    psrad           m12, 10
+    psrad           m13, 10
+    psrad           m0, 10
+    psrad           m1, 10
+%else
+    psrad           m8, 2
+    psrad           m9, 2
+    psrad           m10, 2
+    psrad           m11, 2
+    psrad           m12, 2
+    psrad           m13, 2
+    psrad           m0, 2
+    psrad           m1, 2
+%endif
+%endif
+
+    packssdw        m8, m9
+    packssdw        m10, m11
+    packssdw        m12, m13
+    packssdw        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+%ifidn %1,pp
+    CLIPW           m8, m5, m3
+    CLIPW           m10, m5, m3
+    CLIPW           m12, m5, m3
+    CLIPW           m0, m5, m3
+%elifidn %1, sp
+    CLIPW           m8, m5, m3
+    CLIPW           m10, m5, m3
+    CLIPW           m12, m5, m3
+    CLIPW           m0, m5, m3
+%endif
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r6], xm11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm12
+    movu            [r2 + r3], xm13
+    movu            [r2 + r3 * 2], xm0
+    movu            [r2 + r6], xm1
+    lea             r2, [r2 + r3 * 4]
+    sub             r0, r7
+    dec             r8d
+    jnz             .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_8xN pp, 16
+FILTER_VER_LUMA_AVX2_8xN pp, 32
+FILTER_VER_LUMA_AVX2_8xN ps, 16
+FILTER_VER_LUMA_AVX2_8xN ps, 32
+FILTER_VER_LUMA_AVX2_8xN sp, 16
+FILTER_VER_LUMA_AVX2_8xN sp, 32
+FILTER_VER_LUMA_AVX2_8xN ss, 16
+FILTER_VER_LUMA_AVX2_8xN ss, 32
+
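+; 8-row variant of the column helper above. Same register conventions, except
+; that the caller provides the rounding constant in m11 and pw_pixel_max in m12
+; (only 13 SIMD registers are used in total).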
+%macro PROCESS_LUMA_AVX2_W8_8R 1
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r7, [r0 + r1 * 4]
+    movu            xm4, [r7]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r7 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    paddd           m3, m7
+    pmaddwd         m5, [r5]
+    movu            xm7, [r7 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    paddd           m4, m8
+    pmaddwd         m6, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm8, [r7]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    paddd           m5, m9
+    pmaddwd         m7, [r5]
+    movu            xm9, [r7 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    paddd           m4, m10
+    pmaddwd         m8, [r5 + 1 * mmsize]
+    paddd           m6, m8
+    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm8, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm8, 1
+    pmaddwd         m8, m9, [r5 + 3 * mmsize]
+    paddd           m3, m8
+    pmaddwd         m8, m9, [r5 + 2 * mmsize]
+    paddd           m5, m8
+    pmaddwd         m9, [r5 + 1 * mmsize]
+    paddd           m7, m9
+    movu            xm8, [r7 + r4]                  ; m8 = row 11
+    punpckhwd       xm9, xm10, xm8
+    punpcklwd       xm10, xm8
+    vinserti128     m10, m10, xm9, 1
+    pmaddwd         m9, m10, [r5 + 3 * mmsize]
+    paddd           m4, m9
+    pmaddwd         m10, [r5 + 2 * mmsize]
+    paddd           m6, m10
+    lea             r7, [r7 + r1 * 4]
+    movu            xm9, [r7]                       ; m9 = row 12
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m5, m10
+    pmaddwd         m8, [r5 + 2 * mmsize]
+    paddd           m7, m8
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%else
+    paddd           m0, m11
+    paddd           m1, m11
+    paddd           m2, m11
+    paddd           m3, m11
+    paddd           m4, m11
+    paddd           m5, m11
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+    psrad           m4, 10
+    psrad           m5, 10
+%else
+    psrad           m0, 2
+    psrad           m1, 2
+    psrad           m2, 2
+    psrad           m3, 2
+    psrad           m4, 2
+    psrad           m5, 2
+%endif
+%endif
+
+    packssdw        m0, m1
+    packssdw        m2, m3
+    packssdw        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    pxor            m8, m8
+%ifidn %1,pp
+    CLIPW           m0, m8, m12
+    CLIPW           m2, m8, m12
+    CLIPW           m4, m8, m12
+%elifidn %1, sp
+    CLIPW           m0, m8, m12
+    CLIPW           m2, m8, m12
+    CLIPW           m4, m8, m12
+%endif
+
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r8, [r2 + r3 * 4]
+    movu            [r8], xm4
+    movu            [r8 + r3], xm5
+
+    movu            xm10, [r7 + r1]                 ; m10 = row 13
+    punpckhwd       xm0, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm0, 1
+    pmaddwd         m9, [r5 + 3 * mmsize]
+    paddd           m6, m9
+    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm1, xm10, xm0
+    punpcklwd       xm10, xm0
+    vinserti128     m10, m10, xm1, 1
+    pmaddwd         m10, [r5 + 3 * mmsize]
+    paddd           m7, m10
+
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m7, 6
+%else
+    paddd           m6, m11
+    paddd           m7, m11
+%ifidn %1,pp
+    psrad           m6, 6
+    psrad           m7, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m7, 10
+%else
+    psrad           m6, 2
+    psrad           m7, 2
+%endif
+%endif
+
+    packssdw        m6, m7
+    vpermq          m6, m6, 11011000b
+%ifidn %1,pp
+    CLIPW           m6, m8, m12
+%elifidn %1, sp
+    CLIPW           m6, m8, m12
+%endif
+    vextracti128    xm7, m6, 1
+    movu            [r8 + r3 * 2], xm6
+    movu            [r8 + r6], xm7
+%endmacro
+
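+; Emit interp_8tap_vert_%1_%2x8 (%2 = 16/32): loop PROCESS_LUMA_AVX2_W8_8R over
+; %2 / 8 eight-pixel columns. x86-64 only.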
+%macro FILTER_VER_LUMA_AVX2_Nx8 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_%2x8, 4, 10, 13
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m11, [pd_32]
+%elifidn %1, sp
+    mova            m11, [pd_524800]
+%else
+    vbroadcasti128  m11, [pd_n32768]
+%endif
+    mova            m12, [pw_pixel_max]
+    lea             r6, [r3 * 3]
+    mov             r9d, %2 / 8
+.loopW:
+    PROCESS_LUMA_AVX2_W8_8R %1
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_Nx8 pp, 32
+FILTER_VER_LUMA_AVX2_Nx8 pp, 16
+FILTER_VER_LUMA_AVX2_Nx8 ps, 32
+FILTER_VER_LUMA_AVX2_Nx8 ps, 16
+FILTER_VER_LUMA_AVX2_Nx8 sp, 32
+FILTER_VER_LUMA_AVX2_Nx8 sp, 16
+FILTER_VER_LUMA_AVX2_Nx8 ss, 32
+FILTER_VER_LUMA_AVX2_Nx8 ss, 16
+
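+; 32x24 is split into a 32x16 pass (four columns of PROCESS_LUMA_AVX2_W8_16R)
+; followed by a 32x8 pass (four columns of PROCESS_LUMA_AVX2_W8_8R). Between
+; the passes the rounding constant and clip bound are copied from m14/m3 into
+; m11/m12, which is where the 8-row helper expects them.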
+%macro FILTER_VER_LUMA_AVX2_32x24 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_32x24, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
+%else
+    vbroadcasti128  m14, [pd_n32768]
+%endif
+    lea             r6, [r3 * 3]
+    mov             r9d, 4
+.loopW:
+    PROCESS_LUMA_AVX2_W8_16R %1
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    lea             r9, [r1 * 4]
+    sub             r7, r9
+    lea             r0, [r7 - 48]
+    lea             r2, [r8 + r3 * 4 - 48]
+    mova            m11, m14
+    mova            m12, m3
+    mov             r9d, 4
+.loop:
+    PROCESS_LUMA_AVX2_W8_8R %1
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32x24 pp
+FILTER_VER_LUMA_AVX2_32x24 ps
+FILTER_VER_LUMA_AVX2_32x24 sp
+FILTER_VER_LUMA_AVX2_32x24 ss
+
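+; 4-row variant of the column helper: leaves the four filtered rows in xm0-xm3
+; for the caller to store. Only m0-m7 are touched; m7 must hold the rounding
+; constant, and clipping reads pw_pixel_max straight from memory.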
+%macro PROCESS_LUMA_AVX2_W8_4R 1
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m4, [r5 + 1 * mmsize]
+    paddd           m2, m4
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm4, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm4, 1
+    pmaddwd         m4, m5, [r5 + 2 * mmsize]
+    paddd           m1, m4
+    pmaddwd         m5, [r5 + 1 * mmsize]
+    paddd           m3, m5
+    movu            xm4, [r0 + r4]                  ; m4 = row 7
+    punpckhwd       xm5, xm6, xm4
+    punpcklwd       xm6, xm4
+    vinserti128     m6, m6, xm5, 1
+    pmaddwd         m5, m6, [r5 + 3 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m6, [r5 + 2 * mmsize]
+    paddd           m2, m6
+    lea             r0, [r0 + r1 * 4]
+    movu            xm5, [r0]                       ; m5 = row 8
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 3 * mmsize]
+    paddd           m1, m6
+    pmaddwd         m4, [r5 + 2 * mmsize]
+    paddd           m3, m4
+    movu            xm6, [r0 + r1]                  ; m6 = row 9
+    punpckhwd       xm4, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm4, 1
+    pmaddwd         m5, [r5 + 3 * mmsize]
+    paddd           m2, m5
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 10
+    punpckhwd       xm5, xm6, xm4
+    punpcklwd       xm6, xm4
+    vinserti128     m6, m6, xm5, 1
+    pmaddwd         m6, [r5 + 3 * mmsize]
+    paddd           m3, m6
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%else
+    paddd           m0, m7
+    paddd           m1, m7
+    paddd           m2, m7
+    paddd           m3, m7
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+%else
+    psrad           m0, 2
+    psrad           m1, 2
+    psrad           m2, 2
+    psrad           m3, 2
+%endif
+%endif
+
+    packssdw        m0, m1
+    packssdw        m2, m3
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    pxor            m4, m4
+%ifidn %1,pp
+    CLIPW           m0, m4, [pw_pixel_max]
+    CLIPW           m2, m4, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m0, m4, [pw_pixel_max]
+    CLIPW           m2, m4, [pw_pixel_max]
+%endif
+
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+%endmacro
+
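+; 16x4 is also built on x86-32: only 7 GPRs and 8 SIMD registers are requested,
+; so the two-column loop counter lives in a gprsize slot on the stack.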
+%macro FILTER_VER_LUMA_AVX2_16x4 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m7, [pd_32]
+%elifidn %1, sp
+    mova            m7, [pd_524800]
+%else
+    vbroadcasti128  m7, [pd_n32768]
+%endif
+    mov             dword [rsp], 2
+.loopW:
+    PROCESS_LUMA_AVX2_W8_4R %1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    lea             r6, [r3 * 3]
+    movu            [r2 + r6], xm3
+    add             r2, 16
+    lea             r6, [8 * r1 - 16]
+    sub             r0, r6
+    dec             dword [rsp]
+    jnz             .loopW
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16x4 pp
+FILTER_VER_LUMA_AVX2_16x4 ps
+FILTER_VER_LUMA_AVX2_16x4 sp
+FILTER_VER_LUMA_AVX2_16x4 ss
+
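+; 8x4 is a single invocation of PROCESS_LUMA_AVX2_W8_4R plus the four stores.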
+%macro FILTER_VER_LUMA_AVX2_8x4 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_8x4, 4, 6, 8
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m7, [pd_32]
+%elifidn %1, sp
+    mova            m7, [pd_524800]
+%else
+    vbroadcasti128  m7, [pd_n32768]
+%endif
+
+    PROCESS_LUMA_AVX2_W8_4R %1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    lea             r4, [r3 * 3]
+    movu            [r2 + r4], xm3
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_8x4 pp
+FILTER_VER_LUMA_AVX2_8x4 ps
+FILTER_VER_LUMA_AVX2_8x4 sp
+FILTER_VER_LUMA_AVX2_8x4 ss
+
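+; 16x12: two eight-pixel columns (r9d = 2), each producing 12 output rows with
+; the filter body written inline. x86-64 only.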
+%macro FILTER_VER_LUMA_AVX2_16x12 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_16x12, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
+%else
+    vbroadcasti128  m14, [pd_n32768]
+%endif
+    mova            m13, [pw_pixel_max]
+    pxor            m12, m12
+    lea             r6, [r3 * 3]
+    mov             r9d, 2
+.loopW:
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r7, [r0 + r1 * 4]
+    movu            xm4, [r7]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r7 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    paddd           m3, m7
+    pmaddwd         m5, [r5]
+    movu            xm7, [r7 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    paddd           m4, m8
+    pmaddwd         m6, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm8, [r7]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    paddd           m5, m9
+    pmaddwd         m7, [r5]
+    movu            xm9, [r7 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    paddd           m4, m10
+    pmaddwd         m10, m8, [r5 + 1 * mmsize]
+    paddd           m6, m10
+    pmaddwd         m8, [r5]
+    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm11, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddwd         m11, m9, [r5 + 3 * mmsize]
+    paddd           m3, m11
+    pmaddwd         m11, m9, [r5 + 2 * mmsize]
+    paddd           m5, m11
+    pmaddwd         m11, m9, [r5 + 1 * mmsize]
+    paddd           m7, m11
+    pmaddwd         m9, [r5]
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%else
+    paddd           m0, m14
+    paddd           m1, m14
+    paddd           m2, m14
+    paddd           m3, m14
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+%else
+    psrad           m0, 2
+    psrad           m1, 2
+    psrad           m2, 2
+    psrad           m3, 2
+%endif
+%endif
+
+    packssdw        m0, m1
+    packssdw        m2, m3
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+%ifidn %1,pp
+    CLIPW           m0, m12, m13
+    CLIPW           m2, m12, m13
+%elifidn %1, sp
+    CLIPW           m0, m12, m13
+    CLIPW           m2, m12, m13
+%endif
+
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+
+    movu            xm11, [r7 + r4]                 ; m11 = row 11
+    punpckhwd       xm0, xm10, xm11
+    punpcklwd       xm10, xm11
+    vinserti128     m10, m10, xm0, 1
+    pmaddwd         m0, m10, [r5 + 3 * mmsize]
+    paddd           m4, m0
+    pmaddwd         m0, m10, [r5 + 2 * mmsize]
+    paddd           m6, m0
+    pmaddwd         m0, m10, [r5 + 1 * mmsize]
+    paddd           m8, m0
+    pmaddwd         m10, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm0, [r7]                      ; m0 = row 12
+    punpckhwd       xm1, xm11, xm0
+    punpcklwd       xm11, xm0
+    vinserti128     m11, m11, xm1, 1
+    pmaddwd         m1, m11, [r5 + 3 * mmsize]
+    paddd           m5, m1
+    pmaddwd         m1, m11, [r5 + 2 * mmsize]
+    paddd           m7, m1
+    pmaddwd         m1, m11, [r5 + 1 * mmsize]
+    paddd           m9, m1
+    pmaddwd         m11, [r5]
+    movu            xm2, [r7 + r1]                 ; m2 = row 13
+    punpckhwd       xm1, xm0, xm2
+    punpcklwd       xm0, xm2
+    vinserti128     m0, m0, xm1, 1
+    pmaddwd         m1, m0, [r5 + 3 * mmsize]
+    paddd           m6, m1
+    pmaddwd         m1, m0, [r5 + 2 * mmsize]
+    paddd           m8, m1
+    pmaddwd         m0, [r5 + 1 * mmsize]
+    paddd           m10, m0
+    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm1, xm2, xm0
+    punpcklwd       xm2, xm0
+    vinserti128     m2, m2, xm1, 1
+    pmaddwd         m1, m2, [r5 + 3 * mmsize]
+    paddd           m7, m1
+    pmaddwd         m1, m2, [r5 + 2 * mmsize]
+    paddd           m9, m1
+    pmaddwd         m2, [r5 + 1 * mmsize]
+    paddd           m11, m2
+
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%else
+    paddd           m4, m14
+    paddd           m5, m14
+    paddd           m6, m14
+    paddd           m7, m14
+%ifidn %1,pp
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m5, 10
+    psrad           m6, 10
+    psrad           m7, 10
+%else
+    psrad           m4, 2
+    psrad           m5, 2
+    psrad           m6, 2
+    psrad           m7, 2
+%endif
+%endif
+
+    packssdw        m4, m5
+    packssdw        m6, m7
+    vpermq          m4, m4, 11011000b
+    vpermq          m6, m6, 11011000b
+%ifidn %1,pp
+    CLIPW           m4, m12, m13
+    CLIPW           m6, m12, m13
+%elifidn %1, sp
+    CLIPW           m4, m12, m13
+    CLIPW           m6, m12, m13
+%endif
+    lea             r8, [r2 + r3 * 4]
+    vextracti128    xm1, m4, 1
+    vextracti128    xm7, m6, 1
+    movu            [r8], xm4
+    movu            [r8 + r3], xm1
+    movu            [r8 + r3 * 2], xm6
+    movu            [r8 + r6], xm7
+
+    movu            xm1, [r7 + r4]                  ; m1 = row 15
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m2, m0, [r5 + 3 * mmsize]
+    paddd           m8, m2
+    pmaddwd         m0, [r5 + 2 * mmsize]
+    paddd           m10, m0
+    lea             r7, [r7 + r1 * 4]
+    movu            xm2, [r7]                       ; m2 = row 16
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m3, m1, [r5 + 3 * mmsize]
+    paddd           m9, m3
+    pmaddwd         m1, [r5 + 2 * mmsize]
+    paddd           m11, m1
+    movu            xm3, [r7 + r1]                  ; m3 = row 17
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m2, [r5 + 3 * mmsize]
+    paddd           m10, m2
+    movu            xm4, [r7 + r1 * 2]              ; m4 = row 18
+    punpckhwd       xm2, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm2, 1
+    pmaddwd         m3, [r5 + 3 * mmsize]
+    paddd           m11, m3
+
+%ifidn %1,ss
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+%else
+    paddd           m8, m14
+    paddd           m9, m14
+    paddd           m10, m14
+    paddd           m11, m14
+%ifidn %1,pp
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+%elifidn %1, sp
+    psrad           m8, 10
+    psrad           m9, 10
+    psrad           m10, 10
+    psrad           m11, 10
+%else
+    psrad           m8, 2
+    psrad           m9, 2
+    psrad           m10, 2
+    psrad           m11, 2
+%endif
+%endif
+
+    packssdw        m8, m9
+    packssdw        m10, m11
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+%ifidn %1,pp
+    CLIPW           m8, m12, m13
+    CLIPW           m10, m12, m13
+%elifidn %1, sp
+    CLIPW           m8, m12, m13
+    CLIPW           m10, m12, m13
+%endif
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], xm8
+    movu            [r8 + r3], xm9
+    movu            [r8 + r3 * 2], xm10
+    movu            [r8 + r6], xm11
+    add             r2, 16
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16x12 pp
+FILTER_VER_LUMA_AVX2_16x12 ps
+FILTER_VER_LUMA_AVX2_16x12 sp
+FILTER_VER_LUMA_AVX2_16x12 ss
+
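+; 4x8: rows are only four pixels wide, so two consecutive input rows are packed
+; into one ymm register (e.g. m0 = [2 1 1 0]) and each pmaddwd contributes to
+; two output rows at once.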
+%macro FILTER_VER_LUMA_AVX2_4x8 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,pp
+    vbroadcasti128  m7, [pd_32]
+%elifidn %1, sp
+    mova            m7, [pd_524800]
+%else
+    vbroadcasti128  m7, [pd_n32768]
+%endif
+    lea             r6, [r3 * 3]
+
+    movq            xm0, [r0]
+    movq            xm1, [r0 + r1]
+    punpcklwd       xm0, xm1
+    movq            xm2, [r0 + r1 * 2]
+    punpcklwd       xm1, xm2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
+    pmaddwd         m0, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm2, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + r1 * 2]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m4, [r5 + 1 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m4, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm1, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm6, [r0]
+    punpcklwd       xm3, xm6
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m1, [r5 + 2 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m5, m1, [r5 + 1 * mmsize]
+    paddd           m4, m5
+    pmaddwd         m1, [r5]
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm6, xm3
+    movq            xm5, [r0 + 2 * r1]
+    punpcklwd       xm3, xm5
+    vinserti128     m6, m6, xm3, 1                  ; m6 = [A 9 9 8]
+    pmaddwd         m3, m6, [r5 + 3 * mmsize]
+    paddd           m2, m3
+    pmaddwd         m3, m6, [r5 + 2 * mmsize]
+    paddd           m4, m3
+    pmaddwd         m6, [r5 + 1 * mmsize]
+    paddd           m1, m6
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m2, 6
+%else
+    paddd           m0, m7
+    paddd           m2, m7
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m2, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m2, 10
+%else
+    psrad           m0, 2
+    psrad           m2, 2
+%endif
+%endif
+
+    packssdw        m0, m2
+    pxor            m6, m6
+    mova            m3, [pw_pixel_max]
+%ifidn %1,pp
+    CLIPW           m0, m6, m3
+%elifidn %1, sp
+    CLIPW           m0, m6, m3
+%endif
+
+    vextracti128    xm2, m0, 1
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm2
+
+    movq            xm2, [r0 + r4]
+    punpcklwd       xm5, xm2
+    lea             r0, [r0 + 4 * r1]
+    movq            xm0, [r0]
+    punpcklwd       xm2, xm0
+    vinserti128     m5, m5, xm2, 1                  ; m5 = [C B B A]
+    pmaddwd         m2, m5, [r5 + 3 * mmsize]
+    paddd           m4, m2
+    pmaddwd         m5, [r5 + 2 * mmsize]
+    paddd           m1, m5
+    movq            xm2, [r0 + r1]
+    punpcklwd       xm0, xm2
+    movq            xm5, [r0 + 2 * r1]
+    punpcklwd       xm2, xm5
+    vinserti128     m0, m0, xm2, 1                  ; m0 = [E D D C]
+    pmaddwd         m0, [r5 + 3 * mmsize]
+    paddd           m1, m0
+
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m1, 6
+%else
+    paddd           m4, m7
+    paddd           m1, m7
+%ifidn %1,pp
+    psrad           m4, 6
+    psrad           m1, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m1, 10
+%else
+    psrad           m4, 2
+    psrad           m1, 2
+%endif
+%endif
+
+    packssdw        m4, m1
+%ifidn %1,pp
+    CLIPW           m4, m6, m3
+%elifidn %1, sp
+    CLIPW           m4, m6, m3
+%endif
+
+    vextracti128    xm1, m4, 1
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm4
+    movq            [r2 + r3], xm1
+    movhps          [r2 + r3 * 2], xm4
+    movhps          [r2 + r6], xm1
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_4x8 pp
+FILTER_VER_LUMA_AVX2_4x8 ps
+FILTER_VER_LUMA_AVX2_4x8 sp
+FILTER_VER_LUMA_AVX2_4x8 ss
+
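+; PROCESS_LUMA_AVX2_W4_16R: 8-tap vertical filter over a 4-wide, 16-row column.
+; The caller provides r5 (coefficient table), r4 = 3 * srcStride, r6 = 3 * dstStride
+; and, except for the ss path, m7 with the rounding constant. Reused by the 4x16
+; kernel and for the rightmost 4-pixel column of the 12x16 kernel below.
+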
+%macro PROCESS_LUMA_AVX2_W4_16R 1
+    movq            xm0, [r0]
+    movq            xm1, [r0 + r1]
+    punpcklwd       xm0, xm1
+    movq            xm2, [r0 + r1 * 2]
+    punpcklwd       xm1, xm2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
+    pmaddwd         m0, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm2, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + r1 * 2]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m4, [r5 + 1 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m4, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm1, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm6, [r0]
+    punpcklwd       xm3, xm6
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m1, [r5 + 2 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m5, m1, [r5 + 1 * mmsize]
+    paddd           m4, m5
+    pmaddwd         m1, [r5]
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm6, xm3
+    movq            xm5, [r0 + 2 * r1]
+    punpcklwd       xm3, xm5
+    vinserti128     m6, m6, xm3, 1                  ; m6 = [10 9 9 8]
+    pmaddwd         m3, m6, [r5 + 3 * mmsize]
+    paddd           m2, m3
+    pmaddwd         m3, m6, [r5 + 2 * mmsize]
+    paddd           m4, m3
+    pmaddwd         m3, m6, [r5 + 1 * mmsize]
+    paddd           m1, m3
+    pmaddwd         m6, [r5]
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m2, 6
+%else
+    paddd           m0, m7
+    paddd           m2, m7
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m2, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m2, 10
+%else
+    psrad           m0, 2
+    psrad           m2, 2
+%endif
+%endif
+
+    packssdw        m0, m2
+    pxor            m3, m3
+%ifidn %1,pp
+    CLIPW           m0, m3, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m0, m3, [pw_pixel_max]
+%endif
+
+    vextracti128    xm2, m0, 1
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm2
+
+    movq            xm2, [r0 + r4]
+    punpcklwd       xm5, xm2
+    lea             r0, [r0 + 4 * r1]
+    movq            xm0, [r0]
+    punpcklwd       xm2, xm0
+    vinserti128     m5, m5, xm2, 1                  ; m5 = [12 11 11 10]
+    pmaddwd         m2, m5, [r5 + 3 * mmsize]
+    paddd           m4, m2
+    pmaddwd         m2, m5, [r5 + 2 * mmsize]
+    paddd           m1, m2
+    pmaddwd         m2, m5, [r5 + 1 * mmsize]
+    paddd           m6, m2
+    pmaddwd         m5, [r5]
+    movq            xm2, [r0 + r1]
+    punpcklwd       xm0, xm2
+    movq            xm3, [r0 + 2 * r1]
+    punpcklwd       xm2, xm3
+    vinserti128     m0, m0, xm2, 1                  ; m0 = [14 13 13 12]
+    pmaddwd         m2, m0, [r5 + 3 * mmsize]
+    paddd           m1, m2
+    pmaddwd         m2, m0, [r5 + 2 * mmsize]
+    paddd           m6, m2
+    pmaddwd         m2, m0, [r5 + 1 * mmsize]
+    paddd           m5, m2
+    pmaddwd         m0, [r5]
+
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m1, 6
+%else
+    paddd           m4, m7
+    paddd           m1, m7
+%ifidn %1,pp
+    psrad           m4, 6
+    psrad           m1, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m1, 10
+%else
+    psrad           m4, 2
+    psrad           m1, 2
+%endif
+%endif
+
+    packssdw        m4, m1
+    pxor            m2, m2
+%ifidn %1,pp
+    CLIPW           m4, m2, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m4, m2, [pw_pixel_max]
+%endif
+
+    vextracti128    xm1, m4, 1
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm4
+    movq            [r2 + r3], xm1
+    movhps          [r2 + r3 * 2], xm4
+    movhps          [r2 + r6], xm1
+
+    movq            xm4, [r0 + r4]
+    punpcklwd       xm3, xm4
+    lea             r0, [r0 + 4 * r1]
+    movq            xm1, [r0]
+    punpcklwd       xm4, xm1
+    vinserti128     m3, m3, xm4, 1                  ; m3 = [16 15 15 14]
+    pmaddwd         m4, m3, [r5 + 3 * mmsize]
+    paddd           m6, m4
+    pmaddwd         m4, m3, [r5 + 2 * mmsize]
+    paddd           m5, m4
+    pmaddwd         m4, m3, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m3, [r5]
+    movq            xm4, [r0 + r1]
+    punpcklwd       xm1, xm4
+    movq            xm2, [r0 + 2 * r1]
+    punpcklwd       xm4, xm2
+    vinserti128     m1, m1, xm4, 1                  ; m1 = [18 17 17 16]
+    pmaddwd         m4, m1, [r5 + 3 * mmsize]
+    paddd           m5, m4
+    pmaddwd         m4, m1, [r5 + 2 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m1, [r5 + 1 * mmsize]
+    paddd           m3, m1
+
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m5, 6
+%else
+    paddd           m6, m7
+    paddd           m5, m7
+%ifidn %1,pp
+    psrad           m6, 6
+    psrad           m5, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m5, 10
+%else
+    psrad           m6, 2
+    psrad           m5, 2
+%endif
+%endif
+
+    packssdw        m6, m5
+    pxor            m1, m1
+%ifidn %1,pp
+    CLIPW           m6, m1, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m6, m1, [pw_pixel_max]
+%endif
+
+    vextracti128    xm5, m6, 1
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm6
+    movq            [r2 + r3], xm5
+    movhps          [r2 + r3 * 2], xm6
+    movhps          [r2 + r6], xm5
+
+    movq            xm4, [r0 + r4]
+    punpcklwd       xm2, xm4
+    lea             r0, [r0 + 4 * r1]
+    movq            xm6, [r0]
+    punpcklwd       xm4, xm6
+    vinserti128     m2, m2, xm4, 1                  ; m2 = [20 19 19 18]
+    pmaddwd         m4, m2, [r5 + 3 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5 + 2 * mmsize]
+    paddd           m3, m2
+    movq            xm4, [r0 + r1]
+    punpcklwd       xm6, xm4
+    movq            xm2, [r0 + 2 * r1]
+    punpcklwd       xm4, xm2
+    vinserti128     m6, m6, xm4, 1                  ; m6 = [22 21 21 20]
+    pmaddwd         m6, [r5 + 3 * mmsize]
+    paddd           m3, m6
+
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m3, 6
+%else
+    paddd           m0, m7
+    paddd           m3, m7
+%ifidn %1,pp
+    psrad           m0, 6
+    psrad           m3, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m3, 10
+%else
+    psrad           m0, 2
+    psrad           m3, 2
+%endif
+%endif
+
+    packssdw        m0, m3
+%ifidn %1,pp
+    CLIPW           m0, m1, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m0, m1, [pw_pixel_max]
+%endif
+
+    vextracti128    xm3, m0, 1
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm0
+    movq            [r2 + r3], xm3
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm3
+%endmacro
+
+%macro FILTER_VER_LUMA_AVX2_4x16 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_4x16, 4, 7, 8
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m7, [pd_32]
+%elifidn %1, sp
+    mova            m7, [pd_524800]
+%else
+    vbroadcasti128  m7, [pd_n32768]
+%endif
+    lea             r6, [r3 * 3]
+    PROCESS_LUMA_AVX2_W4_16R %1
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_4x16 pp
+FILTER_VER_LUMA_AVX2_4x16 ps
+FILTER_VER_LUMA_AVX2_4x16 sp
+FILTER_VER_LUMA_AVX2_4x16 ss
+
+%macro FILTER_VER_LUMA_AVX2_12x16 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_12x16, 4, 9, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+    add             r3d, r3d
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
+%else
+    vbroadcasti128  m14, [pd_n32768]
+%endif
+    lea             r6, [r3 * 3]
+    PROCESS_LUMA_AVX2_W8_16R %1
+    add             r2, 16
+    add             r0, 16
+    mova            m7, m14
+    PROCESS_LUMA_AVX2_W4_16R %1
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_12x16 pp
+FILTER_VER_LUMA_AVX2_12x16 ps
+FILTER_VER_LUMA_AVX2_12x16 sp
+FILTER_VER_LUMA_AVX2_12x16 ss
+
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------
--- a/source/common/x86/ipfilter8.asm	Thu Feb 26 11:03:54 2015 -0600
+++ b/source/common/x86/ipfilter8.asm	Thu Feb 26 11:25:55 2015 -0600
@@ -35,6 +35,10 @@ ALIGN 32
 const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
 
 ALIGN 32
+const interp_vert_shuf, db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9
+                        db 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13
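+; byte interleave of vertically adjacent rows, so pmaddubsw can apply the 4-tap
+; coefficient pairs two rows at a time (used by the AVX2 2x4 vertical filter below)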
+
+ALIGN 32
 const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
                          dd 2, 3, 3, 4, 4, 5, 5, 6
 
@@ -51,6 +55,8 @@ tab_Cm:    db 0, 2, 1, 3, 0, 2, 1, 3, 0,
 
 tab_c_526336:   times 4 dd 8192*64+2048
 
+pd_526336:      times 8 dd 8192*64+2048
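+; (same constant as tab_c_526336 above, replicated to 8 dwords for full-ymm loads)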
+
 tab_ChromaCoeff: db  0, 64,  0,  0
                  db -2, 58, 10, -2
                  db -4, 54, 16, -2
@@ -59,6 +65,30 @@ tab_ChromaCoeff: db  0, 64,  0,  0
                  db -4, 28, 46, -6
                  db -2, 16, 54, -4
                  db -2, 10, 58, -2
+ALIGN 32
+tab_ChromaCoeff_V: times 8 db 0, 64
+                   times 8 db 0,  0
+
+                   times 8 db -2, 58
+                   times 8 db 10, -2
+
+                   times 8 db -4, 54
+                   times 8 db 16, -2
+
+                   times 8 db -6, 46
+                   times 8 db 28, -4
+
+                   times 8 db -4, 36
+                   times 8 db 36, -4
+
+                   times 8 db -4, 28
+                   times 8 db 46, -6
+
+                   times 8 db -2, 16
+                   times 8 db 54, -4
+
+                   times 8 db -2, 10
+                   times 8 db 58, -2
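+; each filter index stores its 4 taps as two 16-byte rows of byte pairs (32 bytes
+; per index), matching the 'shl r4d, 5' indexing in the AVX2 2x4 vertical chroma kernel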
 
 tab_ChromaCoeffV: times 4 dw 0, 64
                   times 4 dw 0, 0
@@ -109,6 +139,27 @@ tab_LumaCoeffV: times 4 dw 0, 0
                 times 4 dw 58, -10
                 times 4 dw 4, -1
 
+ALIGN 32
+pw_LumaCoeffVer: times 8 dw 0, 0
+                 times 8 dw 0, 64
+                 times 8 dw 0, 0
+                 times 8 dw 0, 0
+
+                 times 8 dw -1, 4
+                 times 8 dw -10, 58
+                 times 8 dw 17, -5
+                 times 8 dw 1, 0
+
+                 times 8 dw -1, 4
+                 times 8 dw -11, 40
+                 times 8 dw 40, -11
+                 times 8 dw 4, -1
+
+                 times 8 dw 0, 1
+                 times 8 dw -5, 17
+                 times 8 dw 58, -10
+                 times 8 dw 4, -1
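+; word-coefficient counterpart of tab_LumaCoeffVer below, each tap pair replicated
+; across a full ymm row (128 bytes per filter index), presumably for pmaddwd-based
+; vertical kernels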
+
 tab_LumaCoeffVer: times 8 db 0, 0
                   times 8 db 0, 64
                   times 8 db 0, 0
@@ -183,6 +234,10 @@ ALIGN 32
 interp4_horiz_shuf1:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
                         db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 
+
+ALIGN 32
+interp8_hps_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
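+; dword permute that restores sample order after the phaddw packing in the
+; interp_8tap_horiz_ps AVX2 kernels below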
+
 SECTION .text
 
 cextern pb_128
@@ -1664,6 +1719,282 @@ cglobal interp_4tap_horiz_pp_8x8, 4,6,6
     IPFILTER_LUMA_64x_avx2 64 , 32
     IPFILTER_LUMA_64x_avx2 64 , 16
 
+;-----------------------------------------------------------------------------------------------------------------------------
+;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
+
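+; The ps kernels below keep 16-bit intermediates: the horizontal sum is biased by
+; -pw_2000 (apparently the IF_INTERNAL_OFFS term) instead of being rounded and
+; clipped, and the dst stride is doubled. When isRowExt (r5m) is non-zero, src is
+; rewound by (N/2 - 1) rows and the row count extended so the following vertical
+; pass has its reference rows.
+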
+%macro IPFILTER_LUMA_PS_4x_AVX2 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6
+    mov                         r5d,               r5m
+    mov                         r4d,               r4m
+%ifdef PIC
+    lea                         r6,                [tab_LumaCoeff]
+    vpbroadcastq                m0,                [r6 + r4 * 8]
+%else
+    vpbroadcastq                m0,                [tab_LumaCoeff + r4 * 8]
+%endif
+    mova                        m1,                [tab_Lm]
+    mov                         r9d,               %2                           ;height
+    add                         r3d,               r3d
+    vbroadcasti128              m2,                [pw_2000]
+
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1 - shuffle order table
+    ; m2 - pw_2000
+
+    xor                         r10,               r10                          ; loop count variable
+    sub                         r0,                3
+    test                        r5d,               r5d
+    jz                          .label
+    lea                         r8,                [r1 * 3]                     ; r8 = (N / 2 - 1) * srcStride
+    sub                         r0,                r8                           ; r0(src)-r8
+    add                         r9,                4                            ; blkheight += 4 (N - 1 - 3; the last three rows are handled after the loop)
+
+.label
+      add                       r10,               4
+
+    ; Row 0-1
+    vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m3,                m1                           ; shuffled based on the col order tab_Lm
+    pmaddubsw                   m3,                m0
+    vbroadcasti128              m4,                [r0 + r1]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m1
+    pmaddubsw                   m4,                m0
+    phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+    ; Row 2-3
+    lea                         r0,                [r0 + r1 * 2]                ; advance src to row 2
+    vbroadcasti128              m4,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m1
+    pmaddubsw                   m4,                m0
+    vbroadcasti128              m5,                [r0 + r1]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m5,                m1
+    pmaddubsw                   m5,                m0
+    phaddw                      m4,                m5                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
+    phaddw                      m3,                m4                           ; all rows and col completed.
+
+    mova                        m5,                [interp8_hps_shuf]
+    vpermd                      m3,                m5,               m3
+    psubw                       m3,                m2
+
+    vextracti128                xm4,               m3,               1
+    lea                         r7,                [r3 * 3]
+    movq                        [r2],              xm3                          ;row 0
+    movhps                      [r2 + r3],         xm3                          ;row 1
+    movq                        [r2 + r3 * 2],     xm4                          ;row 2
+    movhps                      [r2 + r7],         xm4                          ;row 3
+
+    lea                         r0,                [r0 + r1 * 2]                ; advance src to the next 4-row group (row 4 on the first pass)
+    lea                         r2,                [r2 + r3 * 4]                ; advance dst to the next 4-row group
+    cmp                         r10,               r9
+    jnz                         .label
+    test                        r5d,               r5d
+    jz                          .end             
+
+    ; next two extension rows (rows 8-9 in the 4x4 case)
+    vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m3,                m1
+    pmaddubsw                   m3,                m0
+    vbroadcasti128              m4,                [r0 + r1]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m1
+    pmaddubsw                   m4,                m0
+    phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+    ; last extension row
+    lea                         r0,                [r0 + r1 * 2]
+    vbroadcasti128              m4,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m1
+    pmaddubsw                   m4,                m0
+    phaddw                      m4,                m4                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
+    phaddw                      m3,                m4 
+
+    mova                        m4,                [interp8_hps_shuf]
+    vpermd                      m3,                m4,            m3
+    psubw                       m3,                m2
+
+    vextracti128                xm4,               m3,            1
+    movq                        [r2],              xm3
+    movhps                      [r2 + r3],         xm3
+    movq                        [r2 + r3 * 2],     xm4
+.end
+RET
+%endif
+%endmacro
+
+
+    IPFILTER_LUMA_PS_4x_AVX2 4 , 4
+    IPFILTER_LUMA_PS_4x_AVX2 4 , 8
+    IPFILTER_LUMA_PS_4x_AVX2 4 , 16
+
+%macro IPFILTER_LUMA_PS_8x_AVX2 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7
+    mov                         r5d,               r5m
+    mov                         r4d,               r4m
+%ifdef PIC
+    lea                         r6,                [tab_LumaCoeff]
+    vpbroadcastq                m0,                [r6 + r4 * 8]
+%else
+    vpbroadcastq                m0,                [tab_LumaCoeff + r4 * 8]
+%endif
+    mova                        m6,                [tab_Lm + 32]
+    mova                        m1,                [tab_Lm]
+    mov                         r9d,               %2                           ;height
+    add                         r3d,               r3d
+    vbroadcasti128              m2,                [pw_2000]
+
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1 , m6 - shuffle order table
+    ; m2 - pw_2000
+
+    xor                         r7,               r7                          ; loop count variable
+    sub                         r0,                3
+    test                        r5d,               r5d
+    jz                          .label
+    lea                         r8,                [r1 * 3]                     ; r8 = (N / 2 - 1) * srcStride
+    sub                         r0,                r8                           ; r0(src)-r8
+    add                         r9,                6                            ; blkheight += 6 (N - 1 - 1; the last row is handled after the loop)
+
+.label
+      add                       r7,               2
+
+    ; Row 0
+    vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m3,        m6
+    pshufb                      m3,                m1                           ; shuffled based on the col order tab_Lm
+    pmaddubsw                   m3,                m0
+    pmaddubsw                   m4,                m0
+    phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+    ; Row 1
+    lea                         r0,                [r0 + r1]                ; advance src to row 1
+    vbroadcasti128              m4,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m5,                m4,         m6
+    pshufb                      m4,                m1
+    pmaddubsw                   m4,                m0
+    pmaddubsw                   m5,                m0
+    phaddw                      m4,                m5                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
+
+    phaddw                      m3,                m4                           ; all rows and col completed.
+
+    mova                        m5,                [interp8_hps_shuf]
+    vpermd                      m3,                m5,               m3
+    psubw                       m3,                m2
+
+    vextracti128                xm4,               m3,               1
+    movu                        [r2],              xm3                          ;row 0
+    movu                        [r2 + r3],         xm4                          ;row 1
+
+    lea                         r0,                [r0 + r1]                ; advance src to the next row pair
+    lea                         r2,                [r2 + r3 * 2]                ; advance dst to the next row pair
+    cmp                         r7,                r9
+    jnz                         .label
+    test                        r5d,              r5d
+    jz                          .end             
+
+    ; last extension row
+    vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m3,          m6
+    pshufb                      m3,                m1
+    pmaddubsw                   m3,                m0
+    pmaddubsw                   m4,                m0
+    phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+    phaddw                      m3,                m4 
+
+    mova                        m4,                [interp8_hps_shuf]
+    vpermd                      m3,                m4,            m3
+    psubw                       m3,                m2
+
+    movu                        [r2],              xm3
+.end
+RET
+%endif
+%endmacro
+
+IPFILTER_LUMA_PS_8x_AVX2 8 , 8
+IPFILTER_LUMA_PS_8x_AVX2 8 , 16
+IPFILTER_LUMA_PS_8x_AVX2 8 , 32
+IPFILTER_LUMA_PS_8x_AVX2 8 , 4
+
+
+%macro IPFILTER_LUMA_PS_16x_AVX2 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7
+    mov                         r5d,               r5m
+    mov                         r4d,               r4m
+%ifdef PIC
+    lea                         r6,                [tab_LumaCoeff]
+    vpbroadcastq                m0,                [r6 + r4 * 8]
+%else
+    vpbroadcastq                m0,                [tab_LumaCoeff + r4 * 8]
+%endif
+    mova                        m6,                [tab_Lm + 32]
+    mova                        m1,                [tab_Lm]
+    mov                         r9,                %2                           ;height
+    add                         r3d,               r3d
+    vbroadcasti128              m2,                [pw_2000]
+
+    ; register map
+    ; m0      - interpolate coeff
+    ; m1 , m6 - shuffle order table
+    ; m2      - pw_2000
+
+    xor                         r7,                r7                          ; loop count variable
+    sub                         r0,                3
+    test                        r5d,               r5d
+    jz                          .label
+    lea                         r8,                [r1 * 3]                     ; r8 = (N / 2 - 1) * srcStride
+    sub                         r0,                r8                           ; r0(src)-r8
+    add                         r9,                7                            ; blkheight += 7 (N - 1; all rows are handled inside the loop)
+
+.label
+    ; Row 0
+    vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m4,                m3,             m6           ; row 0 (col 4 to 7)
+    pshufb                      m3,                m1                           ; shuffled based on the col order tab_Lm row 0 (col 0 to 3)
+    pmaddubsw                   m3,                m0
+    pmaddubsw                   m4,                m0
+    phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+    vbroadcasti128              m4,                [r0 + 8]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    pshufb                      m5,                m4,            m6            ;row 1 (col 4 to 7)
+    pshufb                      m4,                m1                           ;row 1 (col 0 to 3)
+    pmaddubsw                   m4,                m0
+    pmaddubsw                   m5,                m0
+    phaddw                      m4,                m5                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
+    phaddw                      m3,                m4                           ; all rows and col completed.
+
+    mova                        m5,                [interp8_hps_shuf]
+    vpermd                      m3,                m5,               m3
+    psubw                       m3,                m2
+
+    movu                        [r2],              m3                          ;row 0
+
+    lea                         r0,                [r0 + r1]                ; advance src to the next row
+    lea                         r2,                [r2 + r3]                ; advance dst to the next row
+    dec                         r9d
+    jnz                         .label
+
+RET
+%endif
+%endmacro
+
+
+IPFILTER_LUMA_PS_16x_AVX2 16 , 16
+IPFILTER_LUMA_PS_16x_AVX2 16 , 8
+IPFILTER_LUMA_PS_16x_AVX2 16 , 12
+IPFILTER_LUMA_PS_16x_AVX2 16 , 4
+IPFILTER_LUMA_PS_16x_AVX2 16 , 32
+IPFILTER_LUMA_PS_16x_AVX2 16 , 64
+
+
 ;--------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
@@ -1963,6 +2294,60 @@ pextrw      [r2 + r3], m2, 6
 
 RET
 
+%macro FILTER_VER_CHROMA_AVX2_2x4 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_2x4, 4, 6, 2
+    mov             r4d, r4m
+    shl             r4d, 5
+    sub             r0, r1
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff_V]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeff_V + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+
+    pinsrw          xm1, [r0], 0
+    pinsrw          xm1, [r0 + r1], 1
+    pinsrw          xm1, [r0 + r1 * 2], 2
+    pinsrw          xm1, [r0 + r4], 3
+    lea             r0, [r0 + r1 * 4]
+    pinsrw          xm1, [r0], 4
+    pinsrw          xm1, [r0 + r1], 5
+    pinsrw          xm1, [r0 + r1 * 2], 6
+
+    pshufb          xm0, xm1, [interp_vert_shuf]
+    pshufb          xm1, [interp_vert_shuf + 16]
+    vinserti128     m0, m0, xm1, 1
+    pmaddubsw       m0, [r5]
+    vextracti128    xm1, m0, 1
+    paddw           xm0, xm1
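+    ; pp: (sum + 32) >> 6 via pmulhrsw with pw_512, then packed/clipped to 8-bit pixels
+    ; ps: keep the 16-bit intermediate, biased by -pw_2000, stored with a doubled stride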
+%ifidn %1,pp
+    pmulhrsw        xm0, [pw_512]
+    packuswb        xm0, xm0
+    lea             r4, [r3 * 3]
+    pextrw          [r2], xm0, 0
+    pextrw          [r2 + r3], xm0, 1
+    pextrw          [r2 + r3 * 2], xm0, 2
+    pextrw          [r2 + r4], xm0, 3
+%else
+    add             r3d, r3d
+    lea             r4, [r3 * 3]
+    psubw           xm0, [pw_2000]
+    movd            [r2], xm0
+    pextrd          [r2 + r3], xm0, 1
+    pextrd          [r2 + r3 * 2], xm0, 2
+    pextrd          [r2 + r4], xm0, 3
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_2x4 pp
+FILTER_VER_CHROMA_AVX2_2x4 ps
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -2167,11 +2552,10 @@ pextrd      [r2 + r3], m2, 1
 lea         r2,        [r2 + 2 * r3]
 pextrd      [r2],      m2, 2
 pextrd      [r2 + r3], m2, 3
-
 RET
-
+%macro FILTER_VER_CHROMA_AVX2_4x4 1
 INIT_YMM avx2
-cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
+cglobal interp_4tap_vert_%1_4x4, 4, 6, 3
     mov             r4d, r4m
     shl             r4d, 6
     sub             r0, r1
@@ -2205,6 +2589,7 @@ cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
     pmaddubsw       m0, [r5]
     pmaddubsw       m1, [r5 + mmsize]
     paddw           m0, m1                                  ; m0 = WORD ROW[3 2 1 0]
+%ifidn %1,pp
     pmulhrsw        m0, [pw_512]
     vextracti128    xm1, m0, 1
     packuswb        xm0, xm1
@@ -2213,7 +2598,107 @@ cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
     pextrd          [r2 + r3], xm0, 1
     pextrd          [r2 + r3 * 2], xm0, 2
     pextrd          [r2 + r5], xm0, 3
-    RET
+%else
+    add             r3d, r3d
+    psubw           m0, [pw_2000]
+    vextracti128    xm1, m0, 1
+    lea             r5, [r3 * 3]
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm1
+    movhps          [r2 + r5], xm1
+%endif
+    RET
+%endmacro
+FILTER_VER_CHROMA_AVX2_4x4 pp
+FILTER_VER_CHROMA_AVX2_4x4 ps
+
+%macro FILTER_VER_CHROMA_AVX2_4x8 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_4x8, 4, 6, 5
+    mov             r4d, r4m
+    shl             r4d, 6
+    sub             r0, r1
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+
+    movd            xm1, [r0]
+    pinsrd          xm1, [r0 + r1], 1
+    pinsrd          xm1, [r0 + r1 * 2], 2
+    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm2, [r0]
+    pinsrd          xm2, [r0 + r1], 1
+    pinsrd          xm2, [r0 + r1 * 2], 2
+    pinsrd          xm2, [r0 + r4], 3                       ; m2 = row[7 6 5 4]
+    vinserti128     m1, m1, xm2, 1                          ; m1 = row[7 6 5 4 3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm3, [r0]
+    pinsrd          xm3, [r0 + r1], 1
+    pinsrd          xm3, [r0 + r1 * 2], 2                   ; m3 = row[x 10 9 8]
+    vinserti128     m2, m2, xm3, 1                          ; m2 = row[x 10 9 8 7 6 5 4]
+    mova            m3, [interp4_vpp_shuf1]
+    vpermd          m0, m3, m1                              ; m0 = row[4 3 3 2 2 1 1 0]
+    vpermd          m4, m3, m2                              ; m4 = row[8 7 7 6 6 5 5 4]
+    mova            m3, [interp4_vpp_shuf1 + mmsize]
+    vpermd          m1, m3, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
+    vpermd          m2, m3, m2                              ; m2 = row[10 9 9 8 8 7 7 6]
+
+    mova            m3, [interp4_vpp_shuf]
+    pshufb          m0, m0, m3
+    pshufb          m1, m1, m3
+    pshufb          m2, m2, m3
+    pshufb          m4, m4, m3
+    pmaddubsw       m0, [r5]
+    pmaddubsw       m4, [r5]
+    pmaddubsw       m1, [r5 + mmsize]
+    pmaddubsw       m2, [r5 + mmsize]
+    paddw           m0, m1                                  ; m0 = WORD ROW[3 2 1 0]
+    paddw           m4, m2                                  ; m4 = WORD ROW[7 6 5 4]
+%ifidn %1,pp
+    pmulhrsw        m0, [pw_512]
+    pmulhrsw        m4, [pw_512]
+    packuswb        m0, m4
+    vextracti128    xm1, m0, 1
+    lea             r5, [r3 * 3]
+    movd            [r2], xm0
+    pextrd          [r2 + r3], xm0, 1
+    movd            [r2 + r3 * 2], xm1
+    pextrd          [r2 + r5], xm1, 1
+    lea             r2, [r2 + r3 * 4]
+    pextrd          [r2], xm0, 2
+    pextrd          [r2 + r3], xm0, 3
+    pextrd          [r2 + r3 * 2], xm1, 2
+    pextrd          [r2 + r5], xm1, 3
+%else
+    add             r3d, r3d
+    psubw           m0, [pw_2000]
+    psubw           m4, [pw_2000]
+    vextracti128    xm1, m0, 1
+    vextracti128    xm2, m4, 1
+    lea             r5, [r3 * 3]
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm1
+    movhps          [r2 + r5], xm1
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm4
+    movhps          [r2 + r3], xm4
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r5], xm2
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_4x8 pp
+FILTER_VER_CHROMA_AVX2_4x8 ps
 
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3462,8 +3947,9 @@ FILTER_V4_W8_H8_H16_H32 8, 64
     paddw           m4, m0
 %endmacro
 
+%macro FILTER_VER_CHROMA_AVX2_8x8 1
 INIT_YMM avx2
-cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
+cglobal interp_4tap_vert_%1_8x8, 4, 6, 7
     mov             r4d, r4m
     shl             r4d, 6
 
@@ -3477,6 +3963,7 @@ cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
     lea             r4, [r1 * 3]
     sub             r0, r1
     PROCESS_CHROMA_AVX2_W8_8R
+%ifidn %1,pp
     lea             r4, [r3 * 3]
     mova            m3, [pw_512]
     pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
@@ -3496,7 +3983,285 @@ cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
     movq            [r2 + r3], xm4
     movhps          [r2 + r3 * 2], xm1
     movhps          [r2 + r4], xm4
-    RET
+%else
+    add             r3d, r3d
+    vbroadcasti128  m3, [pw_2000]
+    lea             r4, [r3 * 3]
+    psubw           m5, m3                          ; m5 = word: row 0, row 1
+    psubw           m2, m3                          ; m2 = word: row 2, row 3
+    psubw           m1, m3                          ; m1 = word: row 4, row 5
+    psubw           m4, m3                          ; m4 = word: row 6, row 7
+    vextracti128    xm6, m5, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm0, m1, 1
+    movu            [r2], xm5
+    movu            [r2 + r3], xm6
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm1
+    movu            [r2 + r3], xm0
+    movu            [r2 + r3 * 2], xm4
+    vextracti128    xm4, m4, 1
+    movu            [r2 + r4], xm4
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_8x8 pp
+FILTER_VER_CHROMA_AVX2_8x8 ps
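+
+; PROCESS_CHROMA_AVX2_W8_16R: 4-tap vertical filter over an 8-wide, 16-row block.
+; The caller loads m7 with pw_512 (pp rounding) or pw_2000 (ps bias) and sets
+; r4 = 3 * srcStride, r6 = 3 * dstStride (used by the 8x16 kernel below).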
+%macro PROCESS_CHROMA_AVX2_W8_16R 1
+    movq            xm1, [r0]                       ; m1 = row 0
+    movq            xm2, [r0 + r1]                  ; m2 = row 1
+    punpcklbw       xm1, xm2
+    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
+    punpcklbw       xm2, xm3
+    vinserti128     m5, m1, xm2, 1
+    pmaddubsw       m5, [r5]
+    movq            xm4, [r0 + r4]                  ; m4 = row 3
+    punpcklbw       xm3, xm4
+    lea             r0, [r0 + r1 * 4]
+    movq            xm1, [r0]                       ; m1 = row 4
+    punpcklbw       xm4, xm1
+    vinserti128     m2, m3, xm4, 1
+    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
+    paddw           m5, m0
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 5
+    punpcklbw       xm1, xm3
+    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
+    punpcklbw       xm3, xm4
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
+    paddw           m2, m0
+    pmaddubsw       m1, [r5]
+    movq            xm3, [r0 + r4]                  ; m3 = row 7
+    punpcklbw       xm4, xm3
+    lea             r0, [r0 + r1 * 4]
+    movq            xm0, [r0]                       ; m0 = row 8
+    punpcklbw       xm3, xm0
+    vinserti128     m4, m4, xm3, 1
+    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    pmaddubsw       m4, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 9
+    punpcklbw       xm0, xm3
+    movq            xm6, [r0 + r1 * 2]              ; m6 = row 10
+    punpcklbw       xm3, xm6
+    vinserti128     m0, m0, xm3, 1
+    pmaddubsw       m3, m0, [r5 + 1 * mmsize]
+    paddw           m4, m3
+    pmaddubsw       m0, [r5]
+%ifidn %1,pp
+    pmulhrsw        m5, m7                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m7                          ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m7                          ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m7                          ; m4 = word: row 6, row 7
+    packuswb        m5, m2
+    packuswb        m1, m4
+    vextracti128    xm2, m5, 1
+    vextracti128    xm4, m1, 1
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm5
+    movhps          [r2 + r6], xm2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm1
+    movq            [r2 + r3], xm4
+    movhps          [r2 + r3 * 2], xm1
+    movhps          [r2 + r6], xm4
+%else
+    psubw           m5, m7                          ; m5 = word: row 0, row 1
+    psubw           m2, m7                          ; m2 = word: row 2, row 3
+    psubw           m1, m7                          ; m1 = word: row 4, row 5
+    psubw           m4, m7                          ; m4 = word: row 6, row 7
+    vextracti128    xm3, m5, 1
+    movu            [r2], xm5
+    movu            [r2 + r3], xm3
+    vextracti128    xm3, m2, 1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    vextracti128    xm5, m1, 1
+    vextracti128    xm3, m4, 1
+    movu            [r2], xm1
+    movu            [r2 + r3], xm5
+    movu            [r2 + r3 * 2], xm4
+    movu            [r2 + r6], xm3
+%endif
+    movq            xm3, [r0 + r4]                  ; m3 = row 11
+    punpcklbw       xm6, xm3
+    lea             r0, [r0 + r1 * 4]
+    movq            xm5, [r0]                       ; m5 = row 12
+    punpcklbw       xm3, xm5
+    vinserti128     m6, m6, xm3, 1
+    pmaddubsw       m3, m6, [r5 + 1 * mmsize]
+    paddw           m0, m3
+    pmaddubsw       m6, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 13
+    punpcklbw       xm5, xm3
+    movq            xm2, [r0 + r1 * 2]              ; m2 = row 14
+    punpcklbw       xm3, xm2
+    vinserti128     m5, m5, xm3, 1
+    pmaddubsw       m3, m5, [r5 + 1 * mmsize]
+    paddw           m6, m3
+    pmaddubsw       m5, [r5]
+    movq            xm3, [r0 + r4]                  ; m3 = row 15
+    punpcklbw       xm2, xm3
+    lea             r0, [r0 + r1 * 4]
+    movq            xm1, [r0]                       ; m1 = row 16
+    punpcklbw       xm3, xm1
+    vinserti128     m2, m2, xm3, 1
+    pmaddubsw       m3, m2, [r5 + 1 * mmsize]
+    paddw           m5, m3
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 17
+    punpcklbw       xm1, xm3
+    movq            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpcklbw       xm3, xm4
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5 + 1 * mmsize]
+    paddw           m2, m1
+    lea             r2, [r2 + r3 * 4]
+%ifidn %1,pp
+    pmulhrsw        m0, m7                          ; m0 = word: row 8, row 9
+    pmulhrsw        m6, m7                          ; m6 = word: row 10, row 11
+    pmulhrsw        m5, m7                          ; m5 = word: row 12, row 13
+    pmulhrsw        m2, m7                          ; m2 = word: row 14, row 15
+    packuswb        m0, m6
+    packuswb        m5, m2
+    vextracti128    xm6, m0, 1
+    vextracti128    xm2, m5, 1
+    movq            [r2], xm0
+    movq            [r2 + r3], xm6
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm6
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm5
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm5
+    movhps          [r2 + r6], xm2
+%else
+    psubw           m0, m7                          ; m0 = word: row 8, row 9
+    psubw           m6, m7                          ; m6 = word: row 10, row 11
+    psubw           m5, m7                          ; m5 = word: row 12, row 13
+    psubw           m2, m7                          ; m2 = word: row 14, row 15
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m6, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    vextracti128    xm1, m5, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm5
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+%endif
+%endmacro
+
+%macro FILTER_VER_CHROMA_AVX2_8x16 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_8x16, 4, 7, 8
+    mov             r4d, r4m
+    shl             r4d, 6
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r1
+%ifidn %1,pp
+    mova            m7, [pw_512]
+%else
+    add             r3d, r3d
+    mova            m7, [pw_2000]
+%endif
+    lea             r6, [r3 * 3]
+    PROCESS_CHROMA_AVX2_W8_16R %1
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_8x16 pp
+FILTER_VER_CHROMA_AVX2_8x16 ps
+
+%macro PROCESS_CHROMA_AVX2_W8_4R 0
+    movq            xm1, [r0]                       ; m1 = row 0
+    movq            xm2, [r0 + r1]                  ; m2 = row 1
+    punpcklbw       xm1, xm2                        ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
+    punpcklbw       xm2, xm3                        ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+    vinserti128     m0, m1, xm2, 1                  ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+    pmaddubsw       m0, [r5]
+    movq            xm4, [r0 + r4]                  ; m4 = row 3
+    punpcklbw       xm3, xm4                        ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    lea             r0, [r0 + r1 * 4]
+    movq            xm1, [r0]                       ; m1 = row 4
+    punpcklbw       xm4, xm1                        ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+    vinserti128     m2, m3, xm4, 1                  ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    movq            xm3, [r0 + r1]                  ; m3 = row 5
+    punpcklbw       xm1, xm3                        ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
+    punpcklbw       xm3, xm4                        ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+    pmaddubsw       m1, [r5 + 1 * mmsize]
+    paddw           m2, m1
+%endmacro
+
+%macro FILTER_VER_CHROMA_AVX2_8x4 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_8x4, 4, 6, 5
+    mov             r4d, r4m
+    shl             r4d, 6
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r1
+    PROCESS_CHROMA_AVX2_W8_4R
+%ifidn %1,pp
+    lea             r4, [r3 * 3]
+    mova            m3, [pw_512]
+    pmulhrsw        m0, m3                          ; m0 = word: row 0, row 1
+    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
+    packuswb        m0, m2
+    vextracti128    xm2, m0, 1
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r4], xm2
+%else
+    add             r3d, r3d
+    vbroadcasti128  m3, [pw_2000]
+    lea             r4, [r3 * 3]
+    psubw           m0, m3                          ; m0 = word: row 0, row 1
+    psubw           m2, m3                          ; m2 = word: row 2, row 3
+    vextracti128    xm1, m0, 1
+    vextracti128    xm4, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm4
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_8x4 pp
+FILTER_VER_CHROMA_AVX2_8x4 ps
 
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -3779,9 +4544,10 @@ FILTER_V4_W16_H2 16, 32
 FILTER_V4_W16_H2 16, 24
 FILTER_V4_W16_H2 16, 64
 
+%macro FILTER_VER_CHROMA_AVX2_16x16 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
+cglobal interp_4tap_vert_%1_16x16, 4, 6, 15
     mov             r4d, r4m
     shl             r4d, 6
 
@@ -3796,8 +4562,13 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6,
     mova            m13, [r5 + mmsize]
     lea             r4, [r1 * 3]
     sub             r0, r1
+%ifidn %1,pp
+    mova            m14, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%endif
     lea             r5, [r3 * 3]
-    mova            m14, [pw_512]
 
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
@@ -3869,6 +4640,7 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6,
     paddw           m7, m11
     pmaddubsw       m9, m12
 
+%ifidn %1,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -3898,6 +4670,25 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6,
     movu            [r2 + r3], xm5
     movu            [r2 + r3 * 2], xm6
     movu            [r2 + r5], xm7
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r5], m3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m4
+    movu            [r2 + r3], m5
+    movu            [r2 + r3 * 2], m6
+    movu            [r2 + r5], m7
+%endif
     lea             r2, [r2 + r3 * 4]
 
     movu            xm11, [r0 + r4]                 ; m11 = row 11
@@ -3958,6 +4749,7 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6,
     pmaddubsw       m3, m13
     paddw           m1, m3
 
+%ifidn %1,pp
     pmulhrsw        m8, m14                         ; m8 = word: row 8
     pmulhrsw        m9, m14                         ; m9 = word: row 9
     pmulhrsw        m10, m14                        ; m10 = word: row 10
@@ -3987,8 +4779,186 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6,
     movu            [r2 + r3], xm7
     movu            [r2 + r3 * 2], xm0
     movu            [r2 + r5], xm1
-    RET
-%endif
+%else
+    psubw           m8, m14                         ; m8 = word: row 8
+    psubw           m9, m14                         ; m9 = word: row 9
+    psubw           m10, m14                        ; m10 = word: row 10
+    psubw           m11, m14                        ; m11 = word: row 11
+    psubw           m6, m14                         ; m6 = word: row 12
+    psubw           m7, m14                         ; m7 = word: row 13
+    psubw           m0, m14                         ; m0 = word: row 14
+    psubw           m1, m14                         ; m1 = word: row 15
+    movu            [r2], m8
+    movu            [r2 + r3], m9
+    movu            [r2 + r3 * 2], m10
+    movu            [r2 + r5], m11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m6
+    movu            [r2 + r3], m7
+    movu            [r2 + r3 * 2], m0
+    movu            [r2 + r5], m1
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_16x16 pp
+FILTER_VER_CHROMA_AVX2_16x16 ps
+%macro FILTER_VER_CHROMA_AVX2_16x8 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_16x8, 4, 7, 7
+    mov             r4d, r4m
+    shl             r4d, 6
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r1
+%ifidn %1,pp
+    mova            m6, [pw_512]
+%else
+    add             r3d, r3d
+    mova            m6, [pw_2000]
+%endif
+    lea             r6, [r3 * 3]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+%ifidn %1,pp
+    pmulhrsw        m0, m6                          ; m0 = word: row 0
+    pmulhrsw        m1, m6                          ; m1 = word: row 1
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+%else
+    psubw           m0, m6                          ; m0 = word: row 0
+    psubw           m1, m6                          ; m1 = word: row 1
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+%endif
+
+    movu            xm0, [r0 + r1]                  ; m0 = row 5
+    punpckhbw       xm1, xm4, xm0
+    punpcklbw       xm4, xm0
+    vinserti128     m4, m4, xm1, 1
+    pmaddubsw       m1, m4, [r5 + mmsize]
+    paddw           m2, m1
+    pmaddubsw       m4, [r5]
+    movu            xm1, [r0 + r1 * 2]              ; m1 = row 6
+    punpckhbw       xm5, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm5, 1
+    pmaddubsw       m5, m0, [r5 + mmsize]
+    paddw           m3, m5
+    pmaddubsw       m0, [r5]
+%ifidn %1,pp
+    pmulhrsw        m2, m6                          ; m2 = word: row 2
+    pmulhrsw        m3, m6                          ; m3 = word: row 3
+    packuswb        m2, m3
+    vpermq          m2, m2, 11011000b
+    vextracti128    xm3, m2, 1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+%else
+    psubw           m2, m6                          ; m2 = word: row 2
+    psubw           m3, m6                          ; m3 = word: row 3
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+%endif
+
+    movu            xm2, [r0 + r4]                  ; m2 = row 7
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + mmsize]
+    paddw           m4, m3
+    pmaddubsw       m1, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm3, [r0]                       ; m3 = row 8
+    punpckhbw       xm5, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm5, 1
+    pmaddubsw       m5, m2, [r5 + mmsize]
+    paddw           m0, m5
+    pmaddubsw       m2, [r5]
+    lea             r2, [r2 + r3 * 4]
+%ifidn %1,pp
+    pmulhrsw        m4, m6                          ; m4 = word: row 4
+    pmulhrsw        m0, m6                          ; m0 = word: row 5
+    packuswb        m4, m0
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm0, m4, 1
+    movu            [r2], xm4
+    movu            [r2 + r3], xm0
+%else
+    psubw           m4, m6                          ; m4 = word: row 4
+    psubw           m0, m6                          ; m0 = word: row 5
+    movu            [r2], m4
+    movu            [r2 + r3], m0
+%endif
+
+    movu            xm5, [r0 + r1]                  ; m5 = row 9
+    punpckhbw       xm4, xm3, xm5
+    punpcklbw       xm3, xm5
+    vinserti128     m3, m3, xm4, 1
+    pmaddubsw       m3, [r5 + mmsize]
+    paddw           m1, m3
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 10
+    punpckhbw       xm0, xm5, xm4
+    punpcklbw       xm5, xm4
+    vinserti128     m5, m5, xm0, 1
+    pmaddubsw       m5, [r5 + mmsize]
+    paddw           m2, m5
+%ifidn %1,pp
+    pmulhrsw        m1, m6                          ; m1 = word: row 6
+    pmulhrsw        m2, m6                          ; m2 = word: row 7
+    packuswb        m1, m2
+    vpermq          m1, m1, 11011000b
+    vextracti128    xm2, m1, 1
+    movu            [r2 + r3 * 2], xm1
+    movu            [r2 + r6], xm2
+%else
+    psubw           m1, m6                          ; m1 = word: row 6
+    psubw           m2, m6                          ; m2 = word: row 7
+    movu            [r2 + r3 * 2], m1
+    movu            [r2 + r6], m2
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_16x8 pp
+FILTER_VER_CHROMA_AVX2_16x8 ps
 
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -4211,9 +5181,10 @@ FILTER_V4_W32 32, 32
 FILTER_V4_W32 32, 48
 FILTER_V4_W32 32, 64
 
+%macro FILTER_VER_CHROMA_AVX2_32x32 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
+cglobal interp_4tap_vert_%1_32x32, 4, 7, 13
     mov             r4d, r4m
     shl             r4d, 6
 
@@ -4228,8 +5199,13 @@ cglobal interp_4tap_vert_pp_32x32, 4, 7,
     mova            m11, [r5 + mmsize]
     lea             r4, [r1 * 3]
     sub             r0, r1
+%ifidn %1,pp
+    mova            m12, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m12, [pw_2000]
+%endif
     lea             r5, [r3 * 3]
-    mova            m12, [pw_512]
     mov             r6d, 8
 .loopW:
     movu            m0, [r0]                        ; m0 = row 0
@@ -4252,11 +5228,19 @@ cglobal interp_4tap_vert_pp_32x32, 4, 7,
     pmaddubsw       m7, m10
     paddw           m2, m8
     paddw           m3, m9
+%ifidn %1,pp
     pmulhrsw        m2, m12
     pmulhrsw        m3, m12
     packuswb        m2, m3
     movu            [r2], m2
-
+%else
+    psubw           m2, m12
+    psubw           m3, m12
+    vperm2i128      m0, m2, m3, 0x20
+    vperm2i128      m2, m2, m3, 0x31
+    movu            [r2], m0
+    movu            [r2 + mmsize], m2
+%endif
     lea             r0, [r0 + r1 * 4]
     movu            m0, [r0]                        ; m0 = row 4
     punpcklbw       m2, m1, m0
@@ -4267,10 +5251,19 @@ cglobal interp_4tap_vert_pp_32x32, 4, 7,
     pmaddubsw       m3, m10
     paddw           m4, m8
     paddw           m5, m9
+%ifidn %1,pp
     pmulhrsw        m4, m12
     pmulhrsw        m5, m12
     packuswb        m4, m5
     movu            [r2 + r3], m4
+%else
+    psubw           m4, m12
+    psubw           m5, m12
+    vperm2i128      m1, m4, m5, 0x20
+    vperm2i128      m4, m4, m5, 0x31
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 + mmsize], m4
+%endif
 
     movu            m1, [r0 + r1]                   ; m1 = row 5
     punpcklbw       m4, m0, m1
@@ -4279,10 +5272,19 @@ cglobal interp_4tap_vert_pp_32x32, 4, 7,
     pmaddubsw       m5, m11
     paddw           m6, m4
     paddw           m7, m5
+%ifidn %1,pp
     pmulhrsw        m6, m12
     pmulhrsw        m7, m12
     packuswb        m6, m7
     movu            [r2 + r3 * 2], m6
+%else
+    psubw           m6, m12
+    psubw           m7, m12
+    vperm2i128      m0, m6, m7, 0x20
+    vperm2i128      m6, m6, m7, 0x31
+    movu            [r2 + r3 * 2], m0
+    movu            [r2 + r3 * 2 + mmsize], m6
+%endif
 
     movu            m0, [r0 + r1 * 2]               ; m0 = row 6
     punpcklbw       m6, m1, m0
@@ -4291,16 +5293,28 @@ cglobal interp_4tap_vert_pp_32x32, 4, 7,
     pmaddubsw       m7, m11
     paddw           m2, m6
     paddw           m3, m7
+%ifidn %1,pp
     pmulhrsw        m2, m12
     pmulhrsw        m3, m12
     packuswb        m2, m3
     movu            [r2 + r5], m2
-
+%else
+    psubw           m2, m12
+    psubw           m3, m12
+    vperm2i128      m0, m2, m3, 0x20
+    vperm2i128      m2, m2, m3, 0x31
+    movu            [r2 + r5], m0
+    movu            [r2 + r5 + mmsize], m2
+%endif
     lea             r2, [r2 + r3 * 4]
     dec             r6d
     jnz             .loopW
     RET
 %endif
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_32x32 pp
+FILTER_VER_CHROMA_AVX2_32x32 ps
 
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -4402,28 +5416,24 @@ FILTER_V4_W16n_H2 64, 32
 FILTER_V4_W16n_H2 64, 48
 FILTER_V4_W16n_H2 48, 64
 FILTER_V4_W16n_H2 64, 16
-
-
 ;-----------------------------------------------------------------------------
-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
+%macro PIXEL_WH_4xN 2
 INIT_XMM ssse3
-cglobal luma_p2s, 3, 7, 6
+cglobal pixelToShort_%1x%2, 3, 7, 6
 
     ; load width and height
-    mov         r3d, r3m
-    mov         r4d, r4m
-
+    mov         r3d, %1
+    mov         r4d, %2
     ; load constant
     mova        m4, [pb_128]
     mova        m5, [tab_c_64_n64]
-
 .loopH:
-
     xor         r5d, r5d
+
 .loopW:
-    lea         r6, [r0 + r5]
-
+    mov         r6, r0
     movh        m0, [r6]
     punpcklbw   m0, m4
     pmaddubsw   m0, m5
@@ -4463,8 +5473,263 @@ cglobal luma_p2s, 3, 7, 6
 
     sub         r4d, 4
     jnz         .loopH
-
-    RET
+    RET
+%endmacro
+PIXEL_WH_4xN 4, 4
+PIXEL_WH_4xN 4, 8
+PIXEL_WH_4xN 4, 16
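+
+; With m4 = pb_128 and m5 = tab_c_64_n64 ({64, -64} pairs), punpcklbw + pmaddubsw
+; compute (src - 128) * 64 = (src << 6) - 8192 per pixel, i.e. the 8-bit
+; pixel-to-short conversion, written to dst rows FENC_STRIDE int16_t apart.
+; A rough C sketch of one WxH block (names are illustrative):
+;     for (int y = 0; y < H; y++, src += srcStride, dst += FENC_STRIDE)
+;         for (int x = 0; x < W; x++)
+;             dst[x] = (int16_t)((src[x] << 6) - 8192);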
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_8xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; load width and height
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; load constant
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+    xor         r5d, r5d
+.loopW:
+    lea         r6, [r0 + r5]
+
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+
+    movu        [r2 + FENC_STRIDE * 0], m0
+    movu        [r2 + FENC_STRIDE * 2], m1
+    movu        [r2 + FENC_STRIDE * 4], m2
+    movu        [r2 + FENC_STRIDE * 6], m3
+
+    je          .nextH
+    jmp         .loopW
+
+
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+    RET
+%endmacro
+PIXEL_WH_8xN 8, 8
+PIXEL_WH_8xN 8, 4
+PIXEL_WH_8xN 8, 16
+PIXEL_WH_8xN 8, 32
+
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_16xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; load width and height
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; load constant
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+    xor         r5d, r5d
+.loopW:
+    lea         r6, [r0 + r5]
+
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    je          .nextH
+    jmp         .loopW
+
+
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+
+    RET
+%endmacro
+PIXEL_WH_16xN 16, 16
+PIXEL_WH_16xN 16, 8
+PIXEL_WH_16xN 16, 4
+PIXEL_WH_16xN 16, 12
+PIXEL_WH_16xN 16, 32
+PIXEL_WH_16xN 16, 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_32xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; load width and height
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; load constant
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+    xor         r5d, r5d
+.loopW:
+    lea         r6, [r0 + r5]
+
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    je          .nextH
+    jmp         .loopW
+
+
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+
+    RET
+%endmacro
+PIXEL_WH_32xN 32, 32
+PIXEL_WH_32xN 32, 8
+PIXEL_WH_32xN 32, 16
+PIXEL_WH_32xN 32, 24
+PIXEL_WH_32xN 32, 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_64xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; load width and height
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; load constant
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:
+    xor         r5d, r5d
+.loopW:
+    lea         r6, [r0 + r5]
+
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    je          .nextH
+    jmp         .loopW
+
+
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+
+    RET
+%endmacro
+PIXEL_WH_64xN 64, 64
+PIXEL_WH_64xN 64, 16
+PIXEL_WH_64xN 64, 32
+PIXEL_WH_64xN 64, 48
 
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
@@ -4789,6 +6054,121 @@ cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
     movhps          [r2 + r5], xm2
     RET
 
+%macro FILTER_VER_LUMA_AVX2_4xN 3
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+    lea             r6, [r1 * 4]
+%ifidn %3,pp
+    mova            m6, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m6, [pw_2000]
+%endif
+    lea             r8, [r3 * 3]
+    mova            m5, [interp4_vpp_shuf]
+    mova            m0, [interp4_vpp_shuf1]
+    mova            m7, [interp4_vpp_shuf1 + mmsize]
+    mov             r7d, %2 / 8
+.loop:
+    movd            xm1, [r0]
+    pinsrd          xm1, [r0 + r1], 1
+    pinsrd          xm1, [r0 + r1 * 2], 2
+    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm2, [r0]
+    pinsrd          xm2, [r0 + r1], 1
+    pinsrd          xm2, [r0 + r1 * 2], 2
+    pinsrd          xm2, [r0 + r4], 3                       ; m2 = row[7 6 5 4]
+    vinserti128     m1, m1, xm2, 1                          ; m1 = row[7 6 5 4 3 2 1 0]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm3, [r0]
+    pinsrd          xm3, [r0 + r1], 1
+    pinsrd          xm3, [r0 + r1 * 2], 2
+    pinsrd          xm3, [r0 + r4], 3                       ; m3 = row[11 10 9 8]
+    vinserti128     m2, m2, xm3, 1                          ; m2 = row[11 10 9 8 7 6 5 4]
+    lea             r0, [r0 + r1 * 4]
+    movd            xm4, [r0]
+    pinsrd          xm4, [r0 + r1], 1
+    pinsrd          xm4, [r0 + r1 * 2], 2                   ; m4 = row[x 14 13 12]
+    vinserti128     m3, m3, xm4, 1                          ; m3 = row[x 14 13 12 11 10 9 8]
+    vpermd          m8, m0, m1                              ; m8 = row[4 3 3 2 2 1 1 0]
+    vpermd          m4, m0, m2                              ; m4 = row[8 7 7 6 6 5 5 4]
+    vpermd          m1, m7, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
+    vpermd          m2, m7, m2                              ; m2 = row[10 9 9 8 8 7 7 6]
+    vpermd          m9, m0, m3                              ; m9 = row[12 11 11 10 10 9 9 8]
+    vpermd          m3, m7, m3                              ; m3 = row[14 13 13 12 12 11 11 10]
+
+    pshufb          m8, m8, m5
+    pshufb          m1, m1, m5
+    pshufb          m4, m4, m5
+    pshufb          m9, m9, m5
+    pshufb          m2, m2, m5
+    pshufb          m3, m3, m5
+    pmaddubsw       m8, [r5]
+    pmaddubsw       m1, [r5 + mmsize]
+    pmaddubsw       m9, [r5 + 2 * mmsize]
+    pmaddubsw       m3, [r5 + 3 * mmsize]
+    paddw           m8, m1
+    paddw           m9, m3
+    pmaddubsw       m1, m4, [r5 + 2 * mmsize]
+    pmaddubsw       m3, m2, [r5 + 3 * mmsize]
+    pmaddubsw       m4, [r5]
+    pmaddubsw       m2, [r5 + mmsize]
+    paddw           m3, m1
+    paddw           m2, m4
+    paddw           m8, m3                                  ; m8 = WORD ROW[3 2 1 0]
+    paddw           m9, m2                                  ; m9 = WORD ROW[7 6 5 4]
+
+%ifidn %3,pp
+    pmulhrsw        m8, m6
+    pmulhrsw        m9, m6
+    packuswb        m8, m9
+    vextracti128    xm1, m8, 1
+    movd            [r2], xm8
+    pextrd          [r2 + r3], xm8, 1
+    movd            [r2 + r3 * 2], xm1
+    pextrd          [r2 + r8], xm1, 1
+    lea             r2, [r2 + r3 * 4]
+    pextrd          [r2], xm8, 2
+    pextrd          [r2 + r3], xm8, 3
+    pextrd          [r2 + r3 * 2], xm1, 2
+    pextrd          [r2 + r8], xm1, 3
+%else
+    psubw           m8, m6
+    psubw           m9, m6
+    vextracti128    xm1, m8, 1
+    vextracti128    xm2, m9, 1
+    movq            [r2], xm8
+    movhps          [r2 + r3], xm8
+    movq            [r2 + r3 * 2], xm1
+    movhps          [r2 + r8], xm1
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm9
+    movhps          [r2 + r3], xm9
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r8], xm2
+%endif
+    lea             r2, [r2 + r3 * 4]
+    sub             r0, r6
+    dec             r7d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+
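+; In the 8-tap luma macros the coefficient loads at [r5 + k * mmsize] (k = 0..3)
+; hold the four vertical tap pairs, so each output row is the sum of four
+; pmaddubsw partial products over eight source rows starting three rows above
+; the output position (r0 is pre-biased by sub r0, r4 with r4 = 3 * r1).
+; A rough scalar sketch (coeff[] holds the eight taps; names are illustrative):
+;     int sum = 0;
+;     for (int k = 0; k < 8; k++)
+;         sum += src[(y - 3 + k) * srcStride + x] * coeff[k];
+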
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
@@ -4798,11 +6178,13 @@ FILTER_VER_LUMA_4xN 4, 4, pp
 ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_4xN 4, 8, pp
+FILTER_VER_LUMA_AVX2_4xN 4, 8, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_4xN 4, 16, pp
+FILTER_VER_LUMA_AVX2_4xN 4, 16, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -4813,11 +6195,13 @@ FILTER_VER_LUMA_4xN 4, 4, ps
 ; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_4xN 4, 8, ps
+FILTER_VER_LUMA_AVX2_4xN 4, 8, ps
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_4xN 4, 16, ps
+FILTER_VER_LUMA_AVX2_4xN 4, 16, ps
 
 %macro PROCESS_LUMA_AVX2_W8_8R 0
     movq            xm1, [r0]                       ; m1 = row 0
@@ -5004,9 +6388,9 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7,
     RET
 %endmacro
 
-%macro FILTER_VER_LUMA_AVX2_8xN 2
+%macro FILTER_VER_LUMA_AVX2_8xN 3
 INIT_YMM avx2
-cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
+cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -5019,11 +6403,17 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 7,
     lea             r4, [r1 * 3]
     sub             r0, r4
     lea             r6, [r1 * 4]
+%ifidn %3,pp
+    mova            m7, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m7, [pw_2000]
+%endif
     mov             word [rsp], %2 / 8
-    mova            m7, [pw_512]
 
 .loop:
     PROCESS_LUMA_AVX2_W8_8R
+%ifidn %3,pp
     pmulhrsw        m5, m7                          ; m5 = word: row 0, row 1
     pmulhrsw        m2, m7                          ; m2 = word: row 2, row 3
     pmulhrsw        m1, m7                          ; m1 = word: row 4, row 5
@@ -5043,6 +6433,27 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 7,
     lea             r2, [r2 + r3 * 2]
     movhps          [r2], xm1
     movhps          [r2 + r3], xm4
+%else
+    psubw           m5, m7                          ; m5 = word: row 0, row 1
+    psubw           m2, m7                          ; m2 = word: row 2, row 3
+    psubw           m1, m7                          ; m1 = word: row 4, row 5
+    psubw           m4, m7                          ; m4 = word: row 6, row 7
+    vextracti128    xm6, m5, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm0, m1, 1
+    movu            [r2], xm5
+    movu            [r2 + r3], xm6
+    lea             r2, [r2 + r3 * 2]
+    movu            [r2], xm2
+    movu            [r2 + r3], xm3
+    lea             r2, [r2 + r3 * 2]
+    movu            [r2], xm1
+    movu            [r2 + r3], xm0
+    lea             r2, [r2 + r3 * 2]
+    movu            [r2], xm4
+    vextracti128    xm4, m4, 1
+    movu            [r2 + r3], xm4
+%endif
     lea             r2, [r2 + r3 * 2]
     sub             r0, r6
     dec             word [rsp]
@@ -5050,8 +6461,9 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 7,
     RET
 %endmacro
 
+%macro FILTER_VER_LUMA_AVX2_8x8 1
 INIT_YMM avx2
-cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
+cglobal interp_8tap_vert_%1_8x8, 4, 6, 7
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -5065,8 +6477,14 @@ cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
     lea             r4, [r1 * 3]
     sub             r0, r4
     PROCESS_LUMA_AVX2_W8_8R
+%ifidn %1,pp
+    mova            m3, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m3, [pw_2000]
+%endif
     lea             r4, [r3 * 3]
-    mova            m3, [pw_512]
+%ifidn %1,pp
     pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
     pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
     pmulhrsw        m1, m3                          ; m1 = word: row 4, row 5
@@ -5084,10 +6502,31 @@ cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
     movq            [r2 + r3], xm4
     movhps          [r2 + r3 * 2], xm1
     movhps          [r2 + r4], xm4
-    RET
-
+%else
+    psubw           m5, m3                          ; m5 = word: row 0, row 1
+    psubw           m2, m3                          ; m2 = word: row 2, row 3
+    psubw           m1, m3                          ; m1 = word: row 4, row 5
+    psubw           m4, m3                          ; m4 = word: row 6, row 7
+    vextracti128    xm6, m5, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm0, m1, 1
+    movu            [r2], xm5
+    movu            [r2 + r3], xm6
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm1
+    movu            [r2 + r3], xm0
+    movu            [r2 + r3 * 2], xm4
+    vextracti128    xm4, m4, 1
+    movu            [r2 + r4], xm4
+%endif
+    RET
+%endmacro
+
+%macro FILTER_VER_LUMA_AVX2_8x4 1
 INIT_YMM avx2
-cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
+cglobal interp_8tap_vert_%1_8x4, 4, 6, 7
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -5101,8 +6540,14 @@ cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
     lea             r4, [r1 * 3]
     sub             r0, r4
     PROCESS_LUMA_AVX2_W8_4R
+%ifidn %1,pp
+    mova            m3, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m3, [pw_2000]
+%endif
     lea             r4, [r3 * 3]
-    mova            m3, [pw_512]
+%ifidn %1,pp
     pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
     pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
     packuswb        m5, m2
@@ -5111,49 +6556,66 @@ cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
     movq            [r2 + r3], xm2
     movhps          [r2 + r3 * 2], xm5
     movhps          [r2 + r4], xm2
-    RET
+%else
+    psubw           m5, m3                          ; m5 = word: row 0, row 1
+    psubw           m2, m3                          ; m2 = word: row 2, row 3
+    movu            [r2], xm5
+    vextracti128    xm5, m5, 1
+    movu            [r2 + r3], xm5
+    movu            [r2 + r3 * 2], xm2
+    vextracti128    xm2, m2, 1
+    movu            [r2 + r4], xm2
+%endif
+    RET
+%endmacro
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 4, pp
+FILTER_VER_LUMA_AVX2_8x4 pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 8, pp
+FILTER_VER_LUMA_AVX2_8x8 pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 16, pp
-FILTER_VER_LUMA_AVX2_8xN 8, 16
+FILTER_VER_LUMA_AVX2_8xN 8, 16, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 32, pp
-FILTER_VER_LUMA_AVX2_8xN 8, 32
+FILTER_VER_LUMA_AVX2_8xN 8, 32, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 4, ps
+FILTER_VER_LUMA_AVX2_8x4 ps
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 8, ps
+FILTER_VER_LUMA_AVX2_8x8 ps
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 16, ps
+FILTER_VER_LUMA_AVX2_8xN 8, 16, ps
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 32, ps
+FILTER_VER_LUMA_AVX2_8xN 8, 32, ps
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -5269,9 +6731,10 @@ FILTER_VER_LUMA_12xN 12, 16, pp
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_12xN 12, 16, ps
 
+%macro FILTER_VER_LUMA_AVX2_12x16 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
+cglobal interp_8tap_vert_%1_12x16, 4, 7, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -5284,315 +6747,13 @@ cglobal interp_8tap_vert_pp_12x16, 4, 7,
 
     lea             r4, [r1 * 3]
     sub             r0, r4
-    lea             r6, [r3 * 3]
+%ifidn %1,pp
     mova            m14, [pw_512]
-
-    movu            xm0, [r0]                       ; m0 = row 0
-    movu            xm1, [r0 + r1]                  ; m1 = row 1
-    punpckhbw       xm2, xm0, xm1
-    punpcklbw       xm0, xm1
-    vinserti128     m0, m0, xm2, 1
-    pmaddubsw       m0, [r5]
-    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
-    punpckhbw       xm3, xm1, xm2
-    punpcklbw       xm1, xm2
-    vinserti128     m1, m1, xm3, 1
-    pmaddubsw       m1, [r5]
-    movu            xm3, [r0 + r4]                  ; m3 = row 3
-    punpckhbw       xm4, xm2, xm3
-    punpcklbw       xm2, xm3
-    vinserti128     m2, m2, xm4, 1
-    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
-    paddw           m0, m4
-    pmaddubsw       m2, [r5]
-    lea             r0, [r0 + r1 * 4]
-    movu            xm4, [r0]                       ; m4 = row 4
-    punpckhbw       xm5, xm3, xm4
-    punpcklbw       xm3, xm4
-    vinserti128     m3, m3, xm5, 1
-    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
-    paddw           m1, m5
-    pmaddubsw       m3, [r5]
-    movu            xm5, [r0 + r1]                  ; m5 = row 5
-    punpckhbw       xm6, xm4, xm5
-    punpcklbw       xm4, xm5
-    vinserti128     m4, m4, xm6, 1
-    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
-    paddw           m0, m6
-    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
-    paddw           m2, m6
-    pmaddubsw       m4, [r5]
-    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
-    punpckhbw       xm7, xm5, xm6
-    punpcklbw       xm5, xm6
-    vinserti128     m5, m5, xm7, 1
-    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
-    paddw           m1, m7
-    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
-    paddw           m3, m7
-    pmaddubsw       m5, [r5]
-    movu            xm7, [r0 + r4]                  ; m7 = row 7
-    punpckhbw       xm8, xm6, xm7
-    punpcklbw       xm6, xm7
-    vinserti128     m6, m6, xm8, 1
-    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
-    paddw           m0, m8
-    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
-    paddw           m2, m8
-    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
-    paddw           m4, m8
-    pmaddubsw       m6, [r5]
-    lea             r0, [r0 + r1 * 4]
-    movu            xm8, [r0]                       ; m8 = row 8
-    punpckhbw       xm9, xm7, xm8
-    punpcklbw       xm7, xm8
-    vinserti128     m7, m7, xm9, 1
-    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
-    paddw           m1, m9
-    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
-    paddw           m3, m9
-    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
-    paddw           m5, m9
-    pmaddubsw       m7, [r5]
-    movu            xm9, [r0 + r1]                  ; m9 = row 9
-    punpckhbw       xm10, xm8, xm9
-    punpcklbw       xm8, xm9
-    vinserti128     m8, m8, xm10, 1
-    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
-    paddw           m2, m10
-    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
-    paddw           m4, m10
-    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
-    paddw           m6, m10
-    pmaddubsw       m8, [r5]
-    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
-    punpckhbw       xm11, xm9, xm10
-    punpcklbw       xm9, xm10
-    vinserti128     m9, m9, xm11, 1
-    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
-    paddw           m3, m11
-    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
-    paddw           m5, m11
-    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
-    paddw           m7, m11
-    pmaddubsw       m9, [r5]
-    movu            xm11, [r0 + r4]                 ; m11 = row 11
-    punpckhbw       xm12, xm10, xm11
-    punpcklbw       xm10, xm11
-    vinserti128     m10, m10, xm12, 1
-    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
-    paddw           m4, m12
-    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
-    paddw           m6, m12
-    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
-    paddw           m8, m12
-    pmaddubsw       m10, [r5]
-    lea             r0, [r0 + r1 * 4]
-    movu            xm12, [r0]                      ; m12 = row 12
-    punpckhbw       xm13, xm11, xm12
-    punpcklbw       xm11, xm12
-    vinserti128     m11, m11, xm13, 1
-    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
-    paddw           m5, m13
-    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
-    paddw           m7, m13
-    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
-    paddw           m9, m13
-    pmaddubsw       m11, [r5]
-
-    pmulhrsw        m0, m14                         ; m0 = word: row 0
-    pmulhrsw        m1, m14                         ; m1 = word: row 1
-    pmulhrsw        m2, m14                         ; m2 = word: row 2
-    pmulhrsw        m3, m14                         ; m3 = word: row 3
-    pmulhrsw        m4, m14                         ; m4 = word: row 4
-    pmulhrsw        m5, m14                         ; m5 = word: row 5
-    packuswb        m0, m1
-    packuswb        m2, m3
-    packuswb        m4, m5
-    vpermq          m0, m0, 11011000b
-    vpermq          m2, m2, 11011000b
-    vpermq          m4, m4, 11011000b
-    vextracti128    xm1, m0, 1
-    vextracti128    xm3, m2, 1
-    vextracti128    xm5, m4, 1
-    movq            [r2], xm0
-    pextrd          [r2 + 8], xm0, 2
-    movq            [r2 + r3], xm1
-    pextrd          [r2 + r3 + 8], xm1, 2
-    movq            [r2 + r3 * 2], xm2
-    pextrd          [r2 + r3 * 2 + 8], xm2, 2
-    movq            [r2 + r6], xm3
-    pextrd          [r2 + r6 + 8], xm3, 2
-    lea             r2, [r2 + r3 * 4]
-    movq            [r2], xm4
-    pextrd          [r2 + 8], xm4, 2
-    movq            [r2 + r3], xm5
-    pextrd          [r2 + r3 + 8], xm5, 2
-
-    movu            xm13, [r0 + r1]                 ; m13 = row 13
-    punpckhbw       xm0, xm12, xm13
-    punpcklbw       xm12, xm13
-    vinserti128     m12, m12, xm0, 1
-    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
-    paddw           m6, m0
-    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
-    paddw           m8, m0
-    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
-    paddw           m10, m0
-    pmaddubsw       m12, [r5]
-    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
-    punpckhbw       xm1, xm13, xm0
-    punpcklbw       xm13, xm0
-    vinserti128     m13, m13, xm1, 1
-    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
-    paddw           m7, m1
-    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
-    paddw           m9, m1
-    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
-    paddw           m11, m1
-    pmaddubsw       m13, [r5]
-
-    pmulhrsw        m6, m14                         ; m6 = word: row 6
-    pmulhrsw        m7, m14                         ; m7 = word: row 7
-    packuswb        m6, m7
-    vpermq          m6, m6, 11011000b
-    vextracti128    xm7, m6, 1
-    movq            [r2 + r3 * 2], xm6
-    pextrd          [r2 + r3 * 2 + 8], xm6, 2
-    movq            [r2 + r6], xm7
-    pextrd          [r2 + r6 + 8], xm7, 2
-    lea             r2, [r2 + r3 * 4]
-
-    movu            xm1, [r0 + r4]                  ; m1 = row 15
-    punpckhbw       xm2, xm0, xm1
-    punpcklbw       xm0, xm1
-    vinserti128     m0, m0, xm2, 1
-    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
-    paddw           m8, m2
-    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
-    paddw           m10, m2
-    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
-    paddw           m12, m2
-    pmaddubsw       m0, [r5]
-    lea             r0, [r0 + r1 * 4]
-    movu            xm2, [r0]                       ; m2 = row 16
-    punpckhbw       xm3, xm1, xm2
-    punpcklbw       xm1, xm2
-    vinserti128     m1, m1, xm3, 1
-    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
-    paddw           m9, m3
-    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
-    paddw           m11, m3
-    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
-    paddw           m13, m3
-    pmaddubsw       m1, [r5]
-    movu            xm3, [r0 + r1]                  ; m3 = row 17
-    punpckhbw       xm4, xm2, xm3
-    punpcklbw       xm2, xm3
-    vinserti128     m2, m2, xm4, 1
-    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
-    paddw           m10, m4
-    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
-    paddw           m12, m4
-    pmaddubsw       m2, [r5 + 1 * mmsize]
-    paddw           m0, m2
-    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
-    punpckhbw       xm5, xm3, xm4
-    punpcklbw       xm3, xm4
-    vinserti128     m3, m3, xm5, 1
-    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
-    paddw           m11, m5
-    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
-    paddw           m13, m5
-    pmaddubsw       m3, [r5 + 1 * mmsize]
-    paddw           m1, m3
-    movu            xm5, [r0 + r4]                  ; m5 = row 19
-    punpckhbw       xm6, xm4, xm5
-    punpcklbw       xm4, xm5
-    vinserti128     m4, m4, xm6, 1
-    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
-    paddw           m12, m6
-    pmaddubsw       m4, [r5 + 2 * mmsize]
-    paddw           m0, m4
-    lea             r0, [r0 + r1 * 4]
-    movu            xm6, [r0]                       ; m6 = row 20
-    punpckhbw       xm7, xm5, xm6
-    punpcklbw       xm5, xm6
-    vinserti128     m5, m5, xm7, 1
-    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
-    paddw           m13, m7
-    pmaddubsw       m5, [r5 + 2 * mmsize]
-    paddw           m1, m5
-    movu            xm7, [r0 + r1]                  ; m7 = row 21
-    punpckhbw       xm2, xm6, xm7
-    punpcklbw       xm6, xm7
-    vinserti128     m6, m6, xm2, 1
-    pmaddubsw       m6, [r5 + 3 * mmsize]
-    paddw           m0, m6
-    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
-    punpckhbw       xm3, xm7, xm2
-    punpcklbw       xm7, xm2
-    vinserti128     m7, m7, xm3, 1
-    pmaddubsw       m7, [r5 + 3 * mmsize]
-    paddw           m1, m7
-
-    pmulhrsw        m8, m14                         ; m8 = word: row 8
-    pmulhrsw        m9, m14                         ; m9 = word: row 9
-    pmulhrsw        m10, m14                        ; m10 = word: row 10
-    pmulhrsw        m11, m14                        ; m11 = word: row 11
-    pmulhrsw        m12, m14                        ; m12 = word: row 12
-    pmulhrsw        m13, m14                        ; m13 = word: row 13
-    pmulhrsw        m0, m14                         ; m0 = word: row 14
-    pmulhrsw        m1, m14                         ; m1 = word: row 15
-    packuswb        m8, m9
-    packuswb        m10, m11
-    packuswb        m12, m13
-    packuswb        m0, m1
-    vpermq          m8, m8, 11011000b
-    vpermq          m10, m10, 11011000b
-    vpermq          m12, m12, 11011000b
-    vpermq          m0, m0, 11011000b
-    vextracti128    xm9, m8, 1
-    vextracti128    xm11, m10, 1
-    vextracti128    xm13, m12, 1
-    vextracti128    xm1, m0, 1
-    movq            [r2], xm8
-    pextrd          [r2 + 8], xm8, 2
-    movq            [r2 + r3], xm9
-    pextrd          [r2 + r3 + 8], xm9, 2
-    movq            [r2 + r3 * 2], xm10
-    pextrd          [r2 + r3 * 2 + 8], xm10, 2
-    movq            [r2 + r6], xm11
-    pextrd          [r2 + r6 + 8], xm11, 2
-    lea             r2, [r2 + r3 * 4]
-    movq            [r2], xm12
-    pextrd          [r2 + 8], xm12, 2
-    movq            [r2 + r3], xm13
-    pextrd          [r2 + r3 + 8], xm13, 2
-    movq            [r2 + r3 * 2], xm0
-    pextrd          [r2 + r3 * 2 + 8], xm0, 2
-    movq            [r2 + r6], xm1
-    pextrd          [r2 + r6 + 8], xm1, 2
-    RET
-%endif
-
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
-    mov             r4d, r4m
-    shl             r4d, 7
-
-%ifdef PIC
-    lea             r5, [tab_LumaCoeffVer_32]
-    add             r5, r4
-%else
-    lea             r5, [tab_LumaCoeffVer_32 + r4]
-%endif
-
-    lea             r4, [r1 * 3]
-    sub             r0, r4
+%else
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%endif
     lea             r6, [r3 * 3]
-    mova            m14, [pw_512]
 
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
@@ -5707,6 +6868,7 @@ cglobal interp_8tap_vert_pp_16x16, 4, 7,
     paddw           m9, m13
     pmaddubsw       m11, [r5]
 
+%ifidn %1,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -5722,13 +6884,46 @@ cglobal interp_8tap_vert_pp_16x16, 4, 7,
     vextracti128    xm1, m0, 1
     vextracti128    xm3, m2, 1
     vextracti128    xm5, m4, 1
+    movq            [r2], xm0
+    pextrd          [r2 + 8], xm0, 2
+    movq            [r2 + r3], xm1
+    pextrd          [r2 + r3 + 8], xm1, 2
+    movq            [r2 + r3 * 2], xm2
+    pextrd          [r2 + r3 * 2 + 8], xm2, 2
+    movq            [r2 + r6], xm3
+    pextrd          [r2 + r6 + 8], xm3, 2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm4
+    pextrd          [r2 + 8], xm4, 2
+    movq            [r2 + r3], xm5
+    pextrd          [r2 + r3 + 8], xm5, 2
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
     movu            [r2], xm0
+    vextracti128    xm0, m0, 1
+    movq            [r2 + 16], xm0
     movu            [r2 + r3], xm1
+    vextracti128    xm1, m1, 1
+    movq            [r2 + r3 + 16], xm1
     movu            [r2 + r3 * 2], xm2
+    vextracti128    xm2, m2, 1
+    movq            [r2 + r3 * 2 + 16], xm2
     movu            [r2 + r6], xm3
+    vextracti128    xm3, m3, 1
+    movq            [r2 + r6 + 16], xm3
     lea             r2, [r2 + r3 * 4]
     movu            [r2], xm4
+    vextracti128    xm4, m4, 1
+    movq            [r2 + 16], xm4
     movu            [r2 + r3], xm5
+    vextracti128    xm5, m5, 1
+    movq            [r2 + r3 + 16], xm5
+%endif
 
     movu            xm13, [r0 + r1]                 ; m13 = row 13
     punpckhbw       xm0, xm12, xm13
@@ -5753,13 +6948,26 @@ cglobal interp_8tap_vert_pp_16x16, 4, 7,
     paddw           m11, m1
     pmaddubsw       m13, [r5]
 
+%ifidn %1,pp
     pmulhrsw        m6, m14                         ; m6 = word: row 6
     pmulhrsw        m7, m14                         ; m7 = word: row 7
     packuswb        m6, m7
     vpermq          m6, m6, 11011000b
     vextracti128    xm7, m6, 1
+    movq            [r2 + r3 * 2], xm6
+    pextrd          [r2 + r3 * 2 + 8], xm6, 2
+    movq            [r2 + r6], xm7
+    pextrd          [r2 + r6 + 8], xm7, 2
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
     movu            [r2 + r3 * 2], xm6
+    vextracti128    xm6, m6, 1
+    movq            [r2 + r3 * 2 + 16], xm6
     movu            [r2 + r6], xm7
+    vextracti128    xm7, m7, 1
+    movq            [r2 + r6 + 16], xm7
+%endif
     lea             r2, [r2 + r3 * 4]
 
     movu            xm1, [r0 + r4]                  ; m1 = row 15
@@ -5835,6 +7043,7 @@ cglobal interp_8tap_vert_pp_16x16, 4, 7,
     pmaddubsw       m7, [r5 + 3 * mmsize]
     paddw           m1, m7
 
+%ifidn %1,pp
     pmulhrsw        m8, m14                         ; m8 = word: row 8
     pmulhrsw        m9, m14                         ; m9 = word: row 9
     pmulhrsw        m10, m14                        ; m10 = word: row 10
@@ -5855,21 +7064,69 @@ cglobal interp_8tap_vert_pp_16x16, 4, 7,
     vextracti128    xm11, m10, 1
     vextracti128    xm13, m12, 1
     vextracti128    xm1, m0, 1
+    movq            [r2], xm8
+    pextrd          [r2 + 8], xm8, 2
+    movq            [r2 + r3], xm9
+    pextrd          [r2 + r3 + 8], xm9, 2
+    movq            [r2 + r3 * 2], xm10
+    pextrd          [r2 + r3 * 2 + 8], xm10, 2
+    movq            [r2 + r6], xm11
+    pextrd          [r2 + r6 + 8], xm11, 2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm12
+    pextrd          [r2 + 8], xm12, 2
+    movq            [r2 + r3], xm13
+    pextrd          [r2 + r3 + 8], xm13, 2
+    movq            [r2 + r3 * 2], xm0
+    pextrd          [r2 + r3 * 2 + 8], xm0, 2
+    movq            [r2 + r6], xm1
+    pextrd          [r2 + r6 + 8], xm1, 2
+%else
+    psubw           m8, m14                         ; m8 = word: row 8
+    psubw           m9, m14                         ; m9 = word: row 9
+    psubw           m10, m14                        ; m10 = word: row 10
+    psubw           m11, m14                        ; m11 = word: row 11
+    psubw           m12, m14                        ; m12 = word: row 12
+    psubw           m13, m14                        ; m13 = word: row 13
+    psubw           m0, m14                         ; m0 = word: row 14
+    psubw           m1, m14                         ; m1 = word: row 15
     movu            [r2], xm8
+    vextracti128    xm8, m8, 1
+    movq            [r2 + 16], xm8
     movu            [r2 + r3], xm9
+    vextracti128    xm9, m9, 1
+    movq            [r2 + r3 + 16], xm9
     movu            [r2 + r3 * 2], xm10
+    vextracti128    xm10, m10, 1
+    movq            [r2 + r3 * 2 + 16], xm10
     movu            [r2 + r6], xm11
+    vextracti128    xm11, m11, 1
+    movq            [r2 + r6 + 16], xm11
     lea             r2, [r2 + r3 * 4]
     movu            [r2], xm12
+    vextracti128    xm12, m12, 1
+    movq            [r2 + 16], xm12
     movu            [r2 + r3], xm13
+    vextracti128    xm13, m13, 1
+    movq            [r2 + r3 + 16], xm13
     movu            [r2 + r3 * 2], xm0
+    vextracti128    xm0, m0, 1
+    movq            [r2 + r3 * 2 + 16], xm0
     movu            [r2 + r6], xm1
-    RET
-%endif
-
+    vextracti128    xm1, m1, 1
+    movq            [r2 + r6 + 16], xm1
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_12x16 pp
+FILTER_VER_LUMA_AVX2_12x16 ps
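+
+; For the 12-wide block the pp path stores 8 + 4 pixels per row (movq plus
+; pextrd ..., 2), while the ps path stores 8 + 4 int16_t per row (a full xmm
+; store plus a movq of the extracted upper lane), i.e. 24 bytes per row at the
+; doubled destination stride.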
+
+%macro FILTER_VER_LUMA_AVX2_16x16 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
+cglobal interp_8tap_vert_%1_16x16, 4, 7, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -5882,8 +7139,13 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
 
     lea             r4, [r1 * 3]
     sub             r0, r4
+%ifidn %1,pp
+    mova            m14, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%endif
     lea             r6, [r3 * 3]
-    mova            m14, [pw_512]
 
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
@@ -5998,6 +7260,7 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
     paddw           m9, m13
     pmaddubsw       m11, [r5]
 
+%ifidn %1,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -6020,6 +7283,365 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
     lea             r2, [r2 + r3 * 4]
     movu            [r2], xm4
     movu            [r2 + r3], xm5
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m4
+    movu            [r2 + r3], m5
+%endif
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhbw       xm0, xm12, xm13
+    punpcklbw       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
+    paddw           m6, m0
+    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
+    paddw           m8, m0
+    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
+    paddw           m10, m0
+    pmaddubsw       m12, [r5]
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhbw       xm1, xm13, xm0
+    punpcklbw       xm13, xm0
+    vinserti128     m13, m13, xm1, 1
+    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
+    paddw           m7, m1
+    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
+    paddw           m9, m1
+    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
+    paddw           m11, m1
+    pmaddubsw       m13, [r5]
+
+%ifidn %1,pp
+    pmulhrsw        m6, m14                         ; m6 = word: row 6
+    pmulhrsw        m7, m14                         ; m7 = word: row 7
+    packuswb        m6, m7
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm7, m6, 1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r2 + r3 * 2], m6
+    movu            [r2 + r6], m7
+%endif
+    lea             r2, [r2 + r3 * 4]
+
+    movu            xm1, [r0 + r4]                  ; m1 = row 15
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
+    paddw           m8, m2
+    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
+    paddw           m10, m2
+    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
+    paddw           m12, m2
+    pmaddubsw       m0, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
+    paddw           m9, m3
+    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
+    paddw           m11, m3
+    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
+    paddw           m13, m3
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
+    paddw           m10, m4
+    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
+    paddw           m12, m4
+    pmaddubsw       m2, [r5 + 1 * mmsize]
+    paddw           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
+    paddw           m11, m5
+    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
+    paddw           m13, m5
+    pmaddubsw       m3, [r5 + 1 * mmsize]
+    paddw           m1, m3
+    movu            xm5, [r0 + r4]                  ; m5 = row 19
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
+    paddw           m12, m6
+    pmaddubsw       m4, [r5 + 2 * mmsize]
+    paddw           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm6, [r0]                       ; m6 = row 20
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
+    paddw           m13, m7
+    pmaddubsw       m5, [r5 + 2 * mmsize]
+    paddw           m1, m5
+    movu            xm7, [r0 + r1]                  ; m7 = row 21
+    punpckhbw       xm2, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm2, 1
+    pmaddubsw       m6, [r5 + 3 * mmsize]
+    paddw           m0, m6
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
+    punpckhbw       xm3, xm7, xm2
+    punpcklbw       xm7, xm2
+    vinserti128     m7, m7, xm3, 1
+    pmaddubsw       m7, [r5 + 3 * mmsize]
+    paddw           m1, m7
+
+%ifidn %1,pp
+    pmulhrsw        m8, m14                         ; m8 = word: row 8
+    pmulhrsw        m9, m14                         ; m9 = word: row 9
+    pmulhrsw        m10, m14                        ; m10 = word: row 10
+    pmulhrsw        m11, m14                        ; m11 = word: row 11
+    pmulhrsw        m12, m14                        ; m12 = word: row 12
+    pmulhrsw        m13, m14                        ; m13 = word: row 13
+    pmulhrsw        m0, m14                         ; m0 = word: row 14
+    pmulhrsw        m1, m14                         ; m1 = word: row 15
+    packuswb        m8, m9
+    packuswb        m10, m11
+    packuswb        m12, m13
+    packuswb        m0, m1
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm1, m0, 1
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r6], xm11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm12
+    movu            [r2 + r3], xm13
+    movu            [r2 + r3 * 2], xm0
+    movu            [r2 + r6], xm1
+%else
+    psubw           m8, m14                         ; m8 = word: row 8
+    psubw           m9, m14                         ; m9 = word: row 9
+    psubw           m10, m14                        ; m10 = word: row 10
+    psubw           m11, m14                        ; m11 = word: row 11
+    psubw           m12, m14                        ; m12 = word: row 12
+    psubw           m13, m14                        ; m13 = word: row 13
+    psubw           m0, m14                         ; m0 = word: row 14
+    psubw           m1, m14                         ; m1 = word: row 15
+    movu            [r2], m8
+    movu            [r2 + r3], m9
+    movu            [r2 + r3 * 2], m10
+    movu            [r2 + r6], m11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m12
+    movu            [r2 + r3], m13
+    movu            [r2 + r3 * 2], m0
+    movu            [r2 + r6], m1
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16x16 pp
+FILTER_VER_LUMA_AVX2_16x16 ps
+
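+; 8-tap vertical luma filter, 16x12 block (AVX2). %1 = pp rounds via pmulhrsw with pw_512 and
+; packs to pixels; %1 = ps doubles the dst stride, subtracts pw_2000 and stores 16-bit intermediates.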
+%macro FILTER_VER_LUMA_AVX2_16x12 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_16x12, 4, 7, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,pp
+    mova            m14, [pw_512]
+%else
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%endif
+    lea             r6, [r3 * 3]
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhbw       xm2, xm0, xm1
+    punpcklbw       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddubsw       m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhbw       xm3, xm1, xm2
+    punpcklbw       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddubsw       m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhbw       xm4, xm2, xm3
+    punpcklbw       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
+    paddw           m0, m4
+    pmaddubsw       m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhbw       xm5, xm3, xm4
+    punpcklbw       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
+    paddw           m1, m5
+    pmaddubsw       m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhbw       xm6, xm4, xm5
+    punpcklbw       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
+    paddw           m0, m6
+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
+    paddw           m2, m6
+    pmaddubsw       m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhbw       xm7, xm5, xm6
+    punpcklbw       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
+    paddw           m1, m7
+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
+    paddw           m3, m7
+    pmaddubsw       m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhbw       xm8, xm6, xm7
+    punpcklbw       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
+    paddw           m0, m8
+    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
+    paddw           m2, m8
+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
+    paddw           m4, m8
+    pmaddubsw       m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhbw       xm9, xm7, xm8
+    punpcklbw       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
+    paddw           m1, m9
+    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
+    paddw           m3, m9
+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
+    paddw           m5, m9
+    pmaddubsw       m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhbw       xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
+    paddw           m2, m10
+    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
+    paddw           m4, m10
+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
+    paddw           m6, m10
+    pmaddubsw       m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhbw       xm11, xm9, xm10
+    punpcklbw       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
+    paddw           m3, m11
+    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
+    paddw           m5, m11
+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
+    paddw           m7, m11
+    pmaddubsw       m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhbw       xm12, xm10, xm11
+    punpcklbw       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
+    paddw           m4, m12
+    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
+    paddw           m6, m12
+    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
+    paddw           m8, m12
+    pmaddubsw       m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhbw       xm13, xm11, xm12
+    punpcklbw       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
+    paddw           m5, m13
+    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
+    paddw           m7, m13
+    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
+    paddw           m9, m13
+    pmaddubsw       m11, [r5]
+
+%ifidn %1,pp
+    pmulhrsw        m0, m14                         ; m0 = word: row 0
+    pmulhrsw        m1, m14                         ; m1 = word: row 1
+    pmulhrsw        m2, m14                         ; m2 = word: row 2
+    pmulhrsw        m3, m14                         ; m3 = word: row 3
+    pmulhrsw        m4, m14                         ; m4 = word: row 4
+    pmulhrsw        m5, m14                         ; m5 = word: row 5
+    packuswb        m0, m1
+    packuswb        m2, m3
+    packuswb        m4, m5
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    vextracti128    xm5, m4, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m4
+    movu            [r2 + r3], m5
+%endif
 
     movu            xm13, [r0 + r1]                 ; m13 = row 13
     punpckhbw       xm0, xm12, xm13
@@ -6042,6 +7664,7 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
     pmaddubsw       m1, m13, [r5 + 1 * mmsize]
     paddw           m11, m1
 
+%ifidn %1,pp
     pmulhrsw        m6, m14                         ; m6 = word: row 6
     pmulhrsw        m7, m14                         ; m7 = word: row 7
     packuswb        m6, m7
@@ -6049,6 +7672,12 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
     vextracti128    xm7, m6, 1
     movu            [r2 + r3 * 2], xm6
     movu            [r2 + r6], xm7
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r2 + r3 * 2], m6
+    movu            [r2 + r6], m7
+%endif
     lea             r2, [r2 + r3 * 4]
 
     movu            xm1, [r0 + r4]                  ; m1 = row 15
@@ -6081,6 +7710,7 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
     pmaddubsw       m5, m3, [r5 + 3 * mmsize]
     paddw           m11, m5
 
+%ifidn %1,pp
     pmulhrsw        m8, m14                         ; m8 = word: row 8
     pmulhrsw        m9, m14                         ; m9 = word: row 9
     pmulhrsw        m10, m14                        ; m10 = word: row 10
@@ -6095,27 +7725,43 @@ cglobal interp_8tap_vert_pp_16x12, 4, 7,
     movu            [r2 + r3], xm9
     movu            [r2 + r3 * 2], xm10
     movu            [r2 + r6], xm11
-    RET
-%endif
-
+%else
+    psubw           m8, m14                         ; m8 = word: row 8
+    psubw           m9, m14                         ; m9 = word: row 9
+    psubw           m10, m14                        ; m10 = word: row 10
+    psubw           m11, m14                        ; m11 = word: row 11
+    movu            [r2], m8
+    movu            [r2 + r3], m9
+    movu            [r2 + r3 * 2], m10
+    movu            [r2 + r6], m11
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16x12 pp
+FILTER_VER_LUMA_AVX2_16x12 ps
+
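+; 8-tap vertical luma filter, 16x8 block (AVX2); %1 selects the same pp/ps rounding and store paths.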
+%macro FILTER_VER_LUMA_AVX2_16x8 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
+cglobal interp_8tap_vert_%1_16x8, 4, 6, 15
     mov             r4d, r4m
     shl             r4d, 7
-
 %ifdef PIC
     lea             r5, [tab_LumaCoeffVer_32]
     add             r5, r4
 %else
     lea             r5, [tab_LumaCoeffVer_32 + r4]
 %endif
-
     lea             r4, [r1 * 3]
     sub             r0, r4
-    lea             r6, [r3 * 3]
+%ifidn %1,pp
     mova            m14, [pw_512]
-
+%else
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%endif
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
     punpckhbw       xm2, xm0, xm1
@@ -6220,7 +7866,8 @@ cglobal interp_8tap_vert_pp_16x8, 4, 7, 
     paddw           m5, m13
     pmaddubsw       m13, m11, [r5 + 2 * mmsize]
     paddw           m7, m13
-
+    lea             r4, [r3 * 3]
+%ifidn %1,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -6239,11 +7886,25 @@ cglobal interp_8tap_vert_pp_16x8, 4, 7, 
     movu            [r2], xm0
     movu            [r2 + r3], xm1
     movu            [r2 + r3 * 2], xm2
-    movu            [r2 + r6], xm3
+    movu            [r2 + r4], xm3
     lea             r2, [r2 + r3 * 4]
     movu            [r2], xm4
     movu            [r2 + r3], xm5
-
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r4], m3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m4
+    movu            [r2 + r3], m5
+%endif
     movu            xm13, [r0 + r1]                 ; m13 = row 13
     punpckhbw       xm0, xm12, xm13
     punpcklbw       xm12, xm13
@@ -6256,35 +7917,47 @@ cglobal interp_8tap_vert_pp_16x8, 4, 7, 
     vinserti128     m13, m13, xm1, 1
     pmaddubsw       m1, m13, [r5 + 3 * mmsize]
     paddw           m7, m1
-
+%ifidn %1,pp
     pmulhrsw        m6, m14                         ; m6 = word: row 6
     pmulhrsw        m7, m14                         ; m7 = word: row 7
     packuswb        m6, m7
     vpermq          m6, m6, 11011000b
     vextracti128    xm7, m6, 1
     movu            [r2 + r3 * 2], xm6
-    movu            [r2 + r6], xm7
-    RET
-%endif
-
+    movu            [r2 + r4], xm7
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r2 + r3 * 2], m6
+    movu            [r2 + r4], m7
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16x8 pp
+FILTER_VER_LUMA_AVX2_16x8 ps
+
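+; 8-tap vertical luma filter, 16x4 block (AVX2); %1 = pp or ps.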
+%macro FILTER_VER_LUMA_AVX2_16x4 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
+cglobal interp_8tap_vert_%1_16x4, 4, 6, 13
     mov             r4d, r4m
     shl             r4d, 7
-
 %ifdef PIC
     lea             r5, [tab_LumaCoeffVer_32]
     add             r5, r4
 %else
     lea             r5, [tab_LumaCoeffVer_32 + r4]
 %endif
-
     lea             r4, [r1 * 3]
     sub             r0, r4
-    lea             r6, [r3 * 3]
+%ifidn %1,pp
     mova            m12, [pw_512]
-
+%else
+    add             r3d, r3d
+    vbroadcasti128  m12, [pw_2000]
+%endif
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
     punpckhbw       xm2, xm0, xm1
@@ -6356,7 +8029,7 @@ cglobal interp_8tap_vert_pp_16x4, 4, 7, 
     vinserti128     m9, m9, xm11, 1
     pmaddubsw       m11, m9, [r5 + 3 * mmsize]
     paddw           m3, m11
-
+%ifidn %1,pp
     pmulhrsw        m0, m12                         ; m0 = word: row 0
     pmulhrsw        m1, m12                         ; m1 = word: row 1
     pmulhrsw        m2, m12                         ; m2 = word: row 2
@@ -6370,14 +8043,29 @@ cglobal interp_8tap_vert_pp_16x4, 4, 7, 
     movu            [r2], xm0
     movu            [r2 + r3], xm1
     movu            [r2 + r3 * 2], xm2
-    movu            [r2 + r6], xm3
-    RET
-%endif
-
-%macro FILTER_VER_LUMA_AVX2_16xN 2
+    lea             r4, [r3 * 3]
+    movu            [r2 + r4], xm3
+%else
+    psubw           m0, m12                         ; m0 = word: row 0
+    psubw           m1, m12                         ; m1 = word: row 1
+    psubw           m2, m12                         ; m2 = word: row 2
+    psubw           m3, m12                         ; m3 = word: row 3
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    lea             r4, [r3 * 3]
+    movu            [r2 + r4], m3
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_16x4 pp
+FILTER_VER_LUMA_AVX2_16x4 ps
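+
+; 8-tap vertical luma filter, 16xN (N = 32, 64): %2/16 passes of 16 rows each; %3 = pp or ps.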
+%macro FILTER_VER_LUMA_AVX2_16xN 3
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
+cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -6390,9 +8078,14 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
 
     lea             r4, [r1 * 3]
     sub             r0, r4
+%ifidn %3,ps
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%else
+    mova            m14, [pw_512]
+%endif
     lea             r6, [r3 * 3]
     lea             r7, [r1 * 4]
-    mova            m14, [pw_512]
     mov             r8d, %2 / 16
 
 .loop:
@@ -6509,6 +8202,7 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
     paddw           m9, m13
     pmaddubsw       m11, [r5]
 
+%ifidn %3,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -6531,6 +8225,21 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
     lea             r2, [r2 + r3 * 4]
     movu            [r2], xm4
     movu            [r2 + r3], xm5
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m4
+    movu            [r2 + r3], m5
+%endif
 
     movu            xm13, [r0 + r1]                 ; m13 = row 13
     punpckhbw       xm0, xm12, xm13
@@ -6555,6 +8264,7 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
     paddw           m11, m1
     pmaddubsw       m13, [r5]
 
+%ifidn %3,pp
     pmulhrsw        m6, m14                         ; m6 = word: row 6
     pmulhrsw        m7, m14                         ; m7 = word: row 7
     packuswb        m6, m7
@@ -6562,6 +8272,13 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
     vextracti128    xm7, m6, 1
     movu            [r2 + r3 * 2], xm6
     movu            [r2 + r6], xm7
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r2 + r3 * 2], m6
+    movu            [r2 + r6], m7
+%endif
+
     lea             r2, [r2 + r3 * 4]
 
     movu            xm1, [r0 + r4]                  ; m1 = row 15
@@ -6637,6 +8354,7 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
     pmaddubsw       m7, [r5 + 3 * mmsize]
     paddw           m1, m7
 
+%ifidn %3,pp
     pmulhrsw        m8, m14                         ; m8 = word: row 8
     pmulhrsw        m9, m14                         ; m9 = word: row 9
     pmulhrsw        m10, m14                        ; m10 = word: row 10
@@ -6666,6 +8384,26 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
     movu            [r2 + r3], xm13
     movu            [r2 + r3 * 2], xm0
     movu            [r2 + r6], xm1
+%else
+    psubw           m8, m14                         ; m8 = word: row 8
+    psubw           m9, m14                         ; m9 = word: row 9
+    psubw           m10, m14                        ; m10 = word: row 10
+    psubw           m11, m14                        ; m11 = word: row 11
+    psubw           m12, m14                        ; m12 = word: row 12
+    psubw           m13, m14                        ; m13 = word: row 13
+    psubw           m0, m14                         ; m0 = word: row 14
+    psubw           m1, m14                         ; m1 = word: row 15
+    movu            [r2], m8
+    movu            [r2 + r3], m9
+    movu            [r2 + r3 * 2], m10
+    movu            [r2 + r6], m11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], m12
+    movu            [r2 + r3], m13
+    movu            [r2 + r3 * 2], m0
+    movu            [r2 + r6], m1
+%endif
+
     lea             r2, [r2 + r3 * 4]
     sub             r0, r7
     dec             r8d
@@ -6674,10 +8412,12 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 9,
 %endif
 %endmacro
 
-FILTER_VER_LUMA_AVX2_16xN 16, 32
-FILTER_VER_LUMA_AVX2_16xN 16, 64
-
-%macro PROCESS_LUMA_AVX2_W16_16R 0
+FILTER_VER_LUMA_AVX2_16xN 16, 32, pp
+FILTER_VER_LUMA_AVX2_16xN 16, 64, pp
+FILTER_VER_LUMA_AVX2_16xN 16, 32, ps
+FILTER_VER_LUMA_AVX2_16xN 16, 64, ps
+
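+; Helper: filter one 16-wide, 16-row tile; %1 = pp (round and pack to pixels) or ps (subtract offset, store words).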
+%macro PROCESS_LUMA_AVX2_W16_16R 1
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
     punpckhbw       xm2, xm0, xm1
@@ -6791,6 +8531,7 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     paddw           m9, m13
     pmaddubsw       m11, [r5]
 
+%ifidn %1,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -6813,6 +8554,21 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     lea             r8, [r2 + r3 * 4]
     movu            [r8], xm4
     movu            [r8 + r3], xm5
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+    lea             r8, [r2 + r3 * 4]
+    movu            [r8], m4
+    movu            [r8 + r3], m5
+%endif
 
     movu            xm13, [r7 + r1]                 ; m13 = row 13
     punpckhbw       xm0, xm12, xm13
@@ -6837,6 +8593,7 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     paddw           m11, m1
     pmaddubsw       m13, [r5]
 
+%ifidn %1,pp
     pmulhrsw        m6, m14                         ; m6 = word: row 6
     pmulhrsw        m7, m14                         ; m7 = word: row 7
     packuswb        m6, m7
@@ -6844,6 +8601,13 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     vextracti128    xm7, m6, 1
     movu            [r8 + r3 * 2], xm6
     movu            [r8 + r6], xm7
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r8 + r3 * 2], m6
+    movu            [r8 + r6], m7
+%endif
+
     lea             r8, [r8 + r3 * 4]
 
     movu            xm1, [r7 + r4]                  ; m1 = row 15
@@ -6919,6 +8683,7 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     pmaddubsw       m7, [r5 + 3 * mmsize]
     paddw           m1, m7
 
+%ifidn %1,pp
     pmulhrsw        m8, m14                         ; m8 = word: row 8
     pmulhrsw        m9, m14                         ; m9 = word: row 9
     pmulhrsw        m10, m14                        ; m10 = word: row 10
@@ -6948,9 +8713,28 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     movu            [r8 + r3], xm13
     movu            [r8 + r3 * 2], xm0
     movu            [r8 + r6], xm1
-%endmacro
-
-%macro PROCESS_LUMA_AVX2_W16_8R 0
+%else
+    psubw           m8, m14                         ; m8 = word: row 8
+    psubw           m9, m14                         ; m9 = word: row 9
+    psubw           m10, m14                        ; m10 = word: row 10
+    psubw           m11, m14                        ; m11 = word: row 11
+    psubw           m12, m14                        ; m12 = word: row 12
+    psubw           m13, m14                        ; m13 = word: row 13
+    psubw           m0, m14                         ; m0 = word: row 14
+    psubw           m1, m14                         ; m1 = word: row 15
+    movu            [r8], m8
+    movu            [r8 + r3], m9
+    movu            [r8 + r3 * 2], m10
+    movu            [r8 + r6], m11
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], m12
+    movu            [r8 + r3], m13
+    movu            [r8 + r3 * 2], m0
+    movu            [r8 + r6], m1
+%endif
+%endmacro
+
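+; Helper: filter one 16-wide, 8-row tile; %1 = pp or ps, same store paths as the 16-row helper.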
+%macro PROCESS_LUMA_AVX2_W16_8R 1
     movu            xm0, [r0]                       ; m0 = row 0
     movu            xm1, [r0 + r1]                  ; m1 = row 1
     punpckhbw       xm2, xm0, xm1
@@ -7056,6 +8840,7 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     pmaddubsw       m13, m11, [r5 + 2 * mmsize]
     paddw           m7, m13
 
+%ifidn %1,pp
     pmulhrsw        m0, m14                         ; m0 = word: row 0
     pmulhrsw        m1, m14                         ; m1 = word: row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2
@@ -7078,6 +8863,21 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     lea             r8, [r2 + r3 * 4]
     movu            [r8], xm4
     movu            [r8 + r3], xm5
+%else
+    psubw           m0, m14                         ; m0 = word: row 0
+    psubw           m1, m14                         ; m1 = word: row 1
+    psubw           m2, m14                         ; m2 = word: row 2
+    psubw           m3, m14                         ; m3 = word: row 3
+    psubw           m4, m14                         ; m4 = word: row 4
+    psubw           m5, m14                         ; m5 = word: row 5
+    movu            [r2], m0
+    movu            [r2 + r3], m1
+    movu            [r2 + r3 * 2], m2
+    movu            [r2 + r6], m3
+    lea             r8, [r2 + r3 * 4]
+    movu            [r8], m4
+    movu            [r8 + r3], m5
+%endif
 
     movu            xm13, [r7 + r1]                 ; m13 = row 13
     punpckhbw       xm0, xm12, xm13
@@ -7092,6 +8892,7 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     pmaddubsw       m1, m13, [r5 + 3 * mmsize]
     paddw           m7, m1
 
+%ifidn %1,pp
     pmulhrsw        m6, m14                         ; m6 = word: row 6
     pmulhrsw        m7, m14                         ; m7 = word: row 7
     packuswb        m6, m7
@@ -7099,32 +8900,45 @@ FILTER_VER_LUMA_AVX2_16xN 16, 64
     vextracti128    xm7, m6, 1
     movu            [r8 + r3 * 2], xm6
     movu            [r8 + r6], xm7
-%endmacro
-
+%else
+    psubw           m6, m14                         ; m6 = word: row 6
+    psubw           m7, m14                         ; m7 = word: row 7
+    movu            [r8 + r3 * 2], m6
+    movu            [r8 + r6], m7
+%endif
+%endmacro
+
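+; 8-tap vertical luma filter, 24x32 block: one 16-wide tile per pass plus a trailing 8-wide column; %1 = pp or ps.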
+%macro FILTER_VER_LUMA_AVX2_24x32 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
+cglobal interp_8tap_vert_%1_24x32, 4, 11, 15
     mov             r4d, r4m
     shl             r4d, 7
-
 %ifdef PIC
     lea             r5, [tab_LumaCoeffVer_32]
     add             r5, r4
 %else
     lea             r5, [tab_LumaCoeffVer_32 + r4]
 %endif
-
     lea             r4, [r1 * 3]
     sub             r0, r4
+%ifidn %1,ps
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%else
+    mova            m14, [pw_512]
+%endif
     lea             r6, [r3 * 3]
     lea             r10, [r1 * 4]
-    mova            m14, [pw_512]
     mov             r9d, 2
 .loopH:
-    PROCESS_LUMA_AVX2_W16_16R
+    PROCESS_LUMA_AVX2_W16_16R %1
+%ifidn %1,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
-
     movq            xm1, [r0]                       ; m1 = row 0
     movq            xm2, [r0 + r1]                  ; m2 = row 1
     punpcklbw       xm1, xm2
@@ -7243,7 +9057,7 @@ cglobal interp_8tap_vert_pp_24x32, 4, 11
     vinserti128     m9, m9, xm3, 1
     pmaddubsw       m3, m9, [r5 + 3 * mmsize]
     paddw           m8, m3
-
+%ifidn %1,pp
     pmulhrsw        m5, m14                         ; m5 = word: row 0, row 1
     pmulhrsw        m2, m14                         ; m2 = word: row 2, row 3
     pmulhrsw        m1, m14                         ; m1 = word: row 4, row 5
@@ -7279,85 +9093,116 @@ cglobal interp_8tap_vert_pp_24x32, 4, 11
     movq            [r8 + r3], xm8
     movhps          [r8 + r3 * 2], xm7
     movhps          [r8 + r6], xm8
-
+%else
+    psubw           m5, m14                         ; m5 = word: row 0, row 1
+    psubw           m2, m14                         ; m2 = word: row 2, row 3
+    psubw           m1, m14                         ; m1 = word: row 4, row 5
+    psubw           m4, m14                         ; m4 = word: row 6, row 7
+    psubw           m0, m14                         ; m0 = word: row 8, row 9
+    psubw           m6, m14                         ; m6 = word: row 10, row 11
+    psubw           m7, m14                         ; m7 = word: row 12, row 13
+    psubw           m8, m14                         ; m8 = word: row 14, row 15
+    vextracti128    xm3, m5, 1
+    movu            [r2], xm5
+    movu            [r2 + r3], xm3
+    vextracti128    xm3, m2, 1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+    vextracti128    xm3, m1, 1
+    lea             r8, [r2 + r3 * 4]
+    movu            [r8], xm1
+    movu            [r8 + r3], xm3
+    vextracti128    xm3, m4, 1
+    movu            [r8 + r3 * 2], xm4
+    movu            [r8 + r6], xm3
+    vextracti128    xm3, m0, 1
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], xm0
+    movu            [r8 + r3], xm3
+    vextracti128    xm3, m6, 1
+    movu            [r8 + r3 * 2], xm6
+    movu            [r8 + r6], xm3
+    vextracti128    xm3, m7, 1
+    lea             r8, [r8 + r3 * 4]
+    movu            [r8], xm7
+    movu            [r8 + r3], xm3
+    vextracti128    xm3, m8, 1
+    movu            [r8 + r3 * 2], xm8
+    movu            [r8 + r6], xm3
+%endif
     sub             r7, r10
     lea             r0, [r7 - 16]
+%ifidn %1,pp
     lea             r2, [r8 + r3 * 4 - 16]
+%else
+    lea             r2, [r8 + r3 * 4 - 32]
+%endif
     dec             r9d
     jnz             .loopH
     RET
 %endif
-
-%macro FILTER_VER_LUMA_AVX2_32xN 2
+%endmacro
+
+FILTER_VER_LUMA_AVX2_24x32 pp
+FILTER_VER_LUMA_AVX2_24x32 ps
+
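+; 8-tap vertical luma filter, 32xN (N = 32, 64): two 16-wide tiles per 16-row pass; %3 = pp or ps.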
+%macro FILTER_VER_LUMA_AVX2_32xN 3
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
     mov             r4d, r4m
     shl             r4d, 7
-
 %ifdef PIC
     lea             r5, [tab_LumaCoeffVer_32]
     add             r5, r4
 %else
     lea             r5, [tab_LumaCoeffVer_32 + r4]
 %endif
-
     lea             r4, [r1 * 3]
     sub             r0, r4
+%ifidn %3,ps
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%else
+    mova            m14, [pw_512]
+%endif
     lea             r6, [r3 * 3]
     lea             r11, [r1 * 4]
-    mova            m14, [pw_512]
     mov             r9d, %2 / 16
 .loopH:
     mov             r10d, %1 / 16
 .loopW:
-    PROCESS_LUMA_AVX2_W16_16R
+    PROCESS_LUMA_AVX2_W16_16R %3
+%ifidn %3,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
     dec             r10d
     jnz             .loopW
     sub             r7, r11
     lea             r0, [r7 - 16]
+%ifidn %3,pp
     lea             r2, [r8 + r3 * 4 - 16]
+%else
+    lea             r2, [r8 + r3 * 4 - 32]
+%endif
     dec             r9d
     jnz             .loopH
     RET
 %endif
 %endmacro
 
-FILTER_VER_LUMA_AVX2_32xN 32, 32
-FILTER_VER_LUMA_AVX2_32xN 32, 64
-
+FILTER_VER_LUMA_AVX2_32xN 32, 32, pp
+FILTER_VER_LUMA_AVX2_32xN 32, 64, pp
+FILTER_VER_LUMA_AVX2_32xN 32, 32, ps
+FILTER_VER_LUMA_AVX2_32xN 32, 64, ps
+
+%macro FILTER_VER_LUMA_AVX2_32x16 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
-    mov             r4d, r4m
-    shl             r4d, 7
-
-%ifdef PIC
-    lea             r5, [tab_LumaCoeffVer_32]
-    add             r5, r4
-%else
-    lea             r5, [tab_LumaCoeffVer_32 + r4]
-%endif
-
-    lea             r4, [r1 * 3]
-    sub             r0, r4
-    lea             r6, [r3 * 3]
-    mova            m14, [pw_512]
-    mov             r9d, 2
-.loopW:
-    PROCESS_LUMA_AVX2_W16_16R
-    add             r2, 16
-    add             r0, 16
-    dec             r9d
-    jnz             .loopW
-    RET
-%endif
-
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
+cglobal interp_8tap_vert_%1_32x16, 4, 10, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -7370,59 +9215,95 @@ cglobal interp_8tap_vert_pp_32x24, 4, 10
 
     lea             r4, [r1 * 3]
     sub             r0, r4
+%ifidn %1,ps
+    add             r3d, r3d
+    vbroadcasti128  m14, [pw_2000]
+%else
+    mova            m14, [pw_512]
+%endif
     lea             r6, [r3 * 3]
-    mova            m14, [pw_512]
     mov             r9d, 2
 .loopW:
-    PROCESS_LUMA_AVX2_W16_16R
+    PROCESS_LUMA_AVX2_W16_16R %1
+%ifidn %1,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
+    add             r0, 16
+    dec             r9d
+    jnz             .loopW
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32x16 pp
+FILTER_VER_LUMA_AVX2_32x16 ps
+
+%macro FILTER_VER_LUMA_AVX2_32x24 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_32x24, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,ps
+    add             r3d, r3d
+%endif
+    lea             r6, [r3 * 3]
+%ifidn %1,pp
+    mova            m14, [pw_512]
+%else
+    vbroadcasti128  m14, [pw_2000]
+%endif
+    mov             r9d, 2
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R %1
+%ifidn %1,pp
+    add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
     dec             r9d
     jnz             .loopW
     lea             r9, [r1 * 4]
     sub             r7, r9
     lea             r0, [r7 - 16]
+%ifidn %1,pp
     lea             r2, [r8 + r3 * 4 - 16]
+%else
+    lea             r2, [r8 + r3 * 4 - 32]
+%endif
     mov             r9d, 2
 .loop:
-    PROCESS_LUMA_AVX2_W16_8R
+    PROCESS_LUMA_AVX2_W16_8R %1
+%ifidn %1,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
     dec             r9d
     jnz             .loop
     RET
 %endif
-
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32x24 pp
+FILTER_VER_LUMA_AVX2_32x24 ps
+
+%macro FILTER_VER_LUMA_AVX2_32x8 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
-    mov             r4d, r4m
-    shl             r4d, 7
-
-%ifdef PIC
-    lea             r5, [tab_LumaCoeffVer_32]
-    add             r5, r4
-%else
-    lea             r5, [tab_LumaCoeffVer_32 + r4]
-%endif
-
-    lea             r4, [r1 * 3]
-    sub             r0, r4
-    lea             r6, [r3 * 3]
-    mova            m14, [pw_512]
-    mov             r9d, 2
-.loopW:
-    PROCESS_LUMA_AVX2_W16_8R
-    add             r2, 16
-    add             r0, 16
-    dec             r9d
-    jnz             .loopW
-    RET
-%endif
-
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
+cglobal interp_8tap_vert_%1_32x8, 4, 10, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -7435,30 +9316,37 @@ cglobal interp_8tap_vert_pp_48x64, 4, 12
 
     lea             r4, [r1 * 3]
     sub             r0, r4
+%ifidn %1,ps
+    add             r3d, r3d
+%endif
     lea             r6, [r3 * 3]
-    lea             r11, [r1 * 4]
+%ifidn %1,pp
     mova            m14, [pw_512]
-    mov             r9d, 4
-.loopH:
-    mov             r10d, 3
+%else
+    vbroadcasti128  m14, [pw_2000]
+%endif
+    mov             r9d, 2
 .loopW:
-    PROCESS_LUMA_AVX2_W16_16R
+    PROCESS_LUMA_AVX2_W16_8R %1
+%ifidn %1,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
-    dec             r10d
+    dec             r9d
     jnz             .loopW
-    sub             r7, r11
-    lea             r0, [r7 - 32]
-    lea             r2, [r8 + r3 * 4 - 32]
-    dec             r9d
-    jnz             .loopH
-    RET
-%endif
-
-%macro FILTER_VER_LUMA_AVX2_64xN 2
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_32x8 pp
+FILTER_VER_LUMA_AVX2_32x8 ps
+
+%macro FILTER_VER_LUMA_AVX2_48x64 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
+cglobal interp_8tap_vert_%1_48x64, 4, 12, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -7471,34 +9359,53 @@ cglobal interp_8tap_vert_pp_%1x%2, 4, 12
 
     lea             r4, [r1 * 3]
     sub             r0, r4
+
+%ifidn %1,ps
+    add             r3d, r3d
+%endif
+
     lea             r6, [r3 * 3]
     lea             r11, [r1 * 4]
+
+%ifidn %1,pp
     mova            m14, [pw_512]
-    mov             r9d, %2 / 16
+%else
+    vbroadcasti128  m14, [pw_2000]
+%endif
+
+    mov             r9d, 4
 .loopH:
-    mov             r10d, %1 / 16
+    mov             r10d, 3
 .loopW:
-    PROCESS_LUMA_AVX2_W16_16R
+    PROCESS_LUMA_AVX2_W16_16R %1
+%ifidn %1,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
     dec             r10d
     jnz             .loopW
     sub             r7, r11
-    lea             r0, [r7 - 48]
-    lea             r2, [r8 + r3 * 4 - 48]
+    lea             r0, [r7 - 32]
+%ifidn %1,pp
+    lea             r2, [r8 + r3 * 4 - 32]
+%else
+    lea             r2, [r8 + r3 * 4 - 64]
+%endif
     dec             r9d
     jnz             .loopH
     RET
 %endif
 %endmacro
 
-FILTER_VER_LUMA_AVX2_64xN 64, 32
-FILTER_VER_LUMA_AVX2_64xN 64, 48
-FILTER_VER_LUMA_AVX2_64xN 64, 64
-
+FILTER_VER_LUMA_AVX2_48x64 pp
+FILTER_VER_LUMA_AVX2_48x64 ps
+
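+; 8-tap vertical luma filter, 64xN (N = 32, 48, 64): four 16-wide tiles per 16-row pass; %3 = pp or ps.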
+%macro FILTER_VER_LUMA_AVX2_64xN 3
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
+cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
     mov             r4d, r4m
     shl             r4d, 7
 
@@ -7511,17 +9418,99 @@ cglobal interp_8tap_vert_pp_64x16, 4, 10
 
     lea             r4, [r1 * 3]
     sub             r0, r4
+
+%ifidn %3,ps
+    add             r3d, r3d
+%endif
+
     lea             r6, [r3 * 3]
+    lea             r11, [r1 * 4]
+
+%ifidn %3,pp
     mova            m14, [pw_512]
+%else
+    vbroadcasti128  m14, [pw_2000]
+%endif
+
+    mov             r9d, %2 / 16
+.loopH:
+    mov             r10d, %1 / 16
+.loopW:
+    PROCESS_LUMA_AVX2_W16_16R %3
+%ifidn %3,pp
+    add             r2, 16
+%else
+    add             r2, 32
+%endif
+    add             r0, 16
+    dec             r10d
+    jnz             .loopW
+    sub             r7, r11
+    lea             r0, [r7 - 48]
+%ifidn %3,pp
+    lea             r2, [r8 + r3 * 4 - 48]
+%else
+    lea             r2, [r8 + r3 * 4 - 96]
+%endif
+    dec             r9d
+    jnz             .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_64xN 64, 32, pp
+FILTER_VER_LUMA_AVX2_64xN 64, 48, pp
+FILTER_VER_LUMA_AVX2_64xN 64, 64, pp
+FILTER_VER_LUMA_AVX2_64xN 64, 32, ps
+FILTER_VER_LUMA_AVX2_64xN 64, 48, ps
+FILTER_VER_LUMA_AVX2_64xN 64, 64, ps
+
+%macro FILTER_VER_LUMA_AVX2_64x16 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_64x16, 4, 10, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]
+    add             r5, r4
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,ps
+    add             r3d, r3d
+%endif
+
+    lea             r6, [r3 * 3]
+
+%ifidn %1,pp
+    mova            m14, [pw_512]
+%else
+    vbroadcasti128  m14, [pw_2000]
+%endif
+
     mov             r9d, 4
 .loopW:
-    PROCESS_LUMA_AVX2_W16_16R
+    PROCESS_LUMA_AVX2_W16_16R %1
+%ifidn %1,pp
     add             r2, 16
+%else
+    add             r2, 32
+%endif
     add             r0, 16
     dec             r9d
     jnz             .loopW
     RET
 %endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_64x16 pp
+FILTER_VER_LUMA_AVX2_64x16 ps
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -9446,3 +11435,2422 @@ cglobal interp_8tap_vert_ss_%1x%2, 5, 7,
     FILTER_VER_LUMA_SS 48, 64
     FILTER_VER_LUMA_SS 64, 16
     FILTER_VER_LUMA_SS 16, 64
+
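+; 8-tap vertical luma filter on 16-bit input, 4x4 block (AVX2): the src stride is doubled for word samples.
+; %1 = sp adds pd_526336 and shifts right by 12 before packing to pixels; %1 = ss shifts right by 6 and stores words.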
+%macro FILTER_VER_LUMA_AVX2_4x4 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
+    mov             r4d, r4m
+    add             r1d, r1d
+    shl             r4d, 7
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,sp
+    mova            m6, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+
+    movq            xm0, [r0]
+    movq            xm1, [r0 + r1]
+    punpcklwd       xm0, xm1
+    movq            xm2, [r0 + r1 * 2]
+    punpcklwd       xm1, xm2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
+    pmaddwd         m0, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm2, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + r1 * 2]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
+    pmaddwd         m4, [r5 + 1 * mmsize]
+    paddd           m0, m5
+    paddd           m2, m4
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm1, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
+    pmaddwd         m1, [r5 + 2 * mmsize]
+    paddd           m0, m5
+    paddd           m2, m1
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + 2 * r1]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [A 9 9 8]
+    pmaddwd         m4, [r5 + 3 * mmsize]
+    paddd           m2, m4
+
+%ifidn %1,sp
+    paddd           m0, m6
+    paddd           m2, m6
+    psrad           m0, 12
+    psrad           m2, 12
+%else
+    psrad           m0, 6
+    psrad           m2, 6
+%endif
+    packssdw        m0, m2
+    vextracti128    xm2, m0, 1
+    lea             r4, [r3 * 3]
+
+%ifidn %1,sp
+    packuswb        xm0, xm2
+    movd            [r2], xm0
+    pextrd          [r2 + r3], xm0, 2
+    pextrd          [r2 + r3 * 2], xm0, 1
+    pextrd          [r2 + r4], xm0, 3
+%else
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r4], xm2
+%endif
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_4x4 sp
+FILTER_VER_LUMA_AVX2_4x4 ss
+
+%macro FILTER_VER_LUMA_AVX2_4x8 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,sp
+    mova            m7, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+    lea             r6, [r3 * 3]
+
+    movq            xm0, [r0]
+    movq            xm1, [r0 + r1]
+    punpcklwd       xm0, xm1
+    movq            xm2, [r0 + r1 * 2]
+    punpcklwd       xm1, xm2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
+    pmaddwd         m0, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm2, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + r1 * 2]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m4, [r5 + 1 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m4, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm1, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm6, [r0]
+    punpcklwd       xm3, xm6
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m1, [r5 + 2 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m5, m1, [r5 + 1 * mmsize]
+    paddd           m4, m5
+    pmaddwd         m1, [r5]
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm6, xm3
+    movq            xm5, [r0 + 2 * r1]
+    punpcklwd       xm3, xm5
+    vinserti128     m6, m6, xm3, 1                  ; m6 = [A 9 9 8]
+    pmaddwd         m3, m6, [r5 + 3 * mmsize]
+    paddd           m2, m3
+    pmaddwd         m3, m6, [r5 + 2 * mmsize]
+    paddd           m4, m3
+    pmaddwd         m6, [r5 + 1 * mmsize]
+    paddd           m1, m6
+
+%ifidn %1,sp
+    paddd           m0, m7
+    paddd           m2, m7
+    psrad           m0, 12
+    psrad           m2, 12
+%else
+    psrad           m0, 6
+    psrad           m2, 6
+%endif
+    packssdw        m0, m2
+
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm5, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm2, [r0]
+    punpcklwd       xm3, xm2
+    vinserti128     m5, m5, xm3, 1                  ; m5 = [C B B A]
+    pmaddwd         m3, m5, [r5 + 3 * mmsize]
+    paddd           m4, m3
+    pmaddwd         m5, [r5 + 2 * mmsize]
+    paddd           m1, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm2, xm3
+    movq            xm5, [r0 + 2 * r1]
+    punpcklwd       xm3, xm5
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [E D D C]
+    pmaddwd         m2, [r5 + 3 * mmsize]
+    paddd           m1, m2
+
+%ifidn %1,sp
+    paddd           m4, m7
+    paddd           m1, m7
+    psrad           m4, 12
+    psrad           m1, 12
+%else
+    psrad           m4, 6
+    psrad           m1, 6
+%endif
+    packssdw        m4, m1
+
+%ifidn %1,sp
+    packuswb        m0, m4
+    vextracti128    xm2, m0, 1
+    movd            [r2], xm0
+    movd            [r2 + r3], xm2
+    pextrd          [r2 + r3 * 2], xm0, 1
+    pextrd          [r2 + r6], xm2, 1
+    lea             r2, [r2 + r3 * 4]
+    pextrd          [r2], xm0, 2
+    pextrd          [r2 + r3], xm2, 2
+    pextrd          [r2 + r3 * 2], xm0, 3
+    pextrd          [r2 + r6], xm2, 3
+%else
+    vextracti128    xm2, m0, 1
+    vextracti128    xm1, m4, 1
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm2
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm4
+    movq            [r2 + r3], xm1
+    movhps          [r2 + r3 * 2], xm4
+    movhps          [r2 + r6], xm1
+%endif
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_4x8 sp
+FILTER_VER_LUMA_AVX2_4x8 ss
+
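+; Helper: filter one 4-wide, 16-row tile of 16-bit input; %1 = sp (round, shift 12, pack to pixels) or ss (shift 6, store words).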
+%macro PROCESS_LUMA_AVX2_W4_16R 1
+    movq            xm0, [r0]
+    movq            xm1, [r0 + r1]
+    punpcklwd       xm0, xm1
+    movq            xm2, [r0 + r1 * 2]
+    punpcklwd       xm1, xm2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
+    pmaddwd         m0, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm2, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm4, [r0]
+    punpcklwd       xm3, xm4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m5
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm4, xm3
+    movq            xm1, [r0 + r1 * 2]
+    punpcklwd       xm3, xm1
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m4, [r5 + 1 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m4, [r5]
+    movq            xm3, [r0 + r4]
+    punpcklwd       xm1, xm3
+    lea             r0, [r0 + 4 * r1]
+    movq            xm6, [r0]
+    punpcklwd       xm3, xm6
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m5, m1, [r5 + 2 * mmsize]
+    paddd           m2, m5
+    pmaddwd         m5, m1, [r5 + 1 * mmsize]
+    paddd           m4, m5
+    pmaddwd         m1, [r5]
+    movq            xm3, [r0 + r1]
+    punpcklwd       xm6, xm3
+    movq            xm5, [r0 + 2 * r1]
+    punpcklwd       xm3, xm5
+    vinserti128     m6, m6, xm3, 1                  ; m6 = [10 9 9 8]
+    pmaddwd         m3, m6, [r5 + 3 * mmsize]
+    paddd           m2, m3
+    pmaddwd         m3, m6, [r5 + 2 * mmsize]
+    paddd           m4, m3
+    pmaddwd         m3, m6, [r5 + 1 * mmsize]
+    paddd           m1, m3
+    pmaddwd         m6, [r5]
+
+%ifidn %1,sp
+    paddd           m0, m7
+    paddd           m2, m7
+    psrad           m0, 12
+    psrad           m2, 12
+%else
+    psrad           m0, 6
+    psrad           m2, 6
+%endif
+    packssdw        m0, m2
+    vextracti128    xm2, m0, 1
+%ifidn %1,sp
+    packuswb        xm0, xm2
+    movd            [r2], xm0
+    pextrd          [r2 + r3], xm0, 2
+    pextrd          [r2 + r3 * 2], xm0, 1
+    pextrd          [r2 + r6], xm0, 3
+%else
+    movq            [r2], xm0
+    movq            [r2 + r3], xm2
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm2
+%endif
+
+    movq            xm2, [r0 + r4]
+    punpcklwd       xm5, xm2
+    lea             r0, [r0 + 4 * r1]
+    movq            xm0, [r0]
+    punpcklwd       xm2, xm0
+    vinserti128     m5, m5, xm2, 1                  ; m5 = [12 11 11 10]
+    pmaddwd         m2, m5, [r5 + 3 * mmsize]
+    paddd           m4, m2
+    pmaddwd         m2, m5, [r5 + 2 * mmsize]
+    paddd           m1, m2
+    pmaddwd         m2, m5, [r5 + 1 * mmsize]
+    paddd           m6, m2
+    pmaddwd         m5, [r5]
+    movq            xm2, [r0 + r1]
+    punpcklwd       xm0, xm2
+    movq            xm3, [r0 + 2 * r1]
+    punpcklwd       xm2, xm3
+    vinserti128     m0, m0, xm2, 1                  ; m0 = [14 13 13 12]
+    pmaddwd         m2, m0, [r5 + 3 * mmsize]
+    paddd           m1, m2
+    pmaddwd         m2, m0, [r5 + 2 * mmsize]
+    paddd           m6, m2
+    pmaddwd         m2, m0, [r5 + 1 * mmsize]
+    paddd           m5, m2
+    pmaddwd         m0, [r5]
+
+%ifidn %1,sp
+    paddd           m4, m7
+    paddd           m1, m7
+    psrad           m4, 12
+    psrad           m1, 12
+%else
+    psrad           m4, 6
+    psrad           m1, 6
+%endif
+    packssdw        m4, m1
+    vextracti128    xm1, m4, 1
+    lea             r2, [r2 + r3 * 4]
+%ifidn %1,sp
+    packuswb        xm4, xm1
+    movd            [r2], xm4
+    pextrd          [r2 + r3], xm4, 2
+    pextrd          [r2 + r3 * 2], xm4, 1
+    pextrd          [r2 + r6], xm4, 3
+%else
+    movq            [r2], xm4
+    movq            [r2 + r3], xm1
+    movhps          [r2 + r3 * 2], xm4
+    movhps          [r2 + r6], xm1
+%endif
+
+    movq            xm4, [r0 + r4]
+    punpcklwd       xm3, xm4
+    lea             r0, [r0 + 4 * r1]
+    movq            xm1, [r0]
+    punpcklwd       xm4, xm1
+    vinserti128     m3, m3, xm4, 1                  ; m3 = [16 15 15 14]
+    pmaddwd         m4, m3, [r5 + 3 * mmsize]
+    paddd           m6, m4
+    pmaddwd         m4, m3, [r5 + 2 * mmsize]
+    paddd           m5, m4
+    pmaddwd         m4, m3, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m3, [r5]
+    movq            xm4, [r0 + r1]
+    punpcklwd       xm1, xm4
+    movq            xm2, [r0 + 2 * r1]
+    punpcklwd       xm4, xm2
+    vinserti128     m1, m1, xm4, 1                  ; m1 = [18 17 17 16]
+    pmaddwd         m4, m1, [r5 + 3 * mmsize]
+    paddd           m5, m4
+    pmaddwd         m4, m1, [r5 + 2 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m1, [r5 + 1 * mmsize]
+    paddd           m3, m1
+    movq            xm4, [r0 + r4]
+    punpcklwd       xm2, xm4
+    lea             r0, [r0 + 4 * r1]
+    movq            xm1, [r0]
+    punpcklwd       xm4, xm1
+    vinserti128     m2, m2, xm4, 1                  ; m2 = [20 19 19 18]
+    pmaddwd         m4, m2, [r5 + 3 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5 + 2 * mmsize]
+    paddd           m3, m2
+    movq            xm4, [r0 + r1]
+    punpcklwd       xm1, xm4
+    movq            xm2, [r0 + 2 * r1]
+    punpcklwd       xm4, xm2
+    vinserti128     m1, m1, xm4, 1                  ; m1 = [22 21 21 20]
+    pmaddwd         m1, [r5 + 3 * mmsize]
+    paddd           m3, m1
+
+%ifidn %1,sp
+    paddd           m6, m7
+    paddd           m5, m7
+    paddd           m0, m7
+    paddd           m3, m7
+    psrad           m6, 12
+    psrad           m5, 12
+    psrad           m0, 12
+    psrad           m3, 12
+%else
+    psrad           m6, 6
+    psrad           m5, 6
+    psrad           m0, 6
+    psrad           m3, 6
+%endif
+    packssdw        m6, m5
+    packssdw        m0, m3
+    lea             r2, [r2 + r3 * 4]
+
+%ifidn %1,sp
+    packuswb        m6, m0
+    vextracti128    xm0, m6, 1
+    movd            [r2], xm6
+    movd            [r2 + r3], xm0
+    pextrd          [r2 + r3 * 2], xm6, 1
+    pextrd          [r2 + r6], xm0, 1
+    lea             r2, [r2 + r3 * 4]
+    pextrd          [r2], xm6, 2
+    pextrd          [r2 + r3], xm0, 2
+    pextrd          [r2 + r3 * 2], xm6, 3
+    pextrd          [r2 + r6], xm0, 3
+%else
+    vextracti128    xm5, m6, 1
+    vextracti128    xm3, m0, 1
+    movq            [r2], xm6
+    movq            [r2 + r3], xm5
+    movhps          [r2 + r3 * 2], xm6
+    movhps          [r2 + r6], xm5
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm0
+    movq            [r2 + r3], xm3
+    movhps          [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm3
+%endif
+%endmacro
+
+%macro FILTER_VER_LUMA_AVX2_4x16 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_4x16, 4, 7, 8
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,sp
+    mova            m7, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+    lea             r6, [r3 * 3]
+    PROCESS_LUMA_AVX2_W4_16R %1
+    RET
+%endmacro
+
+FILTER_VER_LUMA_AVX2_4x16 sp
+FILTER_VER_LUMA_AVX2_4x16 ss
+
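+; 8-tap vertical luma filter on 16-bit input, 8x8 block (AVX2); %1 = sp or ss.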
+%macro FILTER_VER_LUMA_S_AVX2_8x8 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_8x8, 4, 6, 12
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,sp
+    mova            m11, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    pmaddwd         m3, [r5]
+    paddd           m1, m5
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    pmaddwd         m5, [r5]
+    paddd           m3, m7
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    pmaddwd         m6, [r5]
+    paddd           m4, m8
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    pmaddwd         m7, [r5]
+    paddd           m5, m9
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    pmaddwd         m8, [r5 + 1 * mmsize]
+    paddd           m4, m10
+    paddd           m6, m8
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm8, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm8, 1
+    pmaddwd         m8, m9, [r5 + 3 * mmsize]
+    paddd           m3, m8
+    pmaddwd         m8, m9, [r5 + 2 * mmsize]
+    pmaddwd         m9, [r5 + 1 * mmsize]
+    paddd           m5, m8
+    paddd           m7, m9
+    movu            xm8, [r0 + r4]                  ; m8 = row 11
+    punpckhwd       xm9, xm10, xm8
+    punpcklwd       xm10, xm8
+    vinserti128     m10, m10, xm9, 1
+    pmaddwd         m9, m10, [r5 + 3 * mmsize]
+    pmaddwd         m10, [r5 + 2 * mmsize]
+    paddd           m4, m9
+    paddd           m6, m10
+
+    lea             r4, [r3 * 3]
+%ifidn %1,sp
+    paddd           m0, m11
+    paddd           m1, m11
+    paddd           m2, m11
+    paddd           m3, m11
+    psrad           m0, 12
+    psrad           m1, 12
+    psrad           m2, 12
+    psrad           m3, 12
+%else
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%endif
+    packssdw        m0, m1
+    packssdw        m2, m3
+%ifidn %1,sp
+    packuswb        m0, m2
+    mova            m1, [interp8_hps_shuf]
+    vpermd          m0, m1, m0
+    vextracti128    xm2, m0, 1
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r4], xm2
+%else
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm3
+%endif
+
+    lea             r0, [r0 + r1 * 4]
+    movu            xm9, [r0]                       ; m9 = row 12
+    punpckhwd       xm3, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm3, 1
+    pmaddwd         m3, m8, [r5 + 3 * mmsize]
+    pmaddwd         m8, [r5 + 2 * mmsize]
+    paddd           m5, m3
+    paddd           m7, m8
+    movu            xm3, [r0 + r1]                  ; m3 = row 13
+    punpckhwd       xm0, xm9, xm3
+    punpcklwd       xm9, xm3
+    vinserti128     m9, m9, xm0, 1
+    pmaddwd         m9, [r5 + 3 * mmsize]
+    paddd           m6, m9
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm9, xm3, xm0
+    punpcklwd       xm3, xm0
+    vinserti128     m3, m3, xm9, 1
+    pmaddwd         m3, [r5 + 3 * mmsize]
+    paddd           m7, m3
+
+%ifidn %1,sp
+    paddd           m4, m11
+    paddd           m5, m11
+    paddd           m6, m11
+    paddd           m7, m11
+    psrad           m4, 12
+    psrad           m5, 12
+    psrad           m6, 12
+    psrad           m7, 12
+%else
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%endif
+    packssdw        m4, m5
+    packssdw        m6, m7
+    lea             r2, [r2 + r3 * 4]
+%ifidn %1,sp
+    packuswb        m4, m6
+    vpermd          m4, m1, m4
+    vextracti128    xm6, m4, 1
+    movq            [r2], xm4
+    movhps          [r2 + r3], xm4
+    movq            [r2 + r3 * 2], xm6
+    movhps          [r2 + r4], xm6
+%else
+    vpermq          m4, m4, 11011000b
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm5, m4, 1
+    vextracti128    xm7, m6, 1
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r4], xm7
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_8x8 sp
+FILTER_VER_LUMA_S_AVX2_8x8 ss
+
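+; Looped 8xN (N = 16/32) variant: each .loopH iteration produces 16 output rows
+; from 23 source rows (16 rows plus 7 rows of filter context). The source
+; pointer is advanced 20 rows while loading and rewound by 4 (r7 = 4 * srcStride),
+; so the net advance per iteration is exactly 16 source rows.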
+%macro FILTER_VER_LUMA_S_AVX2_8xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,sp
+    mova            m14, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+    lea             r6, [r3 * 3]
+    lea             r7, [r1 * 4]
+    mov             r8d, %2 / 16
+.loopH:
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    paddd           m3, m7
+    pmaddwd         m5, [r5]
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    paddd           m4, m8
+    pmaddwd         m6, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    paddd           m5, m9
+    pmaddwd         m7, [r5]
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    paddd           m4, m10
+    pmaddwd         m10, m8, [r5 + 1 * mmsize]
+    paddd           m6, m10
+    pmaddwd         m8, [r5]
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm11, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddwd         m11, m9, [r5 + 3 * mmsize]
+    paddd           m3, m11
+    pmaddwd         m11, m9, [r5 + 2 * mmsize]
+    paddd           m5, m11
+    pmaddwd         m11, m9, [r5 + 1 * mmsize]
+    paddd           m7, m11
+    pmaddwd         m9, [r5]
+    movu            xm11, [r0 + r4]                 ; m11 = row 11
+    punpckhwd       xm12, xm10, xm11
+    punpcklwd       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddwd         m12, m10, [r5 + 3 * mmsize]
+    paddd           m4, m12
+    pmaddwd         m12, m10, [r5 + 2 * mmsize]
+    paddd           m6, m12
+    pmaddwd         m12, m10, [r5 + 1 * mmsize]
+    paddd           m8, m12
+    pmaddwd         m10, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm12, [r0]                      ; m12 = row 12
+    punpckhwd       xm13, xm11, xm12
+    punpcklwd       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddwd         m13, m11, [r5 + 3 * mmsize]
+    paddd           m5, m13
+    pmaddwd         m13, m11, [r5 + 2 * mmsize]
+    paddd           m7, m13
+    pmaddwd         m13, m11, [r5 + 1 * mmsize]
+    paddd           m9, m13
+    pmaddwd         m11, [r5]
+
+%ifidn %1,sp
+    paddd           m0, m14
+    paddd           m1, m14
+    paddd           m2, m14
+    paddd           m3, m14
+    paddd           m4, m14
+    paddd           m5, m14
+    psrad           m0, 12
+    psrad           m1, 12
+    psrad           m2, 12
+    psrad           m3, 12
+    psrad           m4, 12
+    psrad           m5, 12
+%else
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%endif
+    packssdw        m0, m1
+    packssdw        m2, m3
+    packssdw        m4, m5
+%ifidn %1,sp
+    packuswb        m0, m2
+    mova            m1, [interp8_hps_shuf]
+    vpermd          m0, m1, m0
+    vextracti128    xm2, m0, 1
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r6], xm2
+%else
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r6], xm3
+%endif
+
+    movu            xm13, [r0 + r1]                 ; m13 = row 13
+    punpckhwd       xm0, xm12, xm13
+    punpcklwd       xm12, xm13
+    vinserti128     m12, m12, xm0, 1
+    pmaddwd         m0, m12, [r5 + 3 * mmsize]
+    paddd           m6, m0
+    pmaddwd         m0, m12, [r5 + 2 * mmsize]
+    paddd           m8, m0
+    pmaddwd         m0, m12, [r5 + 1 * mmsize]
+    paddd           m10, m0
+    pmaddwd         m12, [r5]
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm2, xm13, xm0
+    punpcklwd       xm13, xm0
+    vinserti128     m13, m13, xm2, 1
+    pmaddwd         m2, m13, [r5 + 3 * mmsize]
+    paddd           m7, m2
+    pmaddwd         m2, m13, [r5 + 2 * mmsize]
+    paddd           m9, m2
+    pmaddwd         m2, m13, [r5 + 1 * mmsize]
+    paddd           m11, m2
+    pmaddwd         m13, [r5]
+
+%ifidn %1,sp
+    paddd           m6, m14
+    paddd           m7, m14
+    psrad           m6, 12
+    psrad           m7, 12
+%else
+    psrad           m6, 6
+    psrad           m7, 6
+%endif
+    packssdw        m6, m7
+    lea             r2, [r2 + r3 * 4]
+
+%ifidn %1,sp
+    packuswb        m4, m6
+    vpermd          m4, m1, m4
+    vextracti128    xm6, m4, 1
+    movq            [r2], xm4
+    movhps          [r2 + r3], xm4
+    movq            [r2 + r3 * 2], xm6
+    movhps          [r2 + r6], xm6
+%else
+    vpermq          m6, m6, 11011000b
+    vpermq          m4, m4, 11011000b
+    vextracti128    xm1, m4, 1
+    vextracti128    xm7, m6, 1
+    movu            [r2], xm4
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r6], xm7
+%endif
+
+    movu            xm6, [r0 + r4]                  ; m6 = row 15
+    punpckhwd       xm5, xm0, xm6
+    punpcklwd       xm0, xm6
+    vinserti128     m0, m0, xm5, 1
+    pmaddwd         m5, m0, [r5 + 3 * mmsize]
+    paddd           m8, m5
+    pmaddwd         m5, m0, [r5 + 2 * mmsize]
+    paddd           m10, m5
+    pmaddwd         m5, m0, [r5 + 1 * mmsize]
+    paddd           m12, m5
+    pmaddwd         m0, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm2, [r0]                       ; m2 = row 16
+    punpckhwd       xm3, xm6, xm2
+    punpcklwd       xm6, xm2
+    vinserti128     m6, m6, xm3, 1
+    pmaddwd         m3, m6, [r5 + 3 * mmsize]
+    paddd           m9, m3
+    pmaddwd         m3, m6, [r5 + 2 * mmsize]
+    paddd           m11, m3
+    pmaddwd         m3, m6, [r5 + 1 * mmsize]
+    paddd           m13, m3
+    pmaddwd         m6, [r5]
+    movu            xm3, [r0 + r1]                  ; m3 = row 17
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 3 * mmsize]
+    paddd           m10, m4
+    pmaddwd         m4, m2, [r5 + 2 * mmsize]
+    paddd           m12, m4
+    pmaddwd         m2, [r5 + 1 * mmsize]
+    paddd           m0, m2
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
+    punpckhwd       xm2, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm2, 1
+    pmaddwd         m2, m3, [r5 + 3 * mmsize]
+    paddd           m11, m2
+    pmaddwd         m2, m3, [r5 + 2 * mmsize]
+    paddd           m13, m2
+    pmaddwd         m3, [r5 + 1 * mmsize]
+    paddd           m6, m3
+    movu            xm2, [r0 + r4]                  ; m2 = row 19
+    punpckhwd       xm7, xm4, xm2
+    punpcklwd       xm4, xm2
+    vinserti128     m4, m4, xm7, 1
+    pmaddwd         m7, m4, [r5 + 3 * mmsize]
+    paddd           m12, m7
+    pmaddwd         m4, [r5 + 2 * mmsize]
+    paddd           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm7, [r0]                       ; m7 = row 20
+    punpckhwd       xm3, xm2, xm7
+    punpcklwd       xm2, xm7
+    vinserti128     m2, m2, xm3, 1
+    pmaddwd         m3, m2, [r5 + 3 * mmsize]
+    paddd           m13, m3
+    pmaddwd         m2, [r5 + 2 * mmsize]
+    paddd           m6, m2
+    movu            xm3, [r0 + r1]                  ; m3 = row 21
+    punpckhwd       xm2, xm7, xm3
+    punpcklwd       xm7, xm3
+    vinserti128     m7, m7, xm2, 1
+    pmaddwd         m7, [r5 + 3 * mmsize]
+    paddd           m0, m7
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
+    punpckhwd       xm7, xm3, xm2
+    punpcklwd       xm3, xm2
+    vinserti128     m3, m3, xm7, 1
+    pmaddwd         m3, [r5 + 3 * mmsize]
+    paddd           m6, m3
+
+%ifidn %1,sp
+    paddd           m8, m14
+    paddd           m9, m14
+    paddd           m10, m14
+    paddd           m11, m14
+    paddd           m12, m14
+    paddd           m13, m14
+    paddd           m0, m14
+    paddd           m6, m14
+    psrad           m8, 12
+    psrad           m9, 12
+    psrad           m10, 12
+    psrad           m11, 12
+    psrad           m12, 12
+    psrad           m13, 12
+    psrad           m0, 12
+    psrad           m6, 12
+%else
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+    psrad           m12, 6
+    psrad           m13, 6
+    psrad           m0, 6
+    psrad           m6, 6
+%endif
+    packssdw        m8, m9
+    packssdw        m10, m11
+    packssdw        m12, m13
+    packssdw        m0, m6
+    lea             r2, [r2 + r3 * 4]
+
+%ifidn %1,sp
+    packuswb        m8, m10
+    packuswb        m12, m0
+    vpermd          m8, m1, m8
+    vpermd          m12, m1, m12
+    vextracti128    xm10, m8, 1
+    vextracti128    xm0, m12, 1
+    movq            [r2], xm8
+    movhps          [r2 + r3], xm8
+    movq            [r2 + r3 * 2], xm10
+    movhps          [r2 + r6], xm10
+    lea             r2, [r2 + r3 * 4]
+    movq            [r2], xm12
+    movhps          [r2 + r3], xm12
+    movq            [r2 + r3 * 2], xm0
+    movhps          [r2 + r6], xm0
+%else
+    vpermq          m8, m8, 11011000b
+    vpermq          m10, m10, 11011000b
+    vpermq          m12, m12, 11011000b
+    vpermq          m0, m0, 11011000b
+    vextracti128    xm9, m8, 1
+    vextracti128    xm11, m10, 1
+    vextracti128    xm13, m12, 1
+    vextracti128    xm6, m0, 1
+    movu            [r2], xm8
+    movu            [r2 + r3], xm9
+    movu            [r2 + r3 * 2], xm10
+    movu            [r2 + r6], xm11
+    lea             r2, [r2 + r3 * 4]
+    movu            [r2], xm12
+    movu            [r2 + r3], xm13
+    movu            [r2 + r3 * 2], xm0
+    movu            [r2 + r6], xm6
+%endif
+
+    lea             r2, [r2 + r3 * 4]
+    sub             r0, r7
+    dec             r8d
+    jnz             .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_8xN sp, 16
+FILTER_VER_LUMA_S_AVX2_8xN sp, 32
+FILTER_VER_LUMA_S_AVX2_8xN ss, 16
+FILTER_VER_LUMA_S_AVX2_8xN ss, 32
+
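+; Filters 4 rows of width 8 but leaves the results in registers: for sp the
+; packed pixels end up in xm0/xm2, for ss the 16-bit rows end up in xm0-xm3;
+; the 8x4 entry point below performs the stores.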
+%macro PROCESS_LUMA_S_AVX2_W8_4R 1
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m4, [r5 + 1 * mmsize]
+    paddd           m2, m4
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm4, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm4, 1
+    pmaddwd         m4, m5, [r5 + 2 * mmsize]
+    paddd           m1, m4
+    pmaddwd         m5, [r5 + 1 * mmsize]
+    paddd           m3, m5
+    movu            xm4, [r0 + r4]                  ; m4 = row 7
+    punpckhwd       xm5, xm6, xm4
+    punpcklwd       xm6, xm4
+    vinserti128     m6, m6, xm5, 1
+    pmaddwd         m5, m6, [r5 + 3 * mmsize]
+    paddd           m0, m5
+    pmaddwd         m6, [r5 + 2 * mmsize]
+    paddd           m2, m6
+    lea             r0, [r0 + r1 * 4]
+    movu            xm5, [r0]                       ; m5 = row 8
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 3 * mmsize]
+    paddd           m1, m6
+    pmaddwd         m4, [r5 + 2 * mmsize]
+    paddd           m3, m4
+    movu            xm6, [r0 + r1]                  ; m6 = row 9
+    punpckhwd       xm4, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm4, 1
+    pmaddwd         m5, [r5 + 3 * mmsize]
+    paddd           m2, m5
+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 10
+    punpckhwd       xm5, xm6, xm4
+    punpcklwd       xm6, xm4
+    vinserti128     m6, m6, xm5, 1
+    pmaddwd         m6, [r5 + 3 * mmsize]
+    paddd           m3, m6
+
+%ifidn %1,sp
+    paddd           m0, m7
+    paddd           m1, m7
+    paddd           m2, m7
+    paddd           m3, m7
+    psrad           m0, 12
+    psrad           m1, 12
+    psrad           m2, 12
+    psrad           m3, 12
+%else
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%endif
+    packssdw        m0, m1
+    packssdw        m2, m3
+%ifidn %1,sp
+    packuswb        m0, m2
+    mova            m4, [interp8_hps_shuf]
+    vpermd          m0, m4, m0
+    vextracti128    xm2, m0, 1
+%else
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+%endif
+%endmacro
+
+%macro FILTER_VER_LUMA_S_AVX2_8x4 1
+INIT_YMM avx2
+cglobal interp_8tap_vert_%1_8x4, 4, 6, 8
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+%ifidn %1,sp
+    mova            m7, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+
+    PROCESS_LUMA_S_AVX2_W8_4R %1
+    lea             r4, [r3 * 3]
+%ifidn %1,sp
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r4], xm2
+%else
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm3
+%endif
+    RET
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_8x4 sp
+FILTER_VER_LUMA_S_AVX2_8x4 ss
+
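+; 16-row, width-8 filtering body; unlike the 8xN loop above it walks the source
+; rows with a separate pointer (r7) rather than advancing r0.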
+%macro PROCESS_LUMA_AVX2_W8_16R 1
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    paddd           m0, m4
+    pmaddwd         m2, [r5]
+    lea             r7, [r0 + r1 * 4]
+    movu            xm4, [r7]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    paddd           m1, m5
+    pmaddwd         m3, [r5]
+    movu            xm5, [r7 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    paddd           m3, m7
+    pmaddwd         m5, [r5]
+    movu            xm7, [r7 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    paddd           m4, m8
+    pmaddwd         m6, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm8, [r7]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    paddd           m5, m9
+    pmaddwd         m7, [r5]
+    movu            xm9, [r7 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    paddd           m4, m10
+    pmaddwd         m10, m8, [r5 + 1 * mmsize]
+    paddd           m6, m10
+    pmaddwd         m8, [r5]
+    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm11, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm11, 1
+    pmaddwd         m11, m9, [r5 + 3 * mmsize]
+    paddd           m3, m11
+    pmaddwd         m11, m9, [r5 + 2 * mmsize]
+    paddd           m5, m11
+    pmaddwd         m11, m9, [r5 + 1 * mmsize]
+    paddd           m7, m11
+    pmaddwd         m9, [r5]
+    movu            xm11, [r7 + r4]                 ; m11 = row 11
+    punpckhwd       xm12, xm10, xm11
+    punpcklwd       xm10, xm11
+    vinserti128     m10, m10, xm12, 1
+    pmaddwd         m12, m10, [r5 + 3 * mmsize]
+    paddd           m4, m12
+    pmaddwd         m12, m10, [r5 + 2 * mmsize]
+    paddd           m6, m12
+    pmaddwd         m12, m10, [r5 + 1 * mmsize]
+    paddd           m8, m12
+    pmaddwd         m10, [r5]
+    lea             r7, [r7 + r1 * 4]
+    movu            xm12, [r7]                      ; m12 = row 12
+    punpckhwd       xm13, xm11, xm12
+    punpcklwd       xm11, xm12
+    vinserti128     m11, m11, xm13, 1
+    pmaddwd         m13, m11, [r5 + 3 * mmsize]
+    paddd           m5, m13
+    pmaddwd         m13, m11, [r5 + 2 * mmsize]
+    paddd           m7, m13