;*****************************************************************************
;* Copyright (C) 2013-2020 MulticoreWare, Inc
;*
;* Authors: Min Chen <chenm003@163.com>
;*          Nabajit Deka <nabajit@multicorewareinc.com>
;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 64
const tab_Tm,    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
                 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
                 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14

const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15

const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
                         dd 2, 3, 3, 4, 4, 5, 5, 6

const tab_Lm,    db 0, 1, 2, 3, 4,  5,  6,  7,  1, 2, 3, 4,  5,  6,  7,  8
                 db 2, 3, 4, 5, 6,  7,  8,  9,  3, 4, 5, 6,  7,  8,  9,  10
                 db 4, 5, 6, 7, 8,  9,  10, 11, 5, 6, 7, 8,  9,  10, 11, 12
                 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14

const pd_526336, times 8 dd 8192*64+2048
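; 526336 = 8192 * 64 + 2048: the 8192 * 64 term undoes the pw_2000 (8192) bias
; carried by the horizontal intermediates (the filter taps sum to 64), and
; 2048 = 1 << 11 is the rounding term for the 12-bit shift in FILTER_HV8_END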

const tab_ChromaCoeff, db  0, 64,  0,  0
                       db -2, 58, 10, -2
                       db -4, 54, 16, -2
                       db -6, 46, 28, -4
                       db -4, 36, 36, -4
                       db -4, 28, 46, -6
                       db -2, 16, 54, -4
                       db -2, 10, 58, -2
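; the eight rows above are the HEVC 4-tap chroma interpolation filters, one per
; 1/8-pel fractional position; each row sums to 64, so full-pel input passes
; through unchanged after the 6-bit downshift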

const tab_LumaCoeff,   db   0, 0,  0,  64,  0,   0,  0,  0
                       db  -1, 4, -10, 58,  17, -5,  1,  0
                       db  -1, 4, -11, 40,  40, -11, 4, -1
                       db   0, 1, -5,  17,  58, -10, 4, -1

const tabw_LumaCoeff,  dw   0, 0,  0,  64,  0,   0,  0,  0
                       dw  -1, 4, -10, 58,  17, -5,  1,  0
                       dw  -1, 4, -11, 40,  40, -11, 4, -1
                       dw   0, 1, -5,  17,  58, -10, 4, -1
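
; a reference sketch (illustrative C, not part of the build) of the 8-tap
; horizontal luma filter the tables above implement; coefIdx selects the row
; (0 = full-pel, 1/2/3 = 1/4-, 1/2- and 3/4-pel):
;
;   int sum = 0;
;   for (int t = 0; t < 8; t++)
;       sum += tab_LumaCoeff[coefIdx][t] * src[x + t - 3];
;   // pp outputs (sum + 32) >> 6 clipped to [0,255]; ps outputs sum - 8192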

const tab_LumaCoeffV,   times 4 dw 0, 0
                        times 4 dw 0, 64
                        times 4 dw 0, 0
                        times 4 dw 0, 0

                        times 4 dw -1, 4
                        times 4 dw -10, 58
                        times 4 dw 17, -5
                        times 4 dw 1, 0

                        times 4 dw -1, 4
                        times 4 dw -11, 40
                        times 4 dw 40, -11
                        times 4 dw 4, -1

                        times 4 dw 0, 1
                        times 4 dw -5, 17
                        times 4 dw 58, -10
                        times 4 dw 4, -1

const pw_LumaCoeffVer,  times 8 dw 0, 0
                        times 8 dw 0, 64
                        times 8 dw 0, 0
                        times 8 dw 0, 0

                        times 8 dw -1, 4
                        times 8 dw -10, 58
                        times 8 dw 17, -5
                        times 8 dw 1, 0

                        times 8 dw -1, 4
                        times 8 dw -11, 40
                        times 8 dw 40, -11
                        times 8 dw 4, -1

                        times 8 dw 0, 1
                        times 8 dw -5, 17
                        times 8 dw 58, -10
                        times 8 dw 4, -1

const tab_LumaCoeffVer, times 8 db 0, 0
                        times 8 db 0, 64
                        times 8 db 0, 0
                        times 8 db 0, 0

                        times 8 db -1, 4
                        times 8 db -10, 58
                        times 8 db 17, -5
                        times 8 db 1, 0

                        times 8 db -1, 4
                        times 8 db -11, 40
                        times 8 db 40, -11
                        times 8 db 4, -1

                        times 8 db 0, 1
                        times 8 db -5, 17
                        times 8 db 58, -10
                        times 8 db 4, -1

const tab_LumaCoeffVer_32,  times 16 db 0, 0
                            times 16 db 0, 64
                            times 16 db 0, 0
                            times 16 db 0, 0

                            times 16 db -1, 4
                            times 16 db -10, 58
                            times 16 db 17, -5
                            times 16 db 1, 0

                            times 16 db -1, 4
                            times 16 db -11, 40
                            times 16 db 40, -11
                            times 16 db 4, -1

                            times 16 db 0, 1
                            times 16 db -5, 17
                            times 16 db 58, -10
                            times 16 db 4, -1

ALIGN 64
const tab_ChromaCoeffVer_32_avx512,     times 32 db 0, 64
                                        times 32 db 0, 0

                                        times 32 db -2, 58
                                        times 32 db 10, -2

                                        times 32 db -4, 54
                                        times 32 db 16, -2

                                        times 32 db -6, 46
                                        times 32 db 28, -4

                                        times 32 db -4, 36
                                        times 32 db 36, -4

                                        times 32 db -4, 28
                                        times 32 db 46, -6

                                        times 32 db -2, 16
                                        times 32 db 54, -4

                                        times 32 db -2, 10
                                        times 32 db 58, -2

ALIGN 64
const pw_ChromaCoeffVer_32_avx512,      times 16 dw 0, 64
                                        times 16 dw 0, 0

                                        times 16 dw -2, 58
                                        times 16 dw 10, -2

                                        times 16 dw -4, 54
                                        times 16 dw 16, -2

                                        times 16 dw -6, 46
                                        times 16 dw 28, -4

                                        times 16 dw -4, 36
                                        times 16 dw 36, -4

                                        times 16 dw -4, 28
                                        times 16 dw 46, -6

                                        times 16 dw -2, 16
                                        times 16 dw 54, -4

                                        times 16 dw -2, 10
                                        times 16 dw 58, -2

ALIGN 64
const pw_LumaCoeffVer_avx512,           times 16 dw 0, 0
                                        times 16 dw 0, 64
                                        times 16 dw 0, 0
                                        times 16 dw 0, 0

                                        times 16 dw -1, 4
                                        times 16 dw -10, 58
                                        times 16 dw 17, -5
                                        times 16 dw 1, 0

                                        times 16 dw -1, 4
                                        times 16 dw -11, 40
                                        times 16 dw 40, -11
                                        times 16 dw 4, -1

                                        times 16 dw 0, 1
                                        times 16 dw -5, 17
                                        times 16 dw 58, -10
                                        times 16 dw 4, -1

ALIGN 64
const tab_LumaCoeffVer_32_avx512,       times 32 db 0, 0
                                        times 32 db 0, 64
                                        times 32 db 0, 0
                                        times 32 db 0, 0

                                        times 32 db -1, 4
                                        times 32 db -10, 58
                                        times 32 db 17, -5
                                        times 32 db 1, 0

                                        times 32 db -1, 4
                                        times 32 db -11, 40
                                        times 32 db 40, -11
                                        times 32 db 4, -1

                                        times 32 db 0, 1
                                        times 32 db -5, 17
                                        times 32 db 58, -10
                                        times 32 db 4, -1

const tab_c_64_n64, times 8 db 64, -64
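; with the source bytes interleaved against pb_128, pmaddubsw by {64, -64}
; computes 64*p - 64*128 = (p << 6) - 8192 per sample, i.e. the same
; pixel-to-short conversion as the psllw/psubw pw_8192 sequence in the sse2 paths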

const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7

const interp4_horiz_shuf_load1_avx512,  times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10

ALIGN 64
interp4_vps_store1_avx512:   dq 0, 1, 8, 9, 2, 3, 10, 11
interp4_vps_store2_avx512:   dq 4, 5, 12, 13, 6, 7, 14, 15
const interp4_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
const interp4_hps_store_16xN_avx512,  dq 0, 2, 1, 3, 4, 6, 5, 7
const interp8_hps_store_avx512,  dq 0, 1, 4, 5, 2, 3, 6, 7
const interp8_vsp_store_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7

SECTION .text
cextern pb_128
cextern pw_1
cextern pw_32
cextern pw_512
cextern pw_2000
cextern pw_8192

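;-----------------------------------------------------------------------------
; FILTER_H8_W8_sse2: 8-tap horizontal luma filter over 8 pixels starting at
; r0 + x - 3; expects m3 = word coefficients (a tabw_LumaCoeff row), m6 = 0 and
; the %assign'd x as the column offset; leaves the 8 unshifted filtered words
; in m1 (clobbers m0, m4, m5, m7)
;-----------------------------------------------------------------------------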
%macro FILTER_H8_W8_sse2 0
    movh        m1, [r0 + x - 3]
    movh        m4, [r0 + x - 2]
    punpcklbw   m1, m6
    punpcklbw   m4, m6
    movh        m5, [r0 + x - 1]
    movh        m0, [r0 + x]
    punpcklbw   m5, m6
    punpcklbw   m0, m6
    pmaddwd     m1, m3
    pmaddwd     m4, m3
    pmaddwd     m5, m3
    pmaddwd     m0, m3
    packssdw    m1, m4
    packssdw    m5, m0
    pshuflw     m4, m1, q2301
    pshufhw     m4, m4, q2301
    pshuflw     m0, m5, q2301
    pshufhw     m0, m0, q2301
    paddw       m1, m4
    paddw       m5, m0
    psrldq      m1, 2
    psrldq      m5, 2
    pshufd      m1, m1, q3120
    pshufd      m5, m5, q3120
    punpcklqdq  m1, m5
    movh        m7, [r0 + x + 1]
    movh        m4, [r0 + x + 2]
    punpcklbw   m7, m6
    punpcklbw   m4, m6
    movh        m5, [r0 + x + 3]
    movh        m0, [r0 + x + 4]
    punpcklbw   m5, m6
    punpcklbw   m0, m6
    pmaddwd     m7, m3
    pmaddwd     m4, m3
    pmaddwd     m5, m3
    pmaddwd     m0, m3
    packssdw    m7, m4
    packssdw    m5, m0
    pshuflw     m4, m7, q2301
    pshufhw     m4, m4, q2301
    pshuflw     m0, m5, q2301
    pshufhw     m0, m0, q2301
    paddw       m7, m4
    paddw       m5, m0
    psrldq      m7, 2
    psrldq      m5, 2
    pshufd      m7, m7, q3120
    pshufd      m5, m5, q3120
    punpcklqdq  m7, m5
    pshuflw     m4, m1, q2301
    pshufhw     m4, m4, q2301
    pshuflw     m0, m7, q2301
    pshufhw     m0, m0, q2301
    paddw       m1, m4
    paddw       m7, m0
    psrldq      m1, 2
    psrldq      m7, 2
    pshufd      m1, m1, q3120
    pshufd      m7, m7, q3120
    punpcklqdq  m1, m7
%endmacro

%macro FILTER_H8_W4_sse2 0
    movh        m1, [r0 + x - 3]
    movh        m0, [r0 + x - 2]
    punpcklbw   m1, m6
    punpcklbw   m0, m6
    movh        m4, [r0 + x - 1]
    movh        m5, [r0 + x]
    punpcklbw   m4, m6
    punpcklbw   m5, m6
    pmaddwd     m1, m3
    pmaddwd     m0, m3
    pmaddwd     m4, m3
    pmaddwd     m5, m3
    packssdw    m1, m0
    packssdw    m4, m5
    pshuflw     m0, m1, q2301
    pshufhw     m0, m0, q2301
    pshuflw     m5, m4, q2301
    pshufhw     m5, m5, q2301
    paddw       m1, m0
    paddw       m4, m5
    psrldq      m1, 2
    psrldq      m4, 2
    pshufd      m1, m1, q3120
    pshufd      m4, m4, q3120
    punpcklqdq  m1, m4
    pshuflw     m0, m1, q2301
    pshufhw     m0, m0, q2301
    paddw       m1, m0
    psrldq      m1, 2
    pshufd      m1, m1, q3120
%endmacro
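
;-----------------------------------------------------------------------------
; PROCESS_LUMA_W4_4R_sse2: four rows of the 4-wide vertical 8-tap luma filter;
; expects r0 = src, r1 = srcStride, r6 -> coefficient table, m0 = 0; leaves the
; unshifted 16-bit sums for rows 1-2 in m2 and rows 3-4 in m3
;-----------------------------------------------------------------------------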
%macro PROCESS_LUMA_W4_4R_sse2 0
    movd        m2,     [r0]
    movd        m7,     [r0 + r1]
    punpcklbw   m2,     m7                      ; m2=[0 1]

    lea         r0,     [r0 + 2 * r1]
    movd        m3,     [r0]
    punpcklbw   m7,     m3                      ; m7=[1 2]
    punpcklbw   m2,     m0
    punpcklbw   m7,     m0
    pmaddwd     m2,     [r6 + 0 * 32]
    pmaddwd     m7,     [r6 + 0 * 32]
    packssdw    m2,     m7                      ; m2=[0+1 1+2]

    movd        m7,     [r0 + r1]
    punpcklbw   m3,     m7                      ; m3=[2 3]
    lea         r0,     [r0 + 2 * r1]
    movd        m5,     [r0]
    punpcklbw   m7,     m5                      ; m7=[3 4]
    punpcklbw   m3,     m0
    punpcklbw   m7,     m0
    pmaddwd     m4,     m3,     [r6 + 1 * 32]
    pmaddwd     m6,     m7,     [r6 + 1 * 32]
    packssdw    m4,     m6                      ; m4=[2+3 3+4]
    paddw       m2,     m4                      ; m2=[0+1+2+3 1+2+3+4]                   Row1-2
    pmaddwd     m3,     [r6 + 0 * 32]
    pmaddwd     m7,     [r6 + 0 * 32]
    packssdw    m3,     m7                      ; m3=[2+3 3+4]                           Row3-4

    movd        m7,     [r0 + r1]
    punpcklbw   m5,     m7                      ; m5=[4 5]
    lea         r0,     [r0 + 2 * r1]
    movd        m4,     [r0]
    punpcklbw   m7,     m4                      ; m7=[5 6]
    punpcklbw   m5,     m0
    punpcklbw   m7,     m0
    pmaddwd     m6,     m5,     [r6 + 2 * 32]
    pmaddwd     m8,     m7,     [r6 + 2 * 32]
    packssdw    m6,     m8                      ; m6=[4+5 5+6]
    paddw       m2,     m6                      ; m2=[0+1+2+3+4+5 1+2+3+4+5+6]           Row1-2
    pmaddwd     m5,     [r6 + 1 * 32]
    pmaddwd     m7,     [r6 + 1 * 32]
    packssdw    m5,     m7                      ; m5=[4+5 5+6]
    paddw       m3,     m5                      ; m3=[2+3+4+5 3+4+5+6]                   Row3-4

    movd        m7,     [r0 + r1]
    punpcklbw   m4,     m7                      ; m4=[6 7]
    lea         r0,     [r0 + 2 * r1]
    movd        m5,     [r0]
    punpcklbw   m7,     m5                      ; m7=[7 8]
    punpcklbw   m4,     m0
    punpcklbw   m7,     m0
    pmaddwd     m6,     m4,     [r6 + 3 * 32]
    pmaddwd     m8,     m7,     [r6 + 3 * 32]
    packssdw    m6,     m8                      ; m6=[6+7 7+8]
    paddw       m2,     m6                      ; m2=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8]   Row1-2 end
    pmaddwd     m4,     [r6 + 2 * 32]
    pmaddwd     m7,     [r6 + 2 * 32]
    packssdw    m4,     m7                      ; m4=[6+7 7+8]
    paddw       m3,     m4                      ; m3=[2+3+4+5+6+7 3+4+5+6+7+8]           Row3-4

    movd        m7,     [r0 + r1]
    punpcklbw   m5,     m7                      ; m5=[8 9]
    movd        m4,     [r0 + 2 * r1]
    punpcklbw   m7,     m4                      ; m7=[9 10]
    punpcklbw   m5,     m0
    punpcklbw   m7,     m0
    pmaddwd     m5,     [r6 + 3 * 32]
    pmaddwd     m7,     [r6 + 3 * 32]
    packssdw    m5,     m7                      ; m5=[8+9 9+10]
    paddw       m3,     m5                      ; m3=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10]  Row3-4 end
%endmacro

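;-----------------------------------------------------------------------------
; PROCESS_LUMA_W8_4R_sse2: four rows of the 8-wide vertical 8-tap luma filter;
; same register contract as PROCESS_LUMA_W4_4R_sse2, leaving the unshifted
; 16-bit sums for rows 1-4 in m2, m3, m4 and m5
;-----------------------------------------------------------------------------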
%macro PROCESS_LUMA_W8_4R_sse2 0
    movq        m7,     [r0]
    movq        m6,     [r0 + r1]
    punpcklbw   m7,     m6
    punpcklbw   m2,     m7,     m0
    punpckhbw   m7,     m0
    pmaddwd     m2,     [r6 + 0 * 32]
    pmaddwd     m7,     [r6 + 0 * 32]
    packssdw    m2,     m7                      ; m2=[0+1]               Row1

    lea         r0,     [r0 + 2 * r1]
    movq        m7,     [r0]
    punpcklbw   m6,     m7
    punpcklbw   m3,     m6,     m0
    punpckhbw   m6,     m0
    pmaddwd     m3,     [r6 + 0 * 32]
    pmaddwd     m6,     [r6 + 0 * 32]
    packssdw    m3,     m6                      ; m3=[1+2]               Row2

    movq        m6,     [r0 + r1]
    punpcklbw   m7,     m6
    punpckhbw   m8,     m7,     m0
    punpcklbw   m7,     m0
    pmaddwd     m4,     m7,     [r6 + 0 * 32]
    pmaddwd     m9,     m8,     [r6 + 0 * 32]
    packssdw    m4,     m9                      ; m4=[2+3]               Row3
    pmaddwd     m7,     [r6 + 1 * 32]
    pmaddwd     m8,     [r6 + 1 * 32]
    packssdw    m7,     m8
    paddw       m2,     m7                      ; m2=[0+1+2+3]           Row1

    lea         r0,     [r0 + 2 * r1]
    movq        m10,    [r0]
    punpcklbw   m6,     m10
    punpckhbw   m8,     m6,     m0
    punpcklbw   m6,     m0
    pmaddwd     m5,     m6,     [r6 + 0 * 32]
    pmaddwd     m9,     m8,     [r6 + 0 * 32]
    packssdw    m5,     m9                      ; m5=[3+4]               Row4
    pmaddwd     m6,     [r6 + 1 * 32]
    pmaddwd     m8,     [r6 + 1 * 32]
    packssdw    m6,     m8
    paddw       m3,     m6                      ; m3 = [1+2+3+4]         Row2

    movq        m6,     [r0 + r1]
    punpcklbw   m10,    m6
    punpckhbw   m8,     m10,    m0
    punpcklbw   m10,    m0
    pmaddwd     m7,     m10,    [r6 + 1 * 32]
    pmaddwd     m9,     m8,     [r6 + 1 * 32]
    packssdw    m7,     m9
    pmaddwd     m10,    [r6 + 2 * 32]
    pmaddwd     m8,     [r6 + 2 * 32]
    packssdw    m10,    m8
    paddw       m2,     m10                     ; m2=[0+1+2+3+4+5]       Row1
    paddw       m4,     m7                      ; m4=[2+3+4+5]           Row3

    lea         r0,     [r0 + 2 * r1]
    movq        m10,    [r0]
    punpcklbw   m6,     m10
    punpckhbw   m8,     m6,     m0
    punpcklbw   m6,     m0
    pmaddwd     m7,     m6,     [r6 + 1 * 32]
    pmaddwd     m9,     m8,     [r6 + 1 * 32]
    packssdw    m7,     m9
    pmaddwd     m6,     [r6 + 2 * 32]
    pmaddwd     m8,     [r6 + 2 * 32]
    packssdw    m6,     m8
    paddw       m3,     m6                      ; m3=[1+2+3+4+5+6]       Row2
    paddw       m5,     m7                      ; m5=[3+4+5+6]           Row4

    movq        m6,     [r0 + r1]
    punpcklbw   m10,    m6
    punpckhbw   m8,     m10,    m0
    punpcklbw   m10,    m0
    pmaddwd     m7,     m10,    [r6 + 2 * 32]
    pmaddwd     m9,     m8,     [r6 + 2 * 32]
    packssdw    m7,     m9
    pmaddwd     m10,    [r6 + 3 * 32]
    pmaddwd     m8,     [r6 + 3 * 32]
    packssdw    m10,    m8
    paddw       m2,     m10                     ; m2=[0+1+2+3+4+5+6+7]   Row1 end
    paddw       m4,     m7                      ; m4=[2+3+4+5+6+7]       Row3

    lea         r0,     [r0 + 2 * r1]
    movq        m10,    [r0]
    punpcklbw   m6,     m10
    punpckhbw   m8,     m6,     m0
    punpcklbw   m6,     m0
    pmaddwd     m7,     m6,     [r6 + 2 * 32]
    pmaddwd     m9,     m8,     [r6 + 2 * 32]
    packssdw    m7,     m9
    pmaddwd     m6,     [r6 + 3 * 32]
    pmaddwd     m8,     [r6 + 3 * 32]
    packssdw    m6,     m8
    paddw       m3,     m6                      ; m3=[1+2+3+4+5+6+7+8]   Row2 end
    paddw       m5,     m7                      ; m5=[3+4+5+6+7+8]       Row4

    movq        m6,     [r0 + r1]
    punpcklbw   m10,    m6
    punpckhbw   m8,     m10,     m0
    punpcklbw   m10,    m0
    pmaddwd     m8,     [r6 + 3 * 32]
    pmaddwd     m10,    [r6 + 3 * 32]
    packssdw    m10,    m8
    paddw       m4,     m10                     ; m4=[2+3+4+5+6+7+8+9]   Row3 end

    movq        m10,    [r0 + 2 * r1]
    punpcklbw   m6,     m10
    punpckhbw   m8,     m6,     m0
    punpcklbw   m6,     m0
    pmaddwd     m8,     [r6 + 3 * 32]
    pmaddwd     m6,     [r6 + 3 * 32]
    packssdw    m6,     m8
    paddw       m5,     m6                      ; m5=[3+4+5+6+7+8+9+10]  Row4 end
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_sse2 3
INIT_XMM sse2
cglobal interp_8tap_vert_%3_%1x%2, 5, 8, 11
    lea         r5,     [3 * r1]
    sub         r0,     r5
    shl         r4d,    7

%ifdef PIC
    lea         r6,     [pw_LumaCoeffVer]
    add         r6,     r4
%else
    lea         r6,     [pw_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m1,     [pw_32]
%else
    mova        m1,     [pw_2000]
    add         r3d,    r3d
%endif

    mov         r4d,    %2/4
    lea         r5,     [3 * r3]
    pxor        m0,     m0

.loopH:
%assign x 0
%rep (%1 / 8)
    PROCESS_LUMA_W8_4R_sse2

%ifidn %3,pp
    paddw       m2,     m1
    paddw       m3,     m1
    paddw       m4,     m1
    paddw       m5,     m1
    psraw       m2,     6
    psraw       m3,     6
    psraw       m4,     6
    psraw       m5,     6

    packuswb    m2,     m3
    packuswb    m4,     m5

    movh        [r2 + x], m2
    movhps      [r2 + r3 + x], m2
    movh        [r2 + 2 * r3 + x], m4
    movhps      [r2 + r5 + x], m4
%else
    psubw       m2,     m1
    psubw       m3,     m1
    psubw       m4,     m1
    psubw       m5,     m1

    movu        [r2 + (2*x)], m2
    movu        [r2 + r3 + (2*x)], m3
    movu        [r2 + 2 * r3 + (2*x)], m4
    movu        [r2 + r5 + (2*x)], m5
%endif
%assign x x+8
%if %1 > 8
    lea         r7,     [8 * r1 - 8]
    sub         r0,     r7
%endif
%endrep

%rep (%1 % 8)/4
    PROCESS_LUMA_W4_4R_sse2

%ifidn %3,pp
    paddw       m2,     m1
    psraw       m2,     6
    paddw       m3,     m1
    psraw       m3,     6

    packuswb    m2,     m3

    movd        [r2 + x], m2
    psrldq      m2,     4
    movd        [r2 + r3 + x], m2
    psrldq      m2,     4
    movd        [r2 + 2 * r3 + x], m2
    psrldq      m2,     4
    movd        [r2 + r5 + x], m2
%else
    psubw       m2,     m1
    psubw       m3,     m1

    movh        [r2 + (2*x)], m2
    movhps      [r2 + r3 + (2*x)], m2
    movh        [r2 + 2 * r3 + (2*x)], m3
    movhps      [r2 + r5 + (2*x)], m3
%endif
%endrep

    lea         r2,     [r2 + 4 * r3]
%if %1 <= 8
    lea         r7,     [4 * r1]
    sub         r0,     r7
%elif %1 == 12
    lea         r7,     [4 * r1 + 8]
    sub         r0,     r7
%else
    lea         r0,     [r0 + 4 * r1 - %1]
%endif

    dec         r4d
    jnz         .loopH

    RET

%endmacro

%if ARCH_X86_64
    FILTER_VER_LUMA_sse2 4, 4, pp
    FILTER_VER_LUMA_sse2 4, 8, pp
    FILTER_VER_LUMA_sse2 4, 16, pp
    FILTER_VER_LUMA_sse2 8, 4, pp
    FILTER_VER_LUMA_sse2 8, 8, pp
    FILTER_VER_LUMA_sse2 8, 16, pp
    FILTER_VER_LUMA_sse2 8, 32, pp
    FILTER_VER_LUMA_sse2 12, 16, pp
    FILTER_VER_LUMA_sse2 16, 4, pp
    FILTER_VER_LUMA_sse2 16, 8, pp
    FILTER_VER_LUMA_sse2 16, 12, pp
    FILTER_VER_LUMA_sse2 16, 16, pp
    FILTER_VER_LUMA_sse2 16, 32, pp
    FILTER_VER_LUMA_sse2 16, 64, pp
    FILTER_VER_LUMA_sse2 24, 32, pp
    FILTER_VER_LUMA_sse2 32, 8, pp
    FILTER_VER_LUMA_sse2 32, 16, pp
    FILTER_VER_LUMA_sse2 32, 24, pp
    FILTER_VER_LUMA_sse2 32, 32, pp
    FILTER_VER_LUMA_sse2 32, 64, pp
    FILTER_VER_LUMA_sse2 48, 64, pp
    FILTER_VER_LUMA_sse2 64, 16, pp
    FILTER_VER_LUMA_sse2 64, 32, pp
    FILTER_VER_LUMA_sse2 64, 48, pp
    FILTER_VER_LUMA_sse2 64, 64, pp

    FILTER_VER_LUMA_sse2 4, 4, ps
    FILTER_VER_LUMA_sse2 4, 8, ps
    FILTER_VER_LUMA_sse2 4, 16, ps
    FILTER_VER_LUMA_sse2 8, 4, ps
    FILTER_VER_LUMA_sse2 8, 8, ps
    FILTER_VER_LUMA_sse2 8, 16, ps
    FILTER_VER_LUMA_sse2 8, 32, ps
    FILTER_VER_LUMA_sse2 12, 16, ps
    FILTER_VER_LUMA_sse2 16, 4, ps
    FILTER_VER_LUMA_sse2 16, 8, ps
    FILTER_VER_LUMA_sse2 16, 12, ps
    FILTER_VER_LUMA_sse2 16, 16, ps
    FILTER_VER_LUMA_sse2 16, 32, ps
    FILTER_VER_LUMA_sse2 16, 64, ps
    FILTER_VER_LUMA_sse2 24, 32, ps
    FILTER_VER_LUMA_sse2 32, 8, ps
    FILTER_VER_LUMA_sse2 32, 16, ps
    FILTER_VER_LUMA_sse2 32, 24, ps
    FILTER_VER_LUMA_sse2 32, 32, ps
    FILTER_VER_LUMA_sse2 32, 64, ps
    FILTER_VER_LUMA_sse2 48, 64, ps
    FILTER_VER_LUMA_sse2 64, 16, ps
    FILTER_VER_LUMA_sse2 64, 32, ps
    FILTER_VER_LUMA_sse2 64, 48, ps
    FILTER_VER_LUMA_sse2 64, 64, ps
%endif
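
; a reference sketch (illustrative C, not part of the build) of one output
; sample of the pp/ps variants above, with coef[] the selected filter row:
;
;   int sum = 0;
;   for (int t = 0; t < 8; t++)
;       sum += coef[t] * src[(t - 3) * srcStride + x];
;   if (pp)      // round, shift and clip back to pixel range (packuswb)
;   {
;       int v = (sum + 32) >> 6;
;       dst[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
;   }
;   else         // ps: keep a biased 16-bit intermediate for a later stage
;       dst16[x] = (int16_t)(sum - 8192);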

%macro FILTER_P2S_2_4_sse2 1
    movd        m2,     [r0 + %1]
    movd        m3,     [r0 + r1 + %1]
    punpcklwd   m2,     m3
    movd        m3,     [r0 + r1 * 2 + %1]
    movd        m4,     [r0 + r4 + %1]
    punpcklwd   m3,     m4
    punpckldq   m2,     m3
    punpcklbw   m2,     m0
    psllw       m2,     6
    psubw       m2,     m1

    movd        [r2 + r3 * 0 + %1 * 2], m2
    psrldq      m2,     4
    movd        [r2 + r3 * 1 + %1 * 2], m2
    psrldq      m2,     4
    movd        [r2 + r3 * 2 + %1 * 2], m2
    psrldq      m2,     4
    movd        [r2 + r5 + %1 * 2], m2
%endmacro

%macro FILTER_P2S_4_4_sse2 1
    movd        m2,     [r0 + %1]
    movd        m3,     [r0 + r1 + %1]
    movd        m4,     [r0 + r1 * 2 + %1]
    movd        m5,     [r0 + r4 + %1]
    punpckldq   m2,     m3
    punpcklbw   m2,     m0
    punpckldq   m4,     m5
    punpcklbw   m4,     m0
    psllw       m2,     6
    psllw       m4,     6
    psubw       m2,     m1
    psubw       m4,     m1
    movh        [r2 + r3 * 0 + %1 * 2], m2
    movh        [r2 + r3 * 2 + %1 * 2], m4
    movhps      [r2 + r3 * 1 + %1 * 2], m2
    movhps      [r2 + r5 + %1 * 2], m4
%endmacro

%macro FILTER_P2S_4_2_sse2 0
    movd        m2,     [r0]
    movd        m3,     [r0 + r1]
    punpckldq   m2,     m3
    punpcklbw   m2,     m0
    psllw       m2,     6
    psubw       m2,     [pw_8192]
    movh        [r2],   m2
    movhps      [r2 + r3 * 2], m2
%endmacro

%macro FILTER_P2S_8_4_sse2 1
    movh        m2,     [r0 + %1]
    movh        m3,     [r0 + r1 + %1]
    movh        m4,     [r0 + r1 * 2 + %1]
    movh        m5,     [r0 + r4 + %1]
    punpcklbw   m2,     m0
    punpcklbw   m3,     m0
    punpcklbw   m5,     m0
    punpcklbw   m4,     m0
    psllw       m2,     6
    psllw       m3,     6
    psllw       m5,     6
    psllw       m4,     6
    psubw       m2,     m1
    psubw       m3,     m1
    psubw       m4,     m1
    psubw       m5,     m1
    movu        [r2 + r3 * 0 + %1 * 2], m2
    movu        [r2 + r3 * 1 + %1 * 2], m3
    movu        [r2 + r3 * 2 + %1 * 2], m4
    movu        [r2 + r5 + %1 * 2], m5
%endmacro

%macro FILTER_P2S_8_2_sse2 1
    movh        m2,     [r0 + %1]
    movh        m3,     [r0 + r1 + %1]
    punpcklbw   m2,     m0
    punpcklbw   m3,     m0
    psllw       m2,     6
    psllw       m3,     6
    psubw       m2,     m1
    psubw       m3,     m1
    movu        [r2 + r3 * 0 + %1 * 2], m2
    movu        [r2 + r3 * 1 + %1 * 2], m3
%endmacro
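
; all P2S helpers above reduce to the same per-sample conversion, sketched in C:
;   dst[x] = (int16_t)((src[x] << 6) - 8192);
; the << 6 moves pixels into the filter domain and the -8192 bias keeps the
; intermediate centred within int16 range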

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro FILTER_PIX_TO_SHORT_sse2 2
INIT_XMM sse2
cglobal filterPixelToShort_%1x%2, 4, 6, 6
    pxor        m0,     m0
%if %2 == 2
%if %1 == 4
    FILTER_P2S_4_2_sse2
%elif %1 == 8
    add        r3d, r3d
    mova       m1, [pw_8192]
    FILTER_P2S_8_2_sse2 0
%endif
%else
    add        r3d, r3d
    mova       m1, [pw_8192]
    lea        r4, [r1 * 3]
    lea        r5, [r3 * 3]
%assign y 1
%rep %2/4
%assign x 0
%rep %1/8
    FILTER_P2S_8_4_sse2 x
%if %2 == 6
    lea         r0,     [r0 + 4 * r1]
    lea         r2,     [r2 + 4 * r3]
    FILTER_P2S_8_2_sse2 x
%endif
%assign x x+8
%endrep
%rep (%1 % 8)/4
    FILTER_P2S_4_4_sse2 x
%assign x x+4
%endrep
%rep (%1 % 4)/2
    FILTER_P2S_2_4_sse2 x
%endrep
%if y < %2/4
    lea         r0,     [r0 + 4 * r1]
    lea         r2,     [r2 + 4 * r3]
%assign y y+1
%endif
%endrep
%endif
RET
%endmacro

    FILTER_PIX_TO_SHORT_sse2 2, 4
    FILTER_PIX_TO_SHORT_sse2 2, 8
    FILTER_PIX_TO_SHORT_sse2 2, 16
    FILTER_PIX_TO_SHORT_sse2 4, 2
    FILTER_PIX_TO_SHORT_sse2 4, 4
    FILTER_PIX_TO_SHORT_sse2 4, 8
    FILTER_PIX_TO_SHORT_sse2 4, 16
    FILTER_PIX_TO_SHORT_sse2 4, 32
    FILTER_PIX_TO_SHORT_sse2 6, 8
    FILTER_PIX_TO_SHORT_sse2 6, 16
    FILTER_PIX_TO_SHORT_sse2 8, 2
    FILTER_PIX_TO_SHORT_sse2 8, 4
    FILTER_PIX_TO_SHORT_sse2 8, 6
    FILTER_PIX_TO_SHORT_sse2 8, 8
    FILTER_PIX_TO_SHORT_sse2 8, 12
    FILTER_PIX_TO_SHORT_sse2 8, 16
    FILTER_PIX_TO_SHORT_sse2 8, 32
    FILTER_PIX_TO_SHORT_sse2 8, 64
    FILTER_PIX_TO_SHORT_sse2 12, 16
    FILTER_PIX_TO_SHORT_sse2 12, 32
    FILTER_PIX_TO_SHORT_sse2 16, 4
    FILTER_PIX_TO_SHORT_sse2 16, 8
    FILTER_PIX_TO_SHORT_sse2 16, 12
    FILTER_PIX_TO_SHORT_sse2 16, 16
    FILTER_PIX_TO_SHORT_sse2 16, 24
    FILTER_PIX_TO_SHORT_sse2 16, 32
    FILTER_PIX_TO_SHORT_sse2 16, 64
    FILTER_PIX_TO_SHORT_sse2 24, 32
    FILTER_PIX_TO_SHORT_sse2 24, 64
    FILTER_PIX_TO_SHORT_sse2 32, 8
    FILTER_PIX_TO_SHORT_sse2 32, 16
    FILTER_PIX_TO_SHORT_sse2 32, 24
    FILTER_PIX_TO_SHORT_sse2 32, 32
    FILTER_PIX_TO_SHORT_sse2 32, 48
    FILTER_PIX_TO_SHORT_sse2 32, 64
    FILTER_PIX_TO_SHORT_sse2 48, 64
    FILTER_PIX_TO_SHORT_sse2 64, 16
    FILTER_PIX_TO_SHORT_sse2 64, 32
    FILTER_PIX_TO_SHORT_sse2 64, 48
    FILTER_PIX_TO_SHORT_sse2 64, 64

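;-----------------------------------------------------------------------------
; FILTER_H8_W8: ssse3 8-tap horizontal filter; loads 16 bytes from %7, builds
; the sliding tap windows with the tab_Lm shuffles, then accumulates the 8-tap
; sums into %2 via pmaddubsw/phaddw. With the optional 8th arg, pmulhrsw by
; pw_512 computes (x * 512 + 16384) >> 15 = (x + 32) >> 6, i.e. the rounded pp
; downshift, before packing to bytes
;-----------------------------------------------------------------------------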
%macro FILTER_H8_W8 7-8   ; t0, t1, t2, t3, coef, c512, src, dst
    movu        %1, %7
    pshufb      %2, %1, [tab_Lm +  0]
    pmaddubsw   %2, %5
    pshufb      %3, %1, [tab_Lm + 16]
    pmaddubsw   %3, %5
    phaddw      %2, %3
    pshufb      %4, %1, [tab_Lm + 32]
    pmaddubsw   %4, %5
    pshufb      %1, %1, [tab_Lm + 48]
    pmaddubsw   %1, %5
    phaddw      %4, %1
    phaddw      %2, %4
  %if %0 == 8
    pmulhrsw    %2, %6
    packuswb    %2, %2
    movh        %8, %2
  %endif
%endmacro

;-----------------------------------------------------------------------------
; Interpolate HV
;-----------------------------------------------------------------------------
%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
    mova        %5, [r0 +  (%6 + 0) * 16]
    mova        %1, [r0 +  (%6 + 1) * 16]
    mova        %2, [r0 +  (%6 + 2) * 16]
    punpcklwd   %3, %5, %1
    punpckhwd   %5, %1
    pmaddwd     %3, [r5 + (%7) * 16]   ; R3 = L[0+1] -- Row 0
    pmaddwd     %5, [r5 + (%7) * 16]   ; R0 = H[0+1]
    punpcklwd   %4, %1, %2
    punpckhwd   %1, %2
    pmaddwd     %4, [r5 + (%7) * 16]   ; R4 = L[1+2] -- Row 1
    pmaddwd     %1, [r5 + (%7) * 16]   ; R1 = H[1+2]
%endmacro ; FILTER_HV8_START

%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
    mova        %8, [r0 +  (%9 + 0) * 16]
    mova        %1, [r0 +  (%9 + 1) * 16]
    punpcklwd   %7, %2, %8
    punpckhwd   %2, %8
    pmaddwd     %7, [r5 + %10 * 16]
    pmaddwd     %2, [r5 + %10 * 16]
    paddd       %3, %7              ; R3 = L[0+1+2+3] -- Row 0
    paddd       %5, %2              ; R0 = H[0+1+2+3]
    punpcklwd   %7, %8, %1
    punpckhwd   %8, %1
    pmaddwd     %7, [r5 + %10 * 16]
    pmaddwd     %8, [r5 + %10 * 16]
    paddd       %4, %7              ; R4 = L[1+2+3+4] -- Row 1
    paddd       %6, %8              ; R1 = H[1+2+3+4]
%endmacro ; FILTER_HV8_MID

; Round and Saturate
%macro FILTER_HV8_END 4 ; output in [1, 3]
    paddd       %1, [pd_526336]
    paddd       %2, [pd_526336]
    paddd       %3, [pd_526336]
    paddd       %4, [pd_526336]
    psrad       %1, 12
    psrad       %2, 12
    psrad       %3, 12
    psrad       %4, 12
    packssdw    %1, %2
    packssdw    %3, %4

    ; TODO: would a single merged pack be better? this ordering keeps the dependency chain short
    packuswb    %1, %3
%endmacro ; FILTER_HV8_END

;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
%define coef        m7
%define stk_buf     rsp

    mov         r4d,        r4m
    mov         r5d,        r5m

%ifdef PIC
    lea         r6,         [tab_LumaCoeff]
    movh        coef,       [r6 + r4 * 8]
%else
    movh        coef,       [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  coef,       coef

    ; move to row -3
    lea         r6,         [r1 + r1 * 2]
    sub         r0,         r6

    xor         r6,         r6
    mov         r4,         rsp

.loopH:
    FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
    psubw       m1,         [pw_2000]
    mova        [r4],       m1

    add         r0,         r1
    add         r4,         16
    inc         r6
    cmp         r6,         8+7
    jnz         .loopH

    ; ready for the vertical phase
    ; here all mN registers are free

    ; load coeff table
    shl         r5,         6
    lea         r6,         [tab_LumaCoeffV]
    lea         r5,         [r5 + r6]

    ; load intermediate buffer
    mov         r0,         stk_buf

    ; register mapping
    ; r0 - src
    ; r5 - coeff
    ; r6 - loop_i

    ; let's go
    xor         r6,         r6

    ; TODO: this loop has more than 70 instructions, which likely exceeds the Intel loop stream decoder cache
.loopV:

    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END      m3, m0, m4, m1

    movh        [r2],       m3
    movhps      [r2 + r3],  m3

    lea         r0,         [r0 + 16 * 2]
    lea         r2,         [r2 + r3 * 2]

    inc         r6
    cmp         r6,         8/2
    jnz         .loopV

    RET
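
; a reference sketch (illustrative C, not part of the build) of the two-pass
; flow above: 8 + 7 = 15 rows are filtered horizontally into a stack buffer of
; biased int16 intermediates, then filtered vertically with the combined
; rounding constant; filter8_h() and coefV[] are hypothetical names for the
; horizontal 8-tap sum and the selected tab_LumaCoeffV row:
;
;   int16_t tmp[15][8];                        // stk_buf, 16 bytes per row
;   for (int y = 0; y < 15; y++)
;       for (int x = 0; x < 8; x++)
;           tmp[y][x] = (int16_t)(filter8_h(src, y - 3, x) - 8192);
;   for (int y = 0; y < 8; y++)
;       for (int x = 0; x < 8; x++)
;       {
;           int sum = 0;
;           for (int t = 0; t < 8; t++)
;               sum += coefV[t] * tmp[y + t][x];
;           int v = (sum + 526336) >> 12;      // 8192*64 + 2048, see pd_526336
;           dst[y * dstStride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
;       }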

;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
    mov         r4d,        r4m
    mov         r5d,        r5m
    add         r4d,        r4d
    pxor        m6,         m6

%ifdef PIC
    lea         r6,         [tabw_LumaCoeff]
    mova        m3,         [r6 + r4 * 8]
%else
    mova        m3,         [tabw_LumaCoeff + r4 * 8]
%endif

    ; move to row -3
    lea         r6,         [r1 + r1 * 2]
    sub         r0,         r6

    mov         r4,         rsp

%assign x 0     ; needed for FILTER_H8_W8_sse2 macro
%assign y 1
%rep 15
    FILTER_H8_W8_sse2
    psubw       m1,         [pw_2000]
    mova        [r4],       m1

%if y < 15
    add         r0,         r1
    add         r4,         16
%endif
%assign y y+1
%endrep

    ; ready for the vertical phase
    ; here all mN registers are free

    ; load coeff table
    shl         r5,         6
    lea         r6,         [tab_LumaCoeffV]
    lea         r5,         [r5 + r6]

    ; load intermediate buffer
    mov         r0,         rsp

    ; register mapping
    ; r0 - src
    ; r5 - coeff

    ; let's go
%assign y 1
%rep 4
    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END      m3, m0, m4, m1

    movh        [r2],       m3
    movhps      [r2 + r3],  m3

%if y < 4
    lea         r0,         [r0 + 16 * 2]
    lea         r2,         [r2 + r3 * 2]
%endif
%assign y y+1
%endrep
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_2xN 1
INIT_XMM sse4
cglobal filterPixelToShort_2x%1, 3, 4, 3
    mov         r3d, r3m
    add         r3d, r3d

    ; load constant
    mova        m1, [pb_128]
    mova        m2, [tab_c_64_n64]

%rep %1/2
    movd        m0, [r0]
    pinsrd      m0, [r0 + r1], 1
    punpcklbw   m0, m1
    pmaddubsw   m0, m2

    movd        [r2 + r3 * 0], m0
    pextrd      [r2 + r3 * 1], m0, 2

    lea         r0, [r0 + r1 * 2]
    lea         r2, [r2 + r3 * 2]
%endrep
    RET
%endmacro
    P2S_H_2xN 4
    P2S_H_2xN 8
    P2S_H_2xN 16

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_4xN 1
INIT_XMM sse4
cglobal filterPixelToShort_4x%1, 3, 6, 4
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r3 * 3]
    lea         r5, [r1 * 3]

    ; load constant
    mova        m2, [pb_128]
    mova        m3, [tab_c_64_n64]

%assign x 0
%rep %1/4
    movd        m0, [r0]
    pinsrd      m0, [r0 + r1], 1
    punpcklbw   m0, m2
    pmaddubsw   m0, m3

    movd        m1, [r0 + r1 * 2]
    pinsrd      m1, [r0 + r5], 1
    punpcklbw   m1, m2
    pmaddubsw   m1, m3

    movq        [r2 + r3 * 0], m0
    movq        [r2 + r3 * 2], m1
    movhps      [r2 + r3 * 1], m0
    movhps      [r2 + r4], m1
%assign x x+1
%if (x != %1/4)
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endif
%endrep
    RET
%endmacro
    P2S_H_4xN 4
    P2S_H_4xN 8
    P2S_H_4xN 16
    P2S_H_4xN 32

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_6xN 1
INIT_XMM sse4
cglobal filterPixelToShort_6x%1, 3, 7, 6
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]

    ; load height
    mov         r6d, %1/4

    ; load constant
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loop:
    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r4]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movh        [r2 + r3 * 0], m0
    pextrd      [r2 + r3 * 0 + 8], m0, 2
    movh        [r2 + r3 * 1], m1
    pextrd      [r2 + r3 * 1 + 8], m1, 2
    movh        [r2 + r3 * 2], m2
    pextrd      [r2 + r3 * 2 + 8], m2, 2
    movh        [r2 + r5], m3
    pextrd      [r2 + r5 + 8], m3, 2

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET
%endmacro
    P2S_H_6xN 8
    P2S_H_6xN 16

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_8xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_8x%1, 3, 7, 6
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load height
    mov         r4d, %1/4

    ; load constant
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loop:
    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0], m0
    movu        [r2 + r3 * 1], m1
    movu        [r2 + r3 * 2], m2
    movu        [r2 + r6], m3

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r4d
    jnz         .loop
    RET
%endmacro
    P2S_H_8xN 8
    P2S_H_8xN 4
    P2S_H_8xN 16
    P2S_H_8xN 32
    P2S_H_8xN 12
    P2S_H_8xN 64

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal filterPixelToShort_8x6, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r1 * 3]
    lea         r5, [r1 * 5]
    lea         r6, [r3 * 3]

    ; load constant
    mova        m3, [pb_128]
    mova        m4, [tab_c_64_n64]

    movh        m0, [r0]
    punpcklbw   m0, m3
    pmaddubsw   m0, m4

    movh        m1, [r0 + r1]
    punpcklbw   m1, m3
    pmaddubsw   m1, m4

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m3
    pmaddubsw   m2, m4

    movu        [r2 + r3 * 0], m0
    movu        [r2 + r3 * 1], m1
    movu        [r2 + r3 * 2], m2

    movh        m0, [r0 + r4]
    punpcklbw   m0, m3
    pmaddubsw   m0, m4

    movh        m1, [r0 + r1 * 4]
    punpcklbw   m1, m3
    pmaddubsw   m1, m4

    movh        m2, [r0 + r5]
    punpcklbw   m2, m3
    pmaddubsw   m2, m4

    movu        [r2 + r6], m0
    movu        [r2 + r3 * 4], m1
    lea         r2, [r2 + r3 * 4]
    movu        [r2 + r3], m2

    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_16xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_16x%1, 3, 7, 6
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r3 * 3]
    lea         r5, [r1 * 3]

    ; load height
    mov         r6d, %1/4

    ; load constant
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loop:
    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0], m0
    movu        [r2 + r3 * 1], m1
    movu        [r2 + r3 * 2], m2
    movu        [r2 + r4], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 16], m0
    movu        [r2 + r3 * 1 + 16], m1
    movu        [r2 + r3 * 2 + 16], m2
    movu        [r2 + r4 + 16], m3

    lea         r0, [r0 + r1 * 4 - 8]
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET
%endmacro
    P2S_H_16xN 16
    P2S_H_16xN 4
    P2S_H_16xN 8
    P2S_H_16xN 12
    P2S_H_16xN 32
    P2S_H_16xN 64
    P2S_H_16xN 24

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal filterPixelToShort_16x4, 3, 4, 2
    mov             r3d, r3m
    add             r3d, r3d

    ; load constant
    vbroadcasti128  m1, [pw_2000]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    lea             r1, [r1 * 3]
    lea             r3, [r3 * 3]

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal filterPixelToShort_16x8, 3, 6, 2
    mov             r3d, r3m
    add             r3d, r3d
    lea             r4, [r1 * 3]
    lea             r5, [r3 * 3]

    ; load constant
    vbroadcasti128  m1, [pw_2000]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal filterPixelToShort_16x12, 3, 6, 2
    mov             r3d, r3m
    add             r3d, r3d
    lea             r4, [r1 * 3]
    lea             r5, [r3 * 3]

    ; load constant
    vbroadcasti128  m1, [pw_2000]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal filterPixelToShort_16x16, 3, 6, 2
    mov             r3d, r3m
    add             r3d, r3d
    lea             r4, [r1 * 3]
    lea             r5, [r3 * 3]

    ; load constant
    vbroadcasti128  m1, [pw_2000]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal filterPixelToShort_16x24, 3, 7, 2
    mov             r3d, r3m
    add             r3d, r3d
    lea             r4, [r1 * 3]
    lea             r5, [r3 * 3]
    mov             r6d, 3

    ; load constant
    vbroadcasti128  m1, [pw_2000]
.loop:
    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    dec             r6d
    jnz             .loop
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_16xN_avx2 1
INIT_YMM avx2
cglobal filterPixelToShort_16x%1, 3, 7, 2
    mov             r3d, r3m
    add             r3d, r3d
    lea             r4, [r1 * 3]
    lea             r5, [r3 * 3]
    mov             r6d, %1/16

    ; load constant
    vbroadcasti128  m1, [pw_2000]
.loop:
    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    pmovzxbw        m0, [r0]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2], m0

    pmovzxbw        m0, [r0 + r1]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3], m0

    pmovzxbw        m0, [r0 + r1 * 2]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r3 * 2], m0

    pmovzxbw        m0, [r0 + r4]
    psllw           m0, 6
    psubw           m0, m1
    movu            [r2 + r5], m0

    lea             r0, [r0 + r1 * 4]
    lea             r2, [r2 + r3 * 4]

    dec             r6d
    jnz             .loop
    RET
%endmacro
P2S_H_16xN_avx2 32
P2S_H_16xN_avx2 64

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_32xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_32x%1, 3, 7, 6
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r3 * 3]
    lea         r5, [r1 * 3]

    ; load height
    mov         r6d, %1/4

    ; load constant
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loop:
    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0], m0
    movu        [r2 + r3 * 1], m1
    movu        [r2 + r3 * 2], m2
    movu        [r2 + r4], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 16], m0
    movu        [r2 + r3 * 1 + 16], m1
    movu        [r2 + r3 * 2 + 16], m2
    movu        [r2 + r4 + 16], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 32], m0
    movu        [r2 + r3 * 1 + 32], m1
    movu        [r2 + r3 * 2 + 32], m2
    movu        [r2 + r4 + 32], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 48], m0
    movu        [r2 + r3 * 1 + 48], m1
    movu        [r2 + r3 * 2 + 48], m2
    movu        [r2 + r4 + 48], m3

    lea         r0, [r0 + r1 * 4 - 24]     ; next 4 rows; -24 undoes the three 8-byte column advances
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET
%endmacro
    P2S_H_32xN 32
    P2S_H_32xN 8
    P2S_H_32xN 16
    P2S_H_32xN 24
    P2S_H_32xN 64
    P2S_H_32xN 48

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
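; The AVX2 variants drop the pmaddubsw trick and compute the same value
; directly: pmovzxbw zero-extends 16 pixels to words, psllw shifts by 6,
; and psubw removes the pw_2000 (8192) bias, i.e. per pixel:
;
;   dst[x] = (src[x] << 6) - 8192;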
%macro P2S_H_32xN_avx2 1
INIT_YMM avx2
cglobal filterPixelToShort_32x%1, 3, 7, 3
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load height
    mov         r4d, %1/4

    ; load constant
    vpbroadcastd m2, [pw_2000]

.loop:
    pmovzxbw    m0, [r0 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + 1 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psubw       m0, m2
    psubw       m1, m2
    movu        [r2 + 0 * mmsize], m0
    movu        [r2 + 1 * mmsize], m1

    pmovzxbw    m0, [r0 + r1 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r1 + 1 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psubw       m0, m2
    psubw       m1, m2
    movu        [r2 + r3 + 0 * mmsize], m0
    movu        [r2 + r3 + 1 * mmsize], m1

    pmovzxbw    m0, [r0 + r1 * 2 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r1 * 2 + 1 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psubw       m0, m2
    psubw       m1, m2
    movu        [r2 + r3 * 2 + 0 * mmsize], m0
    movu        [r2 + r3 * 2 + 1 * mmsize], m1

    pmovzxbw    m0, [r0 + r5 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r5 + 1 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psubw       m0, m2
    psubw       m1, m2
    movu        [r2 + r6 + 0 * mmsize], m0
    movu        [r2 + r6 + 1 * mmsize], m1

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r4d
    jnz        .loop
    RET
%endmacro
    P2S_H_32xN_avx2 32
    P2S_H_32xN_avx2 8
    P2S_H_32xN_avx2 16
    P2S_H_32xN_avx2 24
    P2S_H_32xN_avx2 64
    P2S_H_32xN_avx2 48

;-----------------------------------------------------------------------------
;p2s and p2s_aligned 32xN avx512 code start
;-----------------------------------------------------------------------------

%macro PROCESS_P2S_32x4_AVX512 0
    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + r1]
    pmovzxbw    m2, [r0 + r1 * 2]
    pmovzxbw    m3, [r0 + r5]

    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4

    movu        [r2],           m0
    movu        [r2 + r3],      m1
    movu        [r2 + r3 * 2],  m2
    movu        [r2 + r6],      m3
%endmacro
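
; One PROCESS_P2S_32x4_AVX512 invocation converts a 32x4 block: with ZMM
; registers a full 32-pixel row fits one register as words.  The 32xN
; kernels below therefore expand to N/4 blocks with N/4 - 1 four-row
; stride advances in between, e.g. 32x8 unrolls to:
;
;   PROCESS_P2S_32x4_AVX512      ; rows 0..3
;   lea  r0, [r0 + r1 * 4]
;   lea  r2, [r2 + r3 * 4]
;   PROCESS_P2S_32x4_AVX512      ; rows 4..7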

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_32x8, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

    PROCESS_P2S_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_32x16, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 3
    PROCESS_P2S_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_32x24, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 5
    PROCESS_P2S_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_32x32, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 7
    PROCESS_P2S_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_32x48, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 11
    PROCESS_P2S_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_32x64, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 15
    PROCESS_P2S_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_32x4_AVX512
    RET
%endif

%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + r1]
    pmovzxbw    m2, [r0 + r1 * 2]
    pmovzxbw    m3, [r0 + r5]

    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4

    mova        [r2],           m0
    mova        [r2 + r3],      m1
    mova        [r2 + r3 * 2],  m2
    mova        [r2 + r6],      m3
%endmacro
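
; The _ALIGNED_ variant above is arithmetically identical to the plain one;
; the only difference is mova instead of movu for the stores, so its callers
; must guarantee 64-byte alignment of every destination row (in practice,
; both the dst pointer and the byte stride must be multiples of 64).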

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

    PROCESS_P2S_ALIGNED_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x16, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 3
    PROCESS_P2S_ALIGNED_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x24, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 5
    PROCESS_P2S_ALIGNED_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x32, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 7
    PROCESS_P2S_ALIGNED_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x48, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 11
    PROCESS_P2S_ALIGNED_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_32x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_32x64, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 15
    PROCESS_P2S_ALIGNED_32x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_32x4_AVX512
    RET
%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned 32xN avx512 code end
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_64xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_64x%1, 3, 7, 6
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r3 * 3]
    lea         r5, [r1 * 3]

    ; load height
    mov         r6d, %1/4

    ; load constant
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loop:
    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0], m0
    movu        [r2 + r3 * 1], m1
    movu        [r2 + r3 * 2], m2
    movu        [r2 + r4], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 16], m0
    movu        [r2 + r3 * 1 + 16], m1
    movu        [r2 + r3 * 2 + 16], m2
    movu        [r2 + r4 + 16], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 32], m0
    movu        [r2 + r3 * 1 + 32], m1
    movu        [r2 + r3 * 2 + 32], m2
    movu        [r2 + r4 + 32], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 48], m0
    movu        [r2 + r3 * 1 + 48], m1
    movu        [r2 + r3 * 2 + 48], m2
    movu        [r2 + r4 + 48], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 64], m0
    movu        [r2 + r3 * 1 + 64], m1
    movu        [r2 + r3 * 2 + 64], m2
    movu        [r2 + r4 + 64], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 80], m0
    movu        [r2 + r3 * 1 + 80], m1
    movu        [r2 + r3 * 2 + 80], m2
    movu        [r2 + r4 + 80], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 96], m0
    movu        [r2 + r3 * 1 + 96], m1
    movu        [r2 + r3 * 2 + 96], m2
    movu        [r2 + r4 + 96], m3

    lea         r0, [r0 + 8]

    movh        m0, [r0]
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r0 + r1]
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r0 + r1 * 2]
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    movh        m3, [r0 + r5]
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0 + 112], m0
    movu        [r2 + r3 * 1 + 112], m1
    movu        [r2 + r3 * 2 + 112], m2
    movu        [r2 + r4 + 112], m3

    lea         r0, [r0 + r1 * 4 - 56]     ; next 4 rows; -56 undoes the seven 8-byte column advances
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET
%endmacro
    P2S_H_64xN 64
    P2S_H_64xN 16
    P2S_H_64xN 32
    P2S_H_64xN 48

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_64xN_avx2 1
INIT_YMM avx2
cglobal filterPixelToShort_64x%1, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load height
    mov         r4d, %1/4

    ; load constant
    vpbroadcastd m4, [pw_2000]

.loop:
    pmovzxbw    m0, [r0 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + 2 * mmsize/2]
    pmovzxbw    m3, [r0 + 3 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4

    movu        [r2 + 0 * mmsize], m0
    movu        [r2 + 1 * mmsize], m1
    movu        [r2 + 2 * mmsize], m2
    movu        [r2 + 3 * mmsize], m3

    pmovzxbw    m0, [r0 + r1 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r1 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + r1 + 2 * mmsize/2]
    pmovzxbw    m3, [r0 + r1 + 3 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4

    movu        [r2 + r3 + 0 * mmsize], m0
    movu        [r2 + r3 + 1 * mmsize], m1
    movu        [r2 + r3 + 2 * mmsize], m2
    movu        [r2 + r3 + 3 * mmsize], m3

    pmovzxbw    m0, [r0 + r1 * 2 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r1 * 2 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + r1 * 2 + 2 * mmsize/2]
    pmovzxbw    m3, [r0 + r1 * 2 + 3 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4

    movu        [r2 + r3 * 2 + 0 * mmsize], m0
    movu        [r2 + r3 * 2 + 1 * mmsize], m1
    movu        [r2 + r3 * 2 + 2 * mmsize], m2
    movu        [r2 + r3 * 2 + 3 * mmsize], m3

    pmovzxbw    m0, [r0 + r5 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r5 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + r5 + 2 * mmsize/2]
    pmovzxbw    m3, [r0 + r5 + 3 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4

    movu        [r2 + r6 + 0 * mmsize], m0
    movu        [r2 + r6 + 1 * mmsize], m1
    movu        [r2 + r6 + 2 * mmsize], m2
    movu        [r2 + r6 + 3 * mmsize], m3

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r4d
    jnz        .loop
    RET
%endmacro
    P2S_H_64xN_avx2 64
    P2S_H_64xN_avx2 16
    P2S_H_64xN_avx2 32
    P2S_H_64xN_avx2 48

;-----------------------------------------------------------------------------
;p2s and p2s_aligned 64xN avx512 code start
;-----------------------------------------------------------------------------
%macro PROCESS_P2S_64x4_AVX512 0
    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + mmsize/2]
    pmovzxbw    m2, [r0 + r1]
    pmovzxbw    m3, [r0 + r1 + mmsize/2]

    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    movu        [r2], m0
    movu        [r2 + mmsize], m1
    movu        [r2 + r3], m2
    movu        [r2 + r3 + mmsize], m3

    pmovzxbw    m0, [r0 + r1 * 2]
    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
    pmovzxbw    m2, [r0 + r5]
    pmovzxbw    m3, [r0 + r5 + mmsize/2]

    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    movu        [r2 + r3 * 2], m0
    movu        [r2 + r3 * 2 + mmsize], m1
    movu        [r2 + r6], m2
    movu        [r2 + r6 + mmsize], m3
%endmacro

%macro PROCESS_P2S_ALIGNED_64x4_AVX512 0
    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + mmsize/2]
    pmovzxbw    m2, [r0 + r1]
    pmovzxbw    m3, [r0 + r1 + mmsize/2]

    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    mova        [r2], m0
    mova        [r2 + mmsize], m1
    mova        [r2 + r3], m2
    mova        [r2 + r3 + mmsize], m3

    pmovzxbw    m0, [r0 + r1 * 2]
    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
    pmovzxbw    m2, [r0 + r5]
    pmovzxbw    m3, [r0 + r5 + mmsize/2]

    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    mova        [r2 + r3 * 2], m0
    mova        [r2 + r3 * 2 + mmsize], m1
    mova        [r2 + r6], m2
    mova        [r2 + r6 + mmsize], m3
%endmacro
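
; A 64-pixel row no longer fits a single ZMM register as words, so the
; 64x4 macros split each row into two 32-pixel halves, addressed with
; mmsize/2 (32-byte) offsets on the source and mmsize (64-byte) offsets
; on the destination:
;
;   dst[x]      = (src[x]      << 6) - 8192;   // low half:  m0/m2
;   dst[x + 32] = (src[x + 32] << 6) - 8192;   // high half: m1/m3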
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_64x64, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 15
    PROCESS_P2S_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_64x48, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 11
    PROCESS_P2S_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_64x32, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 7
    PROCESS_P2S_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_64x16, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 3
    PROCESS_P2S_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_64x64, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 15
    PROCESS_P2S_ALIGNED_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_64x48, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 11
    PROCESS_P2S_ALIGNED_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_64x32, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 7
    PROCESS_P2S_ALIGNED_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_64x4_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_64x16, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd      m4, [pw_2000]

%rep 3
    PROCESS_P2S_ALIGNED_64x4_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
%endrep
    PROCESS_P2S_ALIGNED_64x4_AVX512
    RET
%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned 64xN avx512 code end
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_12xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_12x%1, 3, 7, 6
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r1 * 3]
    lea         r6, [r3 * 3]
    mov         r5d, %1/4

    ; load constant
    mova        m4, [pb_128]
    mova        m5, [tab_c_64_n64]

.loop:
    movu        m0, [r0]
    punpcklbw   m1, m0, m4
    punpckhbw   m0, m4
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5

    movu        m2, [r0 + r1]
    punpcklbw   m3, m2, m4
    punpckhbw   m2, m4
    pmaddubsw   m2, m5
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 0], m1
    movu        [r2 + r3 * 1], m3

    movh        [r2 + r3 * 0 + 16], m0
    movh        [r2 + r3 * 1 + 16], m2

    movu        m0, [r0 + r1 * 2]
    punpcklbw   m1, m0, m4
    punpckhbw   m0, m4
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5

    movu        m2, [r0 + r4]
    punpcklbw   m3, m2, m4
    punpckhbw   m2, m4
    pmaddubsw   m2, m5
    pmaddubsw   m3, m5

    movu        [r2 + r3 * 2], m1
    movu        [r2 + r6], m3

    movh        [r2 + r3 * 2 + 16], m0
    movh        [r2 + r6 + 16], m2

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r5d
    jnz         .loop
    RET
%endmacro
    P2S_H_12xN 16
    P2S_H_12xN 32

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_24xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_24x%1, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    mov         r6d, %1/4

    ; load constant
    mova        m3, [pb_128]
    mova        m4, [tab_c_64_n64]

.loop:
    movu        m0, [r0]
    punpcklbw   m1, m0, m3
    punpckhbw   m0, m3
    pmaddubsw   m0, m4
    pmaddubsw   m1, m4

    movu        m2, [r0 + 16]
    punpcklbw   m2, m3
    pmaddubsw   m2, m4

    movu        [r2 +  r3 * 0], m1
    movu        [r2 +  r3 * 0 + 16], m0
    movu        [r2 +  r3 * 0 + 32], m2

    movu        m0, [r0 + r1]
    punpcklbw   m1, m0, m3
    punpckhbw   m0, m3
    pmaddubsw   m0, m4
    pmaddubsw   m1, m4

    movu        m2, [r0 + r1 + 16]
    punpcklbw   m2, m3
    pmaddubsw   m2, m4

    movu        [r2 +  r3 * 1], m1
    movu        [r2 +  r3 * 1 + 16], m0
    movu        [r2 +  r3 * 1 + 32], m2

    movu        m0, [r0 + r1 * 2]
    punpcklbw   m1, m0, m3
    punpckhbw   m0, m3
    pmaddubsw   m0, m4
    pmaddubsw   m1, m4

    movu        m2, [r0 + r1 * 2 + 16]
    punpcklbw   m2, m3
    pmaddubsw   m2, m4

    movu        [r2 +  r3 * 2], m1
    movu        [r2 +  r3 * 2 + 16], m0
    movu        [r2 +  r3 * 2 + 32], m2

    movu        m0, [r0 + r4]
    punpcklbw   m1, m0, m3
    punpckhbw   m0, m3
    pmaddubsw   m0, m4
    pmaddubsw   m1, m4

    movu        m2, [r0 + r4 + 16]
    punpcklbw   m2, m3
    pmaddubsw   m2, m4
    movu        [r2 +  r5], m1
    movu        [r2 +  r5 + 16], m0
    movu        [r2 +  r5 + 32], m2

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET
%endmacro
    P2S_H_24xN 32
    P2S_H_24xN 64

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_24xN_avx2 1
INIT_YMM avx2
cglobal filterPixelToShort_24x%1, 3, 7, 4
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    mov         r6d, %1/4

    ; load constant
    vpbroadcastd m1, [pw_2000]
    vpbroadcastd m2, [pb_128]
    vpbroadcastd m3, [tab_c_64_n64]

.loop:
    pmovzxbw    m0, [r0]
    psllw       m0, 6
    psubw       m0, m1
    movu        [r2], m0

    movu        m0, [r0 + mmsize/2]
    punpcklbw   m0, m2
    pmaddubsw   m0, m3
    movu        [r2 +  r3 * 0 + mmsize], xm0

    pmovzxbw    m0, [r0 + r1]
    psllw       m0, 6
    psubw       m0, m1
    movu        [r2 + r3], m0

    movu        m0, [r0 + r1 + mmsize/2]
    punpcklbw   m0, m2
    pmaddubsw   m0, m3
    movu        [r2 +  r3 * 1 + mmsize], xm0

    pmovzxbw    m0, [r0 + r1 * 2]
    psllw       m0, 6
    psubw       m0, m1
    movu        [r2 + r3 * 2], m0

    movu        m0, [r0 + r1 * 2 + mmsize/2]
    punpcklbw   m0, m2
    pmaddubsw   m0, m3
    movu        [r2 +  r3 * 2 + mmsize], xm0

    pmovzxbw    m0, [r0 + r4]
    psllw       m0, 6
    psubw       m0, m1
    movu        [r2 + r5], m0

    movu        m0, [r0 + r4 + mmsize/2]
    punpcklbw   m0, m2
    pmaddubsw   m0, m3
    movu        [r2 + r5 + mmsize], xm0

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET
%endmacro
    P2S_H_24xN_avx2 32
    P2S_H_24xN_avx2 64

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal filterPixelToShort_48x64, 3, 7, 4
    mov         r3d, r3m
    add         r3d, r3d
    lea         r4, [r1 * 3]
    lea         r5, [r3 * 3]
    mov         r6d, 16

    ; load constant
    mova        m2, [pb_128]
    mova        m3, [tab_c_64_n64]

.loop:
    movu        m0, [r0]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 0], m1
    movu        [r2 +  r3 * 0 + 16], m0

    movu        m0, [r0 + 16]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 0 + 32], m1
    movu        [r2 +  r3 * 0 + 48], m0

    movu        m0, [r0 + 32]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 0 + 64], m1
    movu        [r2 +  r3 * 0 + 80], m0

    movu        m0, [r0 + r1]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 1], m1
    movu        [r2 +  r3 * 1 + 16], m0

    movu        m0, [r0 + r1 + 16]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 1 + 32], m1
    movu        [r2 +  r3 * 1 + 48], m0

    movu        m0, [r0 + r1 + 32]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 1 + 64], m1
    movu        [r2 +  r3 * 1 + 80], m0

    movu        m0, [r0 + r1 * 2]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 2], m1
    movu        [r2 +  r3 * 2 + 16], m0

    movu        m0, [r0 + r1 * 2 + 16]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 2 + 32], m1
    movu        [r2 +  r3 * 2 + 48], m0

    movu        m0, [r0 + r1 * 2 + 32]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r3 * 2 + 64], m1
    movu        [r2 +  r3 * 2 + 80], m0

    movu        m0, [r0 + r4]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r5], m1
    movu        [r2 +  r5 + 16], m0

    movu        m0, [r0 + r4 + 16]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r5 + 32], m1
    movu        [r2 +  r5 + 48], m0

    movu        m0, [r0 + r4 + 32]
    punpcklbw   m1, m0, m2
    punpckhbw   m0, m2
    pmaddubsw   m0, m3
    pmaddubsw   m1, m3

    movu        [r2 +  r5 + 64], m1
    movu        [r2 +  r5 + 80], m0

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r6d
    jnz         .loop
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal filterPixelToShort_48x64, 3, 7, 4
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load height
    mov         r4d, 64/4

    ; load constant
    vpbroadcastd m3, [pw_2000]

    ; keep the unroll factor at 1; that is the best choice for 48x64
.loop:
    pmovzxbw    m0, [r0 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + 2 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psubw       m0, m3
    psubw       m1, m3
    psubw       m2, m3
    movu        [r2 + 0 * mmsize], m0
    movu        [r2 + 1 * mmsize], m1
    movu        [r2 + 2 * mmsize], m2

    pmovzxbw    m0, [r0 + r1 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r1 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + r1 + 2 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psubw       m0, m3
    psubw       m1, m3
    psubw       m2, m3
    movu        [r2 + r3 + 0 * mmsize], m0
    movu        [r2 + r3 + 1 * mmsize], m1
    movu        [r2 + r3 + 2 * mmsize], m2

    pmovzxbw    m0, [r0 + r1 * 2 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r1 * 2 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + r1 * 2 + 2 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psubw       m0, m3
    psubw       m1, m3
    psubw       m2, m3
    movu        [r2 + r3 * 2 + 0 * mmsize], m0
    movu        [r2 + r3 * 2 + 1 * mmsize], m1
    movu        [r2 + r3 * 2 + 2 * mmsize], m2

    pmovzxbw    m0, [r0 + r5 + 0 * mmsize/2]
    pmovzxbw    m1, [r0 + r5 + 1 * mmsize/2]
    pmovzxbw    m2, [r0 + r5 + 2 * mmsize/2]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psubw       m0, m3
    psubw       m1, m3
    psubw       m2, m3
    movu        [r2 + r6 + 0 * mmsize], m0
    movu        [r2 + r6 + 1 * mmsize], m1
    movu        [r2 + r6 + 2 * mmsize], m2

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    dec         r4d
    jnz        .loop
    RET

;-----------------------------------------------------------------------------
;p2s and p2s_aligned 48xN avx512 code start
;-----------------------------------------------------------------------------
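; A 48-pixel row is handled as 32 + 16: the first 32 pixels use full ZMM
; registers, while the remaining 16 (byte offset 32 in src, 64 in dst) use
; the YMM forms, since pmovzxbw into a YMM register reads 16 bytes and
; widens them to 16 words.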
%macro PROCESS_P2S_48x8_AVX512 0
    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + r1]
    pmovzxbw    m2, [r0 + r1 * 2]
    pmovzxbw    m3, [r0 + r5]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    movu        [r2],           m0
    movu        [r2 + r3],      m1
    movu        [r2 + r3 * 2],  m2
    movu        [r2 + r6],      m3

    pmovzxbw    ym0, [r0 + 32]
    pmovzxbw    ym1, [r0 + r1 + 32]
    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
    pmovzxbw    ym3, [r0 + r5 + 32]
    psllw       ym0, 6
    psllw       ym1, 6
    psllw       ym2, 6
    psllw       ym3, 6
    psubw       ym0, ym4
    psubw       ym1, ym4
    psubw       ym2, ym4
    psubw       ym3, ym4
    movu        [r2 + 64],           ym0
    movu        [r2 + r3 + 64],      ym1
    movu        [r2 + r3 * 2 + 64],  ym2
    movu        [r2 + r6 + 64],      ym3

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + r1]
    pmovzxbw    m2, [r0 + r1 * 2]
    pmovzxbw    m3, [r0 + r5]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    movu        [r2],           m0
    movu        [r2 + r3],      m1
    movu        [r2 + r3 * 2],  m2
    movu        [r2 + r6],      m3

    pmovzxbw    ym0, [r0 + 32]
    pmovzxbw    ym1, [r0 + r1 + 32]
    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
    pmovzxbw    ym3, [r0 + r5 + 32]
    psllw       ym0, 6
    psllw       ym1, 6
    psllw       ym2, 6
    psllw       ym3, 6
    psubw       ym0, ym4
    psubw       ym1, ym4
    psubw       ym2, ym4
    psubw       ym3, ym4
    movu        [r2 + 64],           ym0
    movu        [r2 + r3 + 64],      ym1
    movu        [r2 + r3 * 2 + 64],  ym2
    movu        [r2 + r6 + 64],      ym3
%endmacro

%macro PROCESS_P2S_ALIGNED_48x8_AVX512 0
    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + r1]
    pmovzxbw    m2, [r0 + r1 * 2]
    pmovzxbw    m3, [r0 + r5]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    mova        [r2],           m0
    mova        [r2 + r3],      m1
    mova        [r2 + r3 * 2],  m2
    mova        [r2 + r6],      m3

    pmovzxbw    ym0, [r0 + 32]
    pmovzxbw    ym1, [r0 + r1 + 32]
    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
    pmovzxbw    ym3, [r0 + r5 + 32]
    psllw       ym0, 6
    psllw       ym1, 6
    psllw       ym2, 6
    psllw       ym3, 6
    psubw       ym0, ym4
    psubw       ym1, ym4
    psubw       ym2, ym4
    psubw       ym3, ym4
    mova        [r2 + 64],           ym0
    mova        [r2 + r3 + 64],      ym1
    mova        [r2 + r3 * 2 + 64],  ym2
    mova        [r2 + r6 + 64],      ym3

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    pmovzxbw    m0, [r0]
    pmovzxbw    m1, [r0 + r1]
    pmovzxbw    m2, [r0 + r1 * 2]
    pmovzxbw    m3, [r0 + r5]
    psllw       m0, 6
    psllw       m1, 6
    psllw       m2, 6
    psllw       m3, 6
    psubw       m0, m4
    psubw       m1, m4
    psubw       m2, m4
    psubw       m3, m4
    mova        [r2],           m0
    mova        [r2 + r3],      m1
    mova        [r2 + r3 * 2],  m2
    mova        [r2 + r6],      m3

    pmovzxbw    ym0, [r0 + 32]
    pmovzxbw    ym1, [r0 + r1 + 32]
    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
    pmovzxbw    ym3, [r0 + r5 + 32]
    psllw       ym0, 6
    psllw       ym1, 6
    psllw       ym2, 6
    psllw       ym3, 6
    psubw       ym0, ym4
    psubw       ym1, ym4
    psubw       ym2, ym4
    psubw       ym3, ym4
    mova        [r2 + 64],           ym0
    mova        [r2 + r3 + 64],      ym1
    mova        [r2 + r3 * 2 + 64],  ym2
    mova        [r2 + r6 + 64],      ym3
%endmacro
;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%if ARCH_X86_64
INIT_ZMM avx512
cglobal filterPixelToShort_48x64, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd m4, [pw_2000]

    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_48x8_AVX512
    RET

INIT_ZMM avx512
cglobal filterPixelToShort_aligned_48x64, 3, 7, 5
    mov         r3d, r3m
    add         r3d, r3d
    lea         r5, [r1 * 3]
    lea         r6, [r3 * 3]

    ; load constant
    vpbroadcastd m4, [pw_2000]

    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]
    PROCESS_P2S_ALIGNED_48x8_AVX512
    RET
%endif
;-----------------------------------------------------------------------------
;p2s and p2s_aligned 48xN avx512 code end
;-----------------------------------------------------------------------------

%macro PROCESS_LUMA_W4_4R 0
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                     ; m2=[0 1]

    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                         ; m1=[1 2]
    punpcklqdq  m2, m1                         ; m2=[0 1 1 2]
    pmaddubsw   m4, m2, [r6 + 0 * 16]          ; m4=[0+1 1+2]

    movd        m1, [r0 + r1]
    punpcklbw   m5, m0, m1                     ; m5=[2 3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                         ; m1=[3 4]
    punpcklqdq  m5, m1                         ; m5=[2 3 3 4]
    pmaddubsw   m2, m5, [r6 + 1 * 16]          ; m2=[2+3 3+4]
    paddw       m4, m2                         ; m4=[0+1+2+3 1+2+3+4]                   Row1-2
    pmaddubsw   m5, [r6 + 0 * 16]              ; m5=[2+3 3+4]                           Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                     ; m2=[4 5]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                         ; m1=[5 6]
    punpcklqdq  m2, m1                         ; m2=[4 5 5 6]
    pmaddubsw   m1, m2, [r6 + 2 * 16]          ; m1=[4+5 5+6]
    paddw       m4, m1                         ; m4=[0+1+2+3+4+5 1+2+3+4+5+6]           Row1-2
    pmaddubsw   m2, [r6 + 1 * 16]              ; m2=[4+5 5+6]
    paddw       m5, m2                         ; m5=[2+3+4+5 3+4+5+6]                   Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                     ; m2=[6 7]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                         ; m1=[7 8]
    punpcklqdq  m2, m1                         ; m2=[6 7 7 8]
    pmaddubsw   m1, m2, [r6 + 3 * 16]          ; m1=[6+7 7+8]
    paddw       m4, m1                         ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8]   Row1-2 end
    pmaddubsw   m2, [r6 + 2 * 16]              ; m2=[6+7 7+8]
    paddw       m5, m2                         ; m5=[2+3+4+5+6+7 3+4+5+6+7+8]           Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                     ; m2=[8 9]
    movd        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0                         ; m1=[9 10]
    punpcklqdq  m2, m1                         ; m2=[8 9 9 10]
    pmaddubsw   m2, [r6 + 3 * 16]              ; m2=[8+9 9+10]
    paddw       m5, m2                         ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10]  Row3-4 end
%endmacro
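
; Reference sketch (C, not assembled; names are illustrative): the W4_4R
; macro computes four rows of the 8-tap vertical luma filter, two taps at
; a time (punpcklbw pairs adjacent rows, pmaddubsw applies one coefficient
; pair from [r6]).  On entry r0 already points 3 rows above the target, so
; the values accumulated in m4 (rows 0-1) and m5 (rows 2-3) correspond to:
;
;   for (int row = 0; row < 4; row++)
;       for (int col = 0; col < 4; col++)
;       {
;           int sum = 0;
;           for (int k = 0; k < 8; k++)
;               sum += coeff[k] * src[(row + k) * srcStride + col];
;           // sum is rounded (pp) or biased (ps) by the caller
;       }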

%macro PROCESS_LUMA_W8_4R 0
    movq       m0, [r0]
    movq       m1, [r0 + r1]
    punpcklbw  m0, m1
    pmaddubsw  m7, m0, [r6 + 0 * 16]           ;m7=[0+1]               Row1

    lea        r0, [r0 + 2 * r1]
    movq       m0, [r0]
    punpcklbw  m1, m0
    pmaddubsw  m6, m1, [r6 + 0 * 16]           ;m6=[1+2]               Row2

    movq       m1, [r0 + r1]
    punpcklbw  m0, m1
    pmaddubsw  m5, m0, [r6 + 0 * 16]           ;m5=[2+3]               Row3
    pmaddubsw  m0, [r6 + 1 * 16]
    paddw      m7, m0                          ;m7=[0+1+2+3]           Row1

    lea        r0, [r0 + 2 * r1]
    movq       m0, [r0]
    punpcklbw  m1, m0
    pmaddubsw  m4, m1, [r6 + 0 * 16]           ;m4=[3+4]               Row4
    pmaddubsw  m1, [r6 + 1 * 16]
    paddw      m6, m1                          ;m6 = [1+2+3+4]         Row2

    movq       m1, [r0 + r1]
    punpcklbw  m0, m1
    pmaddubsw  m2, m0, [r6 + 1 * 16]
    pmaddubsw  m0, [r6 + 2 * 16]
    paddw      m7, m0                          ;m7=[0+1+2+3+4+5]       Row1
    paddw      m5, m2                          ;m5=[2+3+4+5]           Row3

    lea        r0, [r0 + 2 * r1]
    movq       m0, [r0]
    punpcklbw  m1, m0
    pmaddubsw  m2, m1, [r6 + 1 * 16]
    pmaddubsw  m1, [r6 + 2 * 16]
    paddw      m6, m1                          ;m6=[1+2+3+4+5+6]       Row2
    paddw      m4, m2                          ;m4=[3+4+5+6]           Row4

    movq       m1, [r0 + r1]
    punpcklbw  m0, m1
    pmaddubsw  m2, m0, [r6 + 2 * 16]
    pmaddubsw  m0, [r6 + 3 * 16]
    paddw      m7, m0                          ;m7=[0+1+2+3+4+5+6+7]   Row1 end
    paddw      m5, m2                          ;m5=[2+3+4+5+6+7]       Row3

    lea        r0, [r0 + 2 * r1]
    movq       m0, [r0]
    punpcklbw  m1, m0
    pmaddubsw  m2, m1, [r6 + 2 * 16]
    pmaddubsw  m1, [r6 + 3 * 16]
    paddw      m6, m1                          ;m6=[1+2+3+4+5+6+7+8]   Row2 end
    paddw      m4, m2                          ;m4=[3+4+5+6+7+8]       Row4

    movq       m1, [r0 + r1]
    punpcklbw  m0, m1
    pmaddubsw  m0, [r6 + 3 * 16]
    paddw      m5, m0                          ;m5=[2+3+4+5+6+7+8+9]   Row3 end

    movq       m0, [r0 + 2 * r1]
    punpcklbw  m1, m0
    pmaddubsw  m1, [r6 + 3 * 16]
    paddw      m4, m1                          ;m4=[3+4+5+6+7+8+9+10]  Row4 end
%endmacro
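
; PROCESS_LUMA_W8_4R is the 8-wide counterpart of the macro above: the same
; two-taps-per-step accumulation, but a row pair now fills a whole XMM
; register, so the four finished rows land in separate registers
; (m7, m6, m5, m4) instead of two rows per register.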

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_4xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
    lea       r5, [3 * r1]
    sub       r0, r5
    shl       r4d, 6
%ifidn %3,ps
    add       r3d, r3d
%endif

%ifdef PIC
    lea       r5, [tab_LumaCoeffVer]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova      m3, [pw_512]
%else
    mova      m3, [pw_2000]
%endif

    mov       r4d, %2/4
    lea       r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W4_4R

%ifidn %3,pp
    pmulhrsw  m4, m3
    pmulhrsw  m5, m3

    packuswb  m4, m5

    movd      [r2], m4
    pextrd    [r2 + r3], m4, 1
    lea       r2, [r2 + 2 * r3]
    pextrd    [r2], m4, 2
    pextrd    [r2 + r3], m4, 3
%else
    psubw     m4, m3
    psubw     m5, m3

    movlps    [r2], m4
    movhps    [r2 + r3], m4
    lea       r2, [r2 + 2 * r3]
    movlps    [r2], m5
    movhps    [r2 + r3], m5
%endif

    sub       r0, r5
    lea       r2, [r2 + 2 * r3]

    dec       r4d
    jnz       .loopH

    RET
%endmacro
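
; Finishing step shared by these vertical luma kernels: the raw filter sum
; is the pixel value scaled by 64 (the 8-tap coefficients sum to 64).  For
; pp output, pmulhrsw against pw_512 rounds and rescales in one instruction,
; since
;
;   (x * 512 + 0x4000) >> 15  ==  (x + 32) >> 6
;
; after which packuswb clamps to 8-bit pixels.  For ps output the sum stays
; at 16-bit intermediate precision and only the pw_2000 (8192) bias is
; subtracted.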


INIT_YMM avx2
cglobal interp_8tap_vert_pp_4x4, 4,6,8
    mov             r4d, r4m
    lea             r5, [r1 * 3]
    sub             r0, r5

    ; TODO: VPGATHERDD
    movd            xm1, [r0]                       ; m1 = row0
    movd            xm2, [r0 + r1]                  ; m2 = row1
    punpcklbw       xm1, xm2                        ; m1 = [13 03 12 02 11 01 10 00]

    movd            xm3, [r0 + r1 * 2]              ; m3 = row2
    punpcklbw       xm2, xm3                        ; m2 = [23 13 22 12 21 11 20 10]
    movd            xm4, [r0 + r5]
    punpcklbw       xm3, xm4                        ; m3 = [33 23 32 22 31 21 30 20]
    punpcklwd       xm1, xm3                        ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]

    lea             r0, [r0 + r1 * 4]
    movd            xm5, [r0]                       ; m5 = row4
    punpcklbw       xm4, xm5                        ; m4 = [43 33 42 32 41 31 40 30]
    punpcklwd       xm2, xm4                        ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
    vinserti128     m1, m1, xm2, 1                  ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
    movd            xm2, [r0 + r1]                  ; m2 = row5
    punpcklbw       xm5, xm2                        ; m5 = [53 43 52 42 51 41 50 40]
    punpcklwd       xm3, xm5                        ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
    movd            xm6, [r0 + r1 * 2]              ; m6 = row6
    punpcklbw       xm2, xm6                        ; m2 = [63 53 62 52 61 51 60 50]
    punpcklwd       xm4, xm2                        ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
    vinserti128     m3, m3, xm4, 1                  ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
    movd            xm4, [r0 + r5]                  ; m4 = row7
    punpcklbw       xm6, xm4                        ; m6 = [73 63 72 62 71 61 70 60]
    punpcklwd       xm5, xm6                        ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]

    lea             r0, [r0 + r1 * 4]
    movd            xm7, [r0]                       ; m7 = row8
    punpcklbw       xm4, xm7                        ; m4 = [83 73 82 72 81 71 80 70]
    punpcklwd       xm2, xm4                        ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
    vinserti128     m5, m5, xm2, 1                  ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
    movd            xm2, [r0 + r1]                  ; m2 = row9
    punpcklbw       xm7, xm2                        ; m7 = [93 83 92 82 91 81 90 80]
    punpcklwd       xm6, xm7                        ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
    movd            xm7, [r0 + r1 * 2]              ; m7 = rowA
    punpcklbw       xm2, xm7                        ; m2 = [A3 93 A2 92 A1 91 A0 90]
    punpcklwd       xm4, xm2                        ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
    vinserti128     m6, m6, xm4, 1                  ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]

    ; load filter coeff
%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 8 + 0]
    vpbroadcastd    m2, [r5 + r4 * 8 + 4]
%else
    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8 + 0]
    vpbroadcastd    m2, [tab_LumaCoeff + r4 * 8 + 4]
%endif

    pmaddubsw       m1, m0
    pmaddubsw       m3, m0
    pmaddubsw       m5, m2
    pmaddubsw       m6, m2
    vbroadcasti128  m0, [pw_1]
    pmaddwd         m1, m0
    pmaddwd         m3, m0
    pmaddwd         m5, m0
    pmaddwd         m6, m0
    paddd           m1, m5                          ; m1 = DQWORD ROW[1 0]
    paddd           m3, m6                          ; m3 = DQWORD ROW[3 2]
    packssdw        m1, m3                          ; m1 =  QWORD ROW[3 1 2 0]

    ; TODO: does it overflow?
    pmulhrsw        m1, [pw_512]
    vextracti128    xm2, m1, 1
    packuswb        xm1, xm2                        ; m1 =  DWORD ROW[3 1 2 0]
    movd            [r2], xm1
    pextrd          [r2 + r3], xm1, 2
    pextrd          [r2 + r3 * 2], xm1, 1
    lea             r4, [r3 * 3]
    pextrd          [r2 + r4], xm1, 3
    RET

INIT_YMM avx2
cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

    add             r3d, r3d

    movd            xm1, [r0]
    pinsrd          xm1, [r0 + r1], 1
    pinsrd          xm1, [r0 + r1 * 2], 2
    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    movd            xm2, [r0]
    pinsrd          xm2, [r0 + r1], 1
    pinsrd          xm2, [r0 + r1 * 2], 2
    pinsrd          xm2, [r0 + r4], 3                       ; m2 = row[7 6 5 4]
    vinserti128     m1, m1, xm2, 1                          ; m1 = row[7 6 5 4 3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    movd            xm3, [r0]
    pinsrd          xm3, [r0 + r1], 1
    pinsrd          xm3, [r0 + r1 * 2], 2                   ; m3 = row[x 10 9 8]
    vinserti128     m2, m2, xm3, 1                          ; m2 = row[x 10 9 8 7 6 5 4]
    mova            m3, [interp4_vpp_shuf1]
    vpermd          m0, m3, m1                              ; m0 = row[4 3 3 2 2 1 1 0]
    vpermd          m4, m3, m2                              ; m4 = row[8 7 7 6 6 5 5 4]
    mova            m3, [interp4_vpp_shuf1 + mmsize]
    vpermd          m1, m3, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
    vpermd          m2, m3, m2                              ; m2 = row[10 9 9 8 8 7 7 6]

    mova            m3, [interp4_vpp_shuf]
    pshufb          m0, m0, m3
    pshufb          m1, m1, m3
    pshufb          m4, m4, m3
    pshufb          m2, m2, m3
    pmaddubsw       m0, [r5]
    pmaddubsw       m1, [r5 + mmsize]
    pmaddubsw       m4, [r5 + 2 * mmsize]
    pmaddubsw       m2, [r5 + 3 * mmsize]
    paddw           m0, m1
    paddw           m0, m4
    paddw           m0, m2                                  ; m0 = WORD ROW[3 2 1 0]

    psubw           m0, [pw_2000]
    vextracti128    xm2, m0, 1
    lea             r5, [r3 * 3]
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r5], xm2
    RET

%macro FILTER_VER_LUMA_AVX2_4xN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
    lea             r6, [r1 * 4]
%ifidn %3,pp
    mova            m6, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m6, [pw_2000]
%endif
    lea             r8, [r3 * 3]
    mova            m5, [interp4_vpp_shuf]
    mova            m0, [interp4_vpp_shuf1]
    mova            m7, [interp4_vpp_shuf1 + mmsize]
    mov             r7d, %2 / 8
.loop:
    movd            xm1, [r0]
    pinsrd          xm1, [r0 + r1], 1
    pinsrd          xm1, [r0 + r1 * 2], 2
    pinsrd          xm1, [r0 + r4], 3                       ; m1 = row[3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    movd            xm2, [r0]
    pinsrd          xm2, [r0 + r1], 1
    pinsrd          xm2, [r0 + r1 * 2], 2
    pinsrd          xm2, [r0 + r4], 3                       ; m2 = row[7 6 5 4]
    vinserti128     m1, m1, xm2, 1                          ; m1 = row[7 6 5 4 3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    movd            xm3, [r0]
    pinsrd          xm3, [r0 + r1], 1
    pinsrd          xm3, [r0 + r1 * 2], 2
    pinsrd          xm3, [r0 + r4], 3                       ; m3 = row[11 10 9 8]
    vinserti128     m2, m2, xm3, 1                          ; m2 = row[11 10 9 8 7 6 5 4]
    lea             r0, [r0 + r1 * 4]
    movd            xm4, [r0]
    pinsrd          xm4, [r0 + r1], 1
    pinsrd          xm4, [r0 + r1 * 2], 2                   ; m4 = row[x 14 13 12]
    vinserti128     m3, m3, xm4, 1                          ; m3 = row[x 14 13 12 11 10 9 8]
    vpermd          m8, m0, m1                              ; m8 = row[4 3 3 2 2 1 1 0]
    vpermd          m4, m0, m2                              ; m4 = row[8 7 7 6 6 5 5 4]
    vpermd          m1, m7, m1                              ; m1 = row[6 5 5 4 4 3 3 2]
    vpermd          m2, m7, m2                              ; m2 = row[10 9 9 8 8 7 7 6]
    vpermd          m9, m0, m3                              ; m9 = row[12 11 11 10 10 9 9 8]
    vpermd          m3, m7, m3                              ; m3 = row[14 13 13 12 12 11 11 10]

    pshufb          m8, m8, m5
    pshufb          m1, m1, m5
    pshufb          m4, m4, m5
    pshufb          m9, m9, m5
    pshufb          m2, m2, m5
    pshufb          m3, m3, m5
    pmaddubsw       m8, [r5]
    pmaddubsw       m1, [r5 + mmsize]
    pmaddubsw       m9, [r5 + 2 * mmsize]
    pmaddubsw       m3, [r5 + 3 * mmsize]
    paddw           m8, m1
    paddw           m9, m3
    pmaddubsw       m1, m4, [r5 + 2 * mmsize]
    pmaddubsw       m3, m2, [r5 + 3 * mmsize]
    pmaddubsw       m4, [r5]
    pmaddubsw       m2, [r5 + mmsize]
    paddw           m3, m1
    paddw           m2, m4
    paddw           m8, m3                                  ; m8 = WORD ROW[3 2 1 0]
    paddw           m9, m2                                  ; m9 = WORD ROW[7 6 5 4]

%ifidn %3,pp
    pmulhrsw        m8, m6
    pmulhrsw        m9, m6
    packuswb        m8, m9
    vextracti128    xm1, m8, 1
    movd            [r2], xm8
    pextrd          [r2 + r3], xm8, 1
    movd            [r2 + r3 * 2], xm1
    pextrd          [r2 + r8], xm1, 1
    lea             r2, [r2 + r3 * 4]
    pextrd          [r2], xm8, 2
    pextrd          [r2 + r3], xm8, 3
    pextrd          [r2 + r3 * 2], xm1, 2
    pextrd          [r2 + r8], xm1, 3
%else
    psubw           m8, m6
    psubw           m9, m6
    vextracti128    xm1, m8, 1
    vextracti128    xm2, m9, 1
    movq            [r2], xm8
    movhps          [r2 + r3], xm8
    movq            [r2 + r3 * 2], xm1
    movhps          [r2 + r8], xm1
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm9
    movhps          [r2 + r3], xm9
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r8], xm2
%endif
    lea             r2, [r2 + r3 * 4]
    sub             r0, r6
    dec             r7d
    jnz             .loop
    RET
%endif
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_4xN 4, 4, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_4xN 4, 8, pp
    FILTER_VER_LUMA_AVX2_4xN 4, 8, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_4xN 4, 16, pp
    FILTER_VER_LUMA_AVX2_4xN 4, 16, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_4xN 4, 4, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_4xN 4, 8, ps
    FILTER_VER_LUMA_AVX2_4xN 4, 8, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_4xN 4, 16, ps
    FILTER_VER_LUMA_AVX2_4xN 4, 16, ps
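
;-------------------------------------------------------------------------------------------------------------
; For reference, a scalar sketch of the vertical 8-tap luma filter the macros
; below implement (illustrative C only, not the x265 source; the luma
; coefficient rows each sum to 64):
;
;   for (int y = 0; y < height; y++)
;       for (int x = 0; x < width; x++)
;       {
;           int sum = 0;
;           for (int k = 0; k < 8; k++)
;               sum += src[(y + k - 3) * srcStride + x] * coeff[k];
;           int v = (sum + 32) >> 6;                          // pp: round-shift by 6
;           dst[y * dstStride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           // the ps variant instead stores the 16-bit intermediate (sum - 8192)
;       }
;-------------------------------------------------------------------------------------------------------------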

%macro PROCESS_LUMA_AVX2_W8_8R 0
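    ; Vertically filter eight 8-pixel rows. On entry: r0 = src - 3 * srcStride,
    ; r1 = srcStride, r4 = srcStride * 3, r5 = coefficient table. Each ymm holds
    ; two interleaved row pairs, so one pmaddubsw accumulates two output rows.
    ; On exit r0 has advanced by 12 rows and the word-precision sums are:
    ; m5 = rows 0-1, m2 = rows 2-3, m1 = rows 4-5, m4 = rows 6-7.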
    movq            xm1, [r0]                       ; m1 = row 0
    movq            xm2, [r0 + r1]                  ; m2 = row 1
    punpcklbw       xm1, xm2                        ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
    punpcklbw       xm2, xm3                        ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
    vinserti128     m5, m1, xm2, 1                  ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    pmaddubsw       m5, [r5]
    movq            xm4, [r0 + r4]                  ; m4 = row 3
    punpcklbw       xm3, xm4                        ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    lea             r0, [r0 + r1 * 4]
    movq            xm1, [r0]                       ; m1 = row 4
    punpcklbw       xm4, xm1                        ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
    vinserti128     m2, m3, xm4, 1                  ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
    paddw           m5, m0
    pmaddubsw       m2, [r5]
    movq            xm3, [r0 + r1]                  ; m3 = row 5
    punpcklbw       xm1, xm3                        ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
    punpcklbw       xm3, xm4                        ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
    vinserti128     m1, m1, xm3, 1                  ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m5, m3
    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
    paddw           m2, m0
    pmaddubsw       m1, [r5]
    movq            xm3, [r0 + r4]                  ; m3 = row 7
    punpcklbw       xm4, xm3                        ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    lea             r0, [r0 + r1 * 4]
    movq            xm0, [r0]                       ; m0 = row 8
    punpcklbw       xm3, xm0                        ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
    vinserti128     m4, m4, xm3, 1                  ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
    paddw           m5, m3
    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
    paddw           m1, m3
    pmaddubsw       m4, [r5]
    movq            xm3, [r0 + r1]                  ; m3 = row 9
    punpcklbw       xm0, xm3                        ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    movq            xm6, [r0 + r1 * 2]              ; m6 = row 10
    punpcklbw       xm3, xm6                        ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
    vinserti128     m0, m0, xm3, 1                  ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m0, [r5 + 2 * mmsize]
    paddw           m1, m3
    pmaddubsw       m0, [r5 + 1 * mmsize]
    paddw           m4, m0

    movq            xm3, [r0 + r4]                  ; m3 = row 11
    punpcklbw       xm6, xm3                        ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
    lea             r0, [r0 + r1 * 4]
    movq            xm0, [r0]                       ; m0 = row 12
    punpcklbw       xm3, xm0                        ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
    vinserti128     m6, m6, xm3, 1                  ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
    pmaddubsw       m3, m6, [r5 + 3 * mmsize]
    paddw           m1, m3
    pmaddubsw       m6, [r5 + 2 * mmsize]
    paddw           m4, m6
    movq            xm3, [r0 + r1]                  ; m3 = row 13
    punpcklbw       xm0, xm3                        ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
    movq            xm6, [r0 + r1 * 2]              ; m6 = row 14
    punpcklbw       xm3, xm6                        ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
    vinserti128     m0, m0, xm3, 1                  ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
    pmaddubsw       m0, [r5 + 3 * mmsize]
    paddw           m4, m0
%endmacro

%macro PROCESS_LUMA_AVX2_W8_4R 0
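    ; Four-row variant of PROCESS_LUMA_AVX2_W8_8R: same entry registers; on exit
    ; r0 has advanced by 8 rows, m5 = word rows 0-1 and m2 = word rows 2-3.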
    movq            xm1, [r0]                       ; m1 = row 0
    movq            xm2, [r0 + r1]                  ; m2 = row 1
    punpcklbw       xm1, xm2                        ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
    punpcklbw       xm2, xm3                        ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
    vinserti128     m5, m1, xm2, 1                  ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    pmaddubsw       m5, [r5]
    movq            xm4, [r0 + r4]                  ; m4 = row 3
    punpcklbw       xm3, xm4                        ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    lea             r0, [r0 + r1 * 4]
    movq            xm1, [r0]                       ; m1 = row 4
    punpcklbw       xm4, xm1                        ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
    vinserti128     m2, m3, xm4, 1                  ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
    paddw           m5, m0
    pmaddubsw       m2, [r5]
    movq            xm3, [r0 + r1]                  ; m3 = row 5
    punpcklbw       xm1, xm3                        ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    movq            xm4, [r0 + r1 * 2]              ; m4 = row 6
    punpcklbw       xm3, xm4                        ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
    vinserti128     m1, m1, xm3, 1                  ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m5, m3
    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
    paddw           m2, m0
    movq            xm3, [r0 + r4]                  ; m3 = row 7
    punpcklbw       xm4, xm3                        ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    lea             r0, [r0 + r1 * 4]
    movq            xm0, [r0]                       ; m0 = row 8
    punpcklbw       xm3, xm0                        ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
    vinserti128     m4, m4, xm3, 1                  ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
    paddw           m5, m3
    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
    paddw           m2, m3
    movq            xm3, [r0 + r1]                  ; m3 = row 9
    punpcklbw       xm0, xm3                        ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    movq            xm6, [r0 + r1 * 2]              ; m6 = row 10
    punpcklbw       xm3, xm6                        ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
    vinserti128     m0, m0, xm3, 1                  ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
    paddw           m2, m3
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_8xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea       r5, [3 * r1]
    sub       r0, r5
    shl       r4d, 6

%ifidn %3,ps
    add       r3d, r3d
%endif

%ifdef PIC
    lea       r5, [tab_LumaCoeffVer]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova      m3, [pw_512]
%else
    mova      m3, [pw_2000]
%endif

    mov       r4d, %2/4
    lea       r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W8_4R

%ifidn %3,pp
    pmulhrsw  m7, m3
    pmulhrsw  m6, m3
    pmulhrsw  m5, m3
    pmulhrsw  m4, m3

    packuswb  m7, m6
    packuswb  m5, m4

    movlps    [r2], m7
    movhps    [r2 + r3], m7
    lea       r2, [r2 + 2 * r3]
    movlps    [r2], m5
    movhps    [r2 + r3], m5
%else
    psubw     m7, m3
    psubw     m6, m3
    psubw     m5, m3
    psubw     m4, m3

    movu      [r2], m7
    movu      [r2 + r3], m6
    lea       r2, [r2 + 2 * r3]
    movu      [r2], m5
    movu      [r2 + r3], m4
%endif

    sub       r0, r5
    lea       r2, [r2 + 2 * r3]

    dec       r4d
    jnz       .loopH

    RET
%endmacro

%macro FILTER_VER_LUMA_AVX2_8xN 3
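; AVX2 version of interp_8tap_vert_%3_%1x%2: eight rows per .loop iteration,
; with the row-group counter kept on the stack (0-gprsize) rather than in a GPR.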
INIT_YMM avx2
cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4
    lea             r6, [r1 * 4]
%ifidn %3,pp
    mova            m7, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m7, [pw_2000]
%endif
    mov             word [rsp], %2 / 8

.loop:
    PROCESS_LUMA_AVX2_W8_8R
%ifidn %3,pp
    pmulhrsw        m5, m7                          ; m5 = word: row 0, row 1
    pmulhrsw        m2, m7                          ; m2 = word: row 2, row 3
    pmulhrsw        m1, m7                          ; m1 = word: row 4, row 5
    pmulhrsw        m4, m7                          ; m4 = word: row 6, row 7
    packuswb        m5, m2
    packuswb        m1, m4
    vextracti128    xm2, m5, 1
    vextracti128    xm4, m1, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    lea             r2, [r2 + r3 * 2]
    movhps          [r2], xm5
    movhps          [r2 + r3], xm2
    lea             r2, [r2 + r3 * 2]
    movq            [r2], xm1
    movq            [r2 + r3], xm4
    lea             r2, [r2 + r3 * 2]
    movhps          [r2], xm1
    movhps          [r2 + r3], xm4
%else
    psubw           m5, m7                          ; m5 = word: row 0, row 1
    psubw           m2, m7                          ; m2 = word: row 2, row 3
    psubw           m1, m7                          ; m1 = word: row 4, row 5
    psubw           m4, m7                          ; m4 = word: row 6, row 7
    vextracti128    xm6, m5, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm0, m1, 1
    movu            [r2], xm5
    movu            [r2 + r3], xm6
    lea             r2, [r2 + r3 * 2]
    movu            [r2], xm2
    movu            [r2 + r3], xm3
    lea             r2, [r2 + r3 * 2]
    movu            [r2], xm1
    movu            [r2 + r3], xm0
    lea             r2, [r2 + r3 * 2]
    movu            [r2], xm4
    vextracti128    xm4, m4, 1
    movu            [r2 + r3], xm4
%endif
    lea             r2, [r2 + r3 * 2]
    sub             r0, r6
    dec             word [rsp]
    jnz             .loop
    RET
%endmacro

%macro FILTER_VER_LUMA_AVX2_8x8 1
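; AVX2 interp_8tap_vert_%1_8x8: a single unrolled PROCESS_LUMA_AVX2_W8_8R pass,
; so no loop counter or stack slot is needed.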
INIT_YMM avx2
cglobal interp_8tap_vert_%1_8x8, 4, 6, 7
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
    PROCESS_LUMA_AVX2_W8_8R
%ifidn %1,pp
    mova            m3, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m3, [pw_2000]
%endif
    lea             r4, [r3 * 3]
%ifidn %1,pp
    pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
    pmulhrsw        m1, m3                          ; m1 = word: row 4, row 5
    pmulhrsw        m4, m3                          ; m4 = word: row 6, row 7
    packuswb        m5, m2
    packuswb        m1, m4
    vextracti128    xm2, m5, 1
    vextracti128    xm4, m1, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm5
    movhps          [r2 + r4], xm2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm1
    movq            [r2 + r3], xm4
    movhps          [r2 + r3 * 2], xm1
    movhps          [r2 + r4], xm4
%else
    psubw           m5, m3                          ; m5 = word: row 0, row 1
    psubw           m2, m3                          ; m2 = word: row 2, row 3
    psubw           m1, m3                          ; m1 = word: row 4, row 5
    psubw           m4, m3                          ; m4 = word: row 6, row 7
    vextracti128    xm6, m5, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm0, m1, 1
    movu            [r2], xm5
    movu            [r2 + r3], xm6
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r4], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm1
    movu            [r2 + r3], xm0
    movu            [r2 + r3 * 2], xm4
    vextracti128    xm4, m4, 1
    movu            [r2 + r4], xm4
%endif
    RET
%endmacro

%macro FILTER_VER_LUMA_AVX2_8x4 1
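; AVX2 interp_8tap_vert_%1_8x4: one PROCESS_LUMA_AVX2_W8_4R pass.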
INIT_YMM avx2
cglobal interp_8tap_vert_%1_8x4, 4, 6, 7
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
    PROCESS_LUMA_AVX2_W8_4R
%ifidn %1,pp
    mova            m3, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m3, [pw_2000]
%endif
    lea             r4, [r3 * 3]
%ifidn %1,pp
    pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
    packuswb        m5, m2
    vextracti128    xm2, m5, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm5
    movhps          [r2 + r4], xm2
%else
    psubw           m5, m3                          ; m5 = word: row 0, row 1
    psubw           m2, m3                          ; m2 = word: row 2, row 3
    movu            [r2], xm5
    vextracti128    xm5, m5, 1
    movu            [r2 + r3], xm5
    movu            [r2 + r3 * 2], xm2
    vextracti128    xm2, m2, 1
    movu            [r2 + r4], xm2
%endif
    RET
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 4, pp
    FILTER_VER_LUMA_AVX2_8x4 pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 8, pp
    FILTER_VER_LUMA_AVX2_8x8 pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 16, pp
    FILTER_VER_LUMA_AVX2_8xN 8, 16, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 32, pp
    FILTER_VER_LUMA_AVX2_8xN 8, 32, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 4, ps
    FILTER_VER_LUMA_AVX2_8x4 ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 8, ps
    FILTER_VER_LUMA_AVX2_8x8 ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 16, ps
    FILTER_VER_LUMA_AVX2_8xN 8, 16, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_8xN 8, 32, ps
    FILTER_VER_LUMA_AVX2_8xN 8, 32, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_12xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea       r5, [3 * r1]
    sub       r0, r5
    shl       r4d, 6
%ifidn %3,ps
    add       r3d, r3d
%endif

%ifdef PIC
    lea       r5, [tab_LumaCoeffVer]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova      m3, [pw_512]
%else
    mova      m3, [pw_2000]
%endif

    mov       r4d, %2/4

.loopH:
    PROCESS_LUMA_W8_4R

%ifidn %3,pp
    pmulhrsw  m7, m3
    pmulhrsw  m6, m3
    pmulhrsw  m5, m3
    pmulhrsw  m4, m3

    packuswb  m7, m6
    packuswb  m5, m4

    movlps    [r2], m7
    movhps    [r2 + r3], m7
    lea       r5, [r2 + 2 * r3]
    movlps    [r5], m5
    movhps    [r5 + r3], m5
%else
    psubw     m7, m3
    psubw     m6, m3
    psubw     m5, m3
    psubw     m4, m3

    movu      [r2], m7
    movu      [r2 + r3], m6
    lea       r5, [r2 + 2 * r3]
    movu      [r5], m5
    movu      [r5 + r3], m4
%endif

    lea       r5, [8 * r1 - 8]
    sub       r0, r5                  ; rewind to the first of these four rows and step 8 pixels right
%ifidn %3,pp
    add       r2, 8                   ; pp dst columns are bytes...
%else
    add       r2, 16                  ; ...ps dst columns are 16-bit words
%endif

    PROCESS_LUMA_W4_4R

%ifidn %3,pp
    pmulhrsw  m4, m3
    pmulhrsw  m5, m3

    packuswb  m4, m5

    movd      [r2], m4
    pextrd    [r2 + r3], m4, 1
    lea       r5, [r2 + 2 * r3]
    pextrd    [r5], m4, 2
    pextrd    [r5 + r3], m4, 3
%else
    psubw     m4, m3
    psubw     m5, m3

    movlps    [r2], m4
    movhps    [r2 + r3], m4
    lea       r5, [r2 + 2 * r3]
    movlps    [r5], m5
    movhps    [r5 + r3], m5
%endif

    lea       r5, [4 * r1 + 8]
    sub       r0, r5                  ; undo the 8-pixel offset and rewind for the next four rows
%ifidn %3,pp
    lea       r2, [r2 + 4 * r3 - 8]
%else
    lea       r2, [r2 + 4 * r3 - 16]
%endif

    dec       r4d
    jnz       .loopH

    RET
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_12xN 12, 16, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_12xN 12, 16, ps

%macro FILTER_VER_LUMA_AVX2_12x16 1
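; AVX2 interp_8tap_vert_%1_12x16 (x86-64 only; uses xmm8-xmm14): rows are loaded
; 16 pixels wide, and the stores are trimmed to the 12 valid pixels per row.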
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_12x16, 4, 7, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,pp
    mova            m14, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%endif
    lea             r6, [r3 * 3]

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

%ifidn %1,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movq            [r2], xm0
    pextrd          [r2 + 8], xm0, 2
    movq            [r2 + r3], xm1
    pextrd          [r2 + r3 + 8], xm1, 2
    movq            [r2 + r3 * 2], xm2
    pextrd          [r2 + r3 * 2 + 8], xm2, 2
    movq            [r2 + r6], xm3
    pextrd          [r2 + r6 + 8], xm3, 2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm4
    pextrd          [r2 + 8], xm4, 2
    movq            [r2 + r3], xm5
    pextrd          [r2 + r3 + 8], xm5, 2
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], xm0
    vextracti128    xm0, m0, 1
    movq            [r2 + 16], xm0
    movu            [r2 + r3], xm1
    vextracti128    xm1, m1, 1
    movq            [r2 + r3 + 16], xm1
    movu            [r2 + r3 * 2], xm2
    vextracti128    xm2, m2, 1
    movq            [r2 + r3 * 2 + 16], xm2
    movu            [r2 + r6], xm3
    vextracti128    xm3, m3, 1
    movq            [r2 + r6 + 16], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    vextracti128    xm4, m4, 1
    movq            [r2 + 16], xm4
    movu            [r2 + r3], xm5
    vextracti128    xm5, m5, 1
    movq            [r2 + r3 + 16], xm5
%endif

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    pmaddubsw       m12, [r5]
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1
    pmaddubsw       m13, [r5]

%ifidn %1,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movq            [r2 + r3 * 2], xm6
    pextrd          [r2 + r3 * 2 + 8], xm6, 2
    movq            [r2 + r6], xm7
    pextrd          [r2 + r6 + 8], xm7, 2
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r2 + r3 * 2], xm6
    vextracti128    xm6, m6, 1
    movq            [r2 + r3 * 2 + 16], xm6
    movu            [r2 + r6], xm7
    vextracti128    xm7, m7, 1
    movq            [r2 + r6 + 16], xm7
%endif
    lea             r2, [r2 + r3 * 4]

    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
    paddw           m12, m2
    pmaddubsw       m0, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
    paddw           m13, m3
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
    paddw           m12, m4
    pmaddubsw       m2, [r5 + 1 * mmsize]
    paddw           m0, m2
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5
    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
    paddw           m13, m5
    pmaddubsw       m3, [r5 + 1 * mmsize]
    paddw           m1, m3
    movu            xm5, [r0 + r4]                  ; m5 = row 19
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
    paddw           m12, m6
    pmaddubsw       m4, [r5 + 2 * mmsize]
    paddw           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm6, [r0]                       ; m6 = row 20
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
    paddw           m13, m7
    pmaddubsw       m5, [r5 + 2 * mmsize]
    paddw           m1, m5
    movu            xm7, [r0 + r1]                  ; m7 = row 21
    punpckhbw       xm2, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddubsw       m6, [r5 + 3 * mmsize]
    paddw           m0, m6
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
    punpckhbw       xm3, xm7, xm2
    punpcklbw       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m7, [r5 + 3 * mmsize]
    paddw           m1, m7

%ifidn %1,pp
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    pmulhrsw        m12, m14                        ; m12 = word: row 12
    pmulhrsw        m13, m14                        ; m13 = word: row 13
    pmulhrsw        m0, m14                         ; m0 = word: row 14
    pmulhrsw        m1, m14                         ; m1 = word: row 15
    packuswb        m8, m9
    packuswb        m10, m11
    packuswb        m12, m13
    packuswb        m0, m1
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movq            [r2], xm8
    pextrd          [r2 + 8], xm8, 2
    movq            [r2 + r3], xm9
    pextrd          [r2 + r3 + 8], xm9, 2
    movq            [r2 + r3 * 2], xm10
    pextrd          [r2 + r3 * 2 + 8], xm10, 2
    movq            [r2 + r6], xm11
    pextrd          [r2 + r6 + 8], xm11, 2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm12
    pextrd          [r2 + 8], xm12, 2
    movq            [r2 + r3], xm13
    pextrd          [r2 + r3 + 8], xm13, 2
    movq            [r2 + r3 * 2], xm0
    pextrd          [r2 + r3 * 2 + 8], xm0, 2
    movq            [r2 + r6], xm1
    pextrd          [r2 + r6 + 8], xm1, 2
%else
    psubw           m8, m14                         ; m8 = word: row 8
    psubw           m9, m14                         ; m9 = word: row 9
    psubw           m10, m14                        ; m10 = word: row 10
    psubw           m11, m14                        ; m11 = word: row 11
    psubw           m12, m14                        ; m12 = word: row 12
    psubw           m13, m14                        ; m13 = word: row 13
    psubw           m0, m14                         ; m0 = word: row 14
    psubw           m1, m14                         ; m1 = word: row 15
    movu            [r2], xm8
    vextracti128    xm8, m8, 1
    movq            [r2 + 16], xm8
    movu            [r2 + r3], xm9
    vextracti128    xm9, m9, 1
    movq            [r2 + r3 + 16], xm9
    movu            [r2 + r3 * 2], xm10
    vextracti128    xm10, m10, 1
    movq            [r2 + r3 * 2 + 16], xm10
    movu            [r2 + r6], xm11
    vextracti128    xm11, m11, 1
    movq            [r2 + r6 + 16], xm11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm12
    vextracti128    xm12, m12, 1
    movq            [r2 + 16], xm12
    movu            [r2 + r3], xm13
    vextracti128    xm13, m13, 1
    movq            [r2 + r3 + 16], xm13
    movu            [r2 + r3 * 2], xm0
    vextracti128    xm0, m0, 1
    movq            [r2 + r3 * 2 + 16], xm0
    movu            [r2 + r6], xm1
    vextracti128    xm1, m1, 1
    movq            [r2 + r6 + 16], xm1
%endif
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_12x16 pp
    FILTER_VER_LUMA_AVX2_12x16 ps

%macro FILTER_VER_LUMA_AVX2_16x16 1
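; AVX2 interp_8tap_vert_%1_16x16, fully unrolled (x86-64 only; uses xmm8-xmm14).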
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_16x16, 4, 7, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,pp
    mova            m14, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%endif
    lea             r6, [r3 * 3]

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

%ifidn %1,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    movu            [r2 + r3], xm5
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    movu            [r2 + r6], m3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], m4
    movu            [r2 + r3], m5
%endif

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    pmaddubsw       m12, [r5]
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1
    pmaddubsw       m13, [r5]

%ifidn %1,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r6], xm7
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r2 + r3 * 2], m6
    movu            [r2 + r6], m7
%endif
    lea             r2, [r2 + r3 * 4]

    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
    paddw           m12, m2
    pmaddubsw       m0, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
    paddw           m13, m3
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
    paddw           m12, m4
    pmaddubsw       m2, [r5 + 1 * mmsize]
    paddw           m0, m2
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5
    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
    paddw           m13, m5
    pmaddubsw       m3, [r5 + 1 * mmsize]
    paddw           m1, m3
    movu            xm5, [r0 + r4]                  ; m5 = row 19
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
    paddw           m12, m6
    pmaddubsw       m4, [r5 + 2 * mmsize]
    paddw           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm6, [r0]                       ; m6 = row 20
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
    paddw           m13, m7
    pmaddubsw       m5, [r5 + 2 * mmsize]
    paddw           m1, m5
    movu            xm7, [r0 + r1]                  ; m7 = row 21
    punpckhbw       xm2, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddubsw       m6, [r5 + 3 * mmsize]
    paddw           m0, m6
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
    punpckhbw       xm3, xm7, xm2
    punpcklbw       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m7, [r5 + 3 * mmsize]
    paddw           m1, m7

%ifidn %1,pp
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    pmulhrsw        m12, m14                        ; m12 = word: row 12
    pmulhrsw        m13, m14                        ; m13 = word: row 13
    pmulhrsw        m0, m14                         ; m0 = word: row 14
    pmulhrsw        m1, m14                         ; m1 = word: row 15
    packuswb        m8, m9
    packuswb        m10, m11
    packuswb        m12, m13
    packuswb        m0, m1
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movu            [r2], xm8
    movu            [r2 + r3], xm9
    movu            [r2 + r3 * 2], xm10
    movu            [r2 + r6], xm11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm12
    movu            [r2 + r3], xm13
    movu            [r2 + r3 * 2], xm0
    movu            [r2 + r6], xm1
%else
    psubw           m8, m14                         ; m8 = word: row 8
    psubw           m9, m14                         ; m9 = word: row 9
    psubw           m10, m14                        ; m10 = word: row 10
    psubw           m11, m14                        ; m11 = word: row 11
    psubw           m12, m14                        ; m12 = word: row 12
    psubw           m13, m14                        ; m13 = word: row 13
    psubw           m0, m14                         ; m0 = word: row 14
    psubw           m1, m14                         ; m1 = word: row 15
    movu            [r2], m8
    movu            [r2 + r3], m9
    movu            [r2 + r3 * 2], m10
    movu            [r2 + r6], m11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], m12
    movu            [r2 + r3], m13
    movu            [r2 + r3 * 2], m0
    movu            [r2 + r6], m1
%endif
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_16x16 pp
    FILTER_VER_LUMA_AVX2_16x16 ps

%macro FILTER_VER_LUMA_AVX2_16x12 1
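; AVX2 interp_8tap_vert_%1_16x12, fully unrolled (x86-64 only; uses xmm8-xmm14).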
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_16x12, 4, 7, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,pp
    mova            m14, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%endif
    lea             r6, [r3 * 3]

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

%ifidn %1,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    movu            [r2 + r3], xm5
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    movu            [r2 + r6], m3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], m4
    movu            [r2 + r3], m5
%endif

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1

%ifidn %1,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r6], xm7
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r2 + r3 * 2], m6
    movu            [r2 + r6], m7
%endif
    lea             r2, [r2 + r3 * 4]

    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5

%ifidn %1,pp
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    packuswb        m8, m9
    packuswb        m10, m11
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    movu            [r2], xm8
    movu            [r2 + r3], xm9
    movu            [r2 + r3 * 2], xm10
    movu            [r2 + r6], xm11
%else
    psubw           m8, m14                         ; m8 = word: row 8
    psubw           m9, m14                         ; m9 = word: row 9
    psubw           m10, m14                        ; m10 = word: row 10
    psubw           m11, m14                        ; m11 = word: row 11
    movu            [r2], m8
    movu            [r2 + r3], m9
    movu            [r2 + r3 * 2], m10
    movu            [r2 + r6], m11
%endif
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_16x12 pp
    FILTER_VER_LUMA_AVX2_16x12 ps

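;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_16x8(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps_16x8(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx)
; Fully unrolled AVX2 vertical 8-tap luma filter for one 16x8 block. coeffIdx selects one of the four
; 128-byte coefficient sets in tab_LumaCoeffVer_32 (hence the shl by 7); r4 is reused as 3 * dstStride
; for the final stores.
;-------------------------------------------------------------------------------------------------------------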
%macro FILTER_VER_LUMA_AVX2_16x8 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_16x8, 4, 6, 15
    mov             r4d, r4m
    shl             r4d, 7
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,pp
    mova            m14, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%endif
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    lea             r4, [r3 * 3]
%ifidn %1,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r4], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    movu            [r2 + r3], xm5
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    movu            [r2 + r4], m3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], m4
    movu            [r2 + r3], m5
%endif
    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
%ifidn %1,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r4], xm7
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r2 + r3 * 2], m6
    movu            [r2 + r4], m7
%endif
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_16x8 pp
    FILTER_VER_LUMA_AVX2_16x8 ps

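;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_16x4(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps_16x4(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx)
; 16x4 variant of the unrolled vertical filter; only 13 ymm registers are needed since just four output
; rows are accumulated.
;-------------------------------------------------------------------------------------------------------------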
%macro FILTER_VER_LUMA_AVX2_16x4 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_16x4, 4, 6, 13
    mov             r4d, r4m
    shl             r4d, 7
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,pp
    mova            m12, [pw_512]
%else
    add             r3d, r3d
    vbroadcasti128  m12, [pw_2000]
%endif
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
%ifidn %1,pp
    pmulhrsw        m0, m12                         ; m0 = word: row 0
    pmulhrsw        m1, m12                         ; m1 = word: row 1
    pmulhrsw        m2, m12                         ; m2 = word: row 2
    pmulhrsw        m3, m12                         ; m3 = word: row 3
    packuswb        m0, m1
    packuswb        m2, m3
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    lea             r4, [r3 * 3]
    movu            [r2 + r4], xm3
%else
    psubw           m0, m12                         ; m0 = word: row 0
    psubw           m1, m12                         ; m1 = word: row 1
    psubw           m2, m12                         ; m2 = word: row 2
    psubw           m3, m12                         ; m3 = word: row 3
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    lea             r4, [r3 * 3]
    movu            [r2 + r4], m3
%endif
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_16x4 pp
    FILTER_VER_LUMA_AVX2_16x4 ps
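
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_16xN(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps_16xN(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx)
; N = 32, 64. Looped variant: each pass of .loop produces 16 output rows; r7 caches 4 * srcStride so the
; source pointer can be rewound at the end of each band, and r8d counts the height/16 bands.
;-------------------------------------------------------------------------------------------------------------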
%macro FILTER_VER_LUMA_AVX2_16xN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %3,ps
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%else
    mova            m14, [pw_512]
%endif
    lea             r6, [r3 * 3]
    lea             r7, [r1 * 4]
    mov             r8d, %2 / 16

.loop:
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

%ifidn %3,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    movu            [r2 + r3], xm5
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    movu            [r2 + r6], m3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], m4
    movu            [r2 + r3], m5
%endif

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    pmaddubsw       m12, [r5]
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1
    pmaddubsw       m13, [r5]

%ifidn %3,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r6], xm7
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r2 + r3 * 2], m6
    movu            [r2 + r6], m7
%endif

    lea             r2, [r2 + r3 * 4]

    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
    paddw           m12, m2
    pmaddubsw       m0, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
    paddw           m13, m3
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
    paddw           m12, m4
    pmaddubsw       m2, [r5 + 1 * mmsize]
    paddw           m0, m2
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5
    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
    paddw           m13, m5
    pmaddubsw       m3, [r5 + 1 * mmsize]
    paddw           m1, m3
    movu            xm5, [r0 + r4]                  ; m5 = row 19
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
    paddw           m12, m6
    pmaddubsw       m4, [r5 + 2 * mmsize]
    paddw           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm6, [r0]                       ; m6 = row 20
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
    paddw           m13, m7
    pmaddubsw       m5, [r5 + 2 * mmsize]
    paddw           m1, m5
    movu            xm7, [r0 + r1]                  ; m7 = row 21
    punpckhbw       xm2, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddubsw       m6, [r5 + 3 * mmsize]
    paddw           m0, m6
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
    punpckhbw       xm3, xm7, xm2
    punpcklbw       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m7, [r5 + 3 * mmsize]
    paddw           m1, m7

%ifidn %3,pp
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    pmulhrsw        m12, m14                        ; m12 = word: row 12
    pmulhrsw        m13, m14                        ; m13 = word: row 13
    pmulhrsw        m0, m14                         ; m0 = word: row 14
    pmulhrsw        m1, m14                         ; m1 = word: row 15
    packuswb        m8, m9
    packuswb        m10, m11
    packuswb        m12, m13
    packuswb        m0, m1
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movu            [r2], xm8
    movu            [r2 + r3], xm9
    movu            [r2 + r3 * 2], xm10
    movu            [r2 + r6], xm11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm12
    movu            [r2 + r3], xm13
    movu            [r2 + r3 * 2], xm0
    movu            [r2 + r6], xm1
%else
    psubw           m8, m14                         ; m8 = word: row 8
    psubw           m9, m14                         ; m9 = word: row 9
    psubw           m10, m14                        ; m10 = word: row 10
    psubw           m11, m14                        ; m11 = word: row 11
    psubw           m12, m14                        ; m12 = word: row 12
    psubw           m13, m14                        ; m13 = word: row 13
    psubw           m0, m14                         ; m0 = word: row 14
    psubw           m1, m14                         ; m1 = word: row 15
    movu            [r2], m8
    movu            [r2 + r3], m9
    movu            [r2 + r3 * 2], m10
    movu            [r2 + r6], m11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], m12
    movu            [r2 + r3], m13
    movu            [r2 + r3 * 2], m0
    movu            [r2 + r6], m1
%endif

    lea             r2, [r2 + r3 * 4]
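    ; r0 advanced 20 source rows while filtering this band; step back 4 rows (r7 = srcStride * 4)
    ; so the next iteration starts at output row 16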
    sub             r0, r7
    dec             r8d
    jnz             .loop
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_16xN 16, 32, pp
    FILTER_VER_LUMA_AVX2_16xN 16, 64, pp
    FILTER_VER_LUMA_AVX2_16xN 16, 32, ps
    FILTER_VER_LUMA_AVX2_16xN 16, 64, ps

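; PROCESS_LUMA_AVX2_W16_16R: filter one 16-wide, 16-row tile (%1 = pp/ps).
; Expects r0 = src (3 rows above the first output row), r2 = dst, r4 = 3 * srcStride, r6 = 3 * dstStride,
; r5 = coefficient set and m14 = pw_512 (pp) / pw_2000 (ps). The source is walked through r7 and the
; destination through r8, leaving r0/r2 untouched for the caller's tiling loop.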
%macro PROCESS_LUMA_AVX2_W16_16R 1
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r7, [r0 + r1 * 4]
    movu            xm4, [r7]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r7 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r7 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm8, [r7]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r7 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r7 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm12, [r7]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

%ifidn %1,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r8, [r2 + r3 * 4]
    movu            [r8], xm4
    movu            [r8 + r3], xm5
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    movu            [r2 + r6], m3
    lea             r8, [r2 + r3 * 4]
    movu            [r8], m4
    movu            [r8 + r3], m5
%endif

    movu            xm13, [r7 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    pmaddubsw       m12, [r5]
    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1
    pmaddubsw       m13, [r5]

%ifidn %1,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm7
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r8 + r3 * 2], m6
    movu            [r8 + r6], m7
%endif

    lea             r8, [r8 + r3 * 4]

    movu            xm1, [r7 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
    paddw           m12, m2
    pmaddubsw       m0, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm2, [r7]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
    paddw           m13, m3
    pmaddubsw       m1, [r5]
    movu            xm3, [r7 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
    paddw           m12, m4
    pmaddubsw       m2, [r5 + 1 * mmsize]
    paddw           m0, m2
    movu            xm4, [r7 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5
    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
    paddw           m13, m5
    pmaddubsw       m3, [r5 + 1 * mmsize]
    paddw           m1, m3
    movu            xm5, [r7 + r4]                  ; m5 = row 19
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
    paddw           m12, m6
    pmaddubsw       m4, [r5 + 2 * mmsize]
    paddw           m0, m4
    lea             r7, [r7 + r1 * 4]
    movu            xm6, [r7]                       ; m6 = row 20
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
    paddw           m13, m7
    pmaddubsw       m5, [r5 + 2 * mmsize]
    paddw           m1, m5
    movu            xm7, [r7 + r1]                  ; m7 = row 21
    punpckhbw       xm2, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddubsw       m6, [r5 + 3 * mmsize]
    paddw           m0, m6
    movu            xm2, [r7 + r1 * 2]              ; m2 = row 22
    punpckhbw       xm3, xm7, xm2
    punpcklbw       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m7, [r5 + 3 * mmsize]
    paddw           m1, m7

%ifidn %1,pp
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    pmulhrsw        m12, m14                        ; m12 = word: row 12
    pmulhrsw        m13, m14                        ; m13 = word: row 13
    pmulhrsw        m0, m14                         ; m0 = word: row 14
    pmulhrsw        m1, m14                         ; m1 = word: row 15
    packuswb        m8, m9
    packuswb        m10, m11
    packuswb        m12, m13
    packuswb        m0, m1
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movu            [r8], xm8
    movu            [r8 + r3], xm9
    movu            [r8 + r3 * 2], xm10
    movu            [r8 + r6], xm11
    lea             r8, [r8 + r3 * 4]
    movu            [r8], xm12
    movu            [r8 + r3], xm13
    movu            [r8 + r3 * 2], xm0
    movu            [r8 + r6], xm1
%else
    psubw           m8, m14                         ; m8 = word: row 8
    psubw           m9, m14                         ; m9 = word: row 9
    psubw           m10, m14                        ; m10 = word: row 10
    psubw           m11, m14                        ; m11 = word: row 11
    psubw           m12, m14                        ; m12 = word: row 12
    psubw           m13, m14                        ; m13 = word: row 13
    psubw           m0, m14                         ; m0 = word: row 14
    psubw           m1, m14                         ; m1 = word: row 15
    movu            [r8], m8
    movu            [r8 + r3], m9
    movu            [r8 + r3 * 2], m10
    movu            [r8 + r6], m11
    lea             r8, [r8 + r3 * 4]
    movu            [r8], m12
    movu            [r8 + r3], m13
    movu            [r8 + r3 * 2], m0
    movu            [r8 + r6], m1
%endif
%endmacro

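; PROCESS_LUMA_AVX2_W16_8R: 8-row version of the tile above, with the same register contract
; (r7/r8 row cursors, r0/r2 preserved for the caller).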
%macro PROCESS_LUMA_AVX2_W16_8R 1
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r7, [r0 + r1 * 4]
    movu            xm4, [r7]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r7 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r7 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm8, [r7]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r7 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    movu            xm11, [r7 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    lea             r7, [r7 + r1 * 4]
    movu            xm12, [r7]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13

%ifidn %1,pp
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r8, [r2 + r3 * 4]
    movu            [r8], xm4
    movu            [r8 + r3], xm5
%else
    psubw           m0, m14                         ; m0 = word: row 0
    psubw           m1, m14                         ; m1 = word: row 1
    psubw           m2, m14                         ; m2 = word: row 2
    psubw           m3, m14                         ; m3 = word: row 3
    psubw           m4, m14                         ; m4 = word: row 4
    psubw           m5, m14                         ; m5 = word: row 5
    movu            [r2], m0
    movu            [r2 + r3], m1
    movu            [r2 + r3 * 2], m2
    movu            [r2 + r6], m3
    lea             r8, [r2 + r3 * 4]
    movu            [r8], m4
    movu            [r8 + r3], m5
%endif

    movu            xm13, [r7 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1

%ifidn %1,pp
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm7
%else
    psubw           m6, m14                         ; m6 = word: row 6
    psubw           m7, m14                         ; m7 = word: row 7
    movu            [r8 + r3 * 2], m6
    movu            [r8 + r6], m7
%endif
%endmacro

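;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_24x32(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps_24x32(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx)
; Two 16-row bands (r9d = 2); each band runs PROCESS_LUMA_AVX2_W16_16R over the left 16 columns, then an
; 8-wide movq path over the remaining 8 columns.
;-------------------------------------------------------------------------------------------------------------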
%macro FILTER_VER_LUMA_AVX2_24x32 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_24x32, 4, 11, 15
    mov             r4d, r4m
    shl             r4d, 7
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,ps
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%else
    mova            m14, [pw_512]
%endif
    lea             r6, [r3 * 3]
    lea             r10, [r1 * 4]
    mov             r9d, 2
.loopH:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
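    ; remaining 8 of the 24 columns: movq loads 8 pixels per row and adjacent rows are paired into
    ; one ymm register, so each multiply-accumulate below produces two output rows at once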
    movq            xm1, [r0]                       ; m1 = row 0
    movq            xm2, [r0 + r1]                  ; m2 = row 1
    punpcklbw       xm1, xm2
    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
    punpcklbw       xm2, xm3
    vinserti128     m5, m1, xm2, 1
    pmaddubsw       m5, [r5]
    movq            xm4, [r0 + r4]                  ; m4 = row 3
    punpcklbw       xm3, xm4
    lea             r7, [r0 + r1 * 4]
    movq            xm1, [r7]                       ; m1 = row 4
    punpcklbw       xm4, xm1
    vinserti128     m2, m3, xm4, 1
    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
    paddw           m5, m0
    pmaddubsw       m2, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 5
    punpcklbw       xm1, xm3
    movq            xm4, [r7 + r1 * 2]              ; m4 = row 6
    punpcklbw       xm3, xm4
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m5, m3
    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
    paddw           m2, m0
    pmaddubsw       m1, [r5]
    movq            xm3, [r7 + r4]                  ; m3 = row 7
    punpcklbw       xm4, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm0, [r7]                       ; m0 = row 8
    punpcklbw       xm3, xm0
    vinserti128     m4, m4, xm3, 1
    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
    paddw           m5, m3
    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
    paddw           m1, m3
    pmaddubsw       m4, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 9
    punpcklbw       xm0, xm3
    movq            xm6, [r7 + r1 * 2]              ; m6 = row 10
    punpcklbw       xm3, xm6
    vinserti128     m0, m0, xm3, 1
    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m0, [r5 + 2 * mmsize]
    paddw           m1, m3
    pmaddubsw       m3, m0, [r5 + 1 * mmsize]
    paddw           m4, m3
    pmaddubsw       m0, [r5]

    movq            xm3, [r7 + r4]                  ; m3 = row 11
    punpcklbw       xm6, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm7, [r7]                       ; m7 = row 12
    punpcklbw       xm3, xm7
    vinserti128     m6, m6, xm3, 1
    pmaddubsw       m3, m6, [r5 + 3 * mmsize]
    paddw           m1, m3
    pmaddubsw       m3, m6, [r5 + 2 * mmsize]
    paddw           m4, m3
    pmaddubsw       m3, m6, [r5 + 1 * mmsize]
    paddw           m0, m3
    pmaddubsw       m6, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 13
    punpcklbw       xm7, xm3
    movq            xm8, [r7 + r1 * 2]              ; m8 = row 14
    punpcklbw       xm3, xm8
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m3, m7, [r5 + 3 * mmsize]
    paddw           m4, m3
    pmaddubsw       m3, m7, [r5 + 2 * mmsize]
    paddw           m0, m3
    pmaddubsw       m3, m7, [r5 + 1 * mmsize]
    paddw           m6, m3
    pmaddubsw       m7, [r5]
    movq            xm3, [r7 + r4]                  ; m3 = row 15
    punpcklbw       xm8, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm9, [r7]                       ; m9 = row 16
    punpcklbw       xm3, xm9
    vinserti128     m8, m8, xm3, 1
    pmaddubsw       m3, m8, [r5 + 3 * mmsize]
    paddw           m0, m3
    pmaddubsw       m3, m8, [r5 + 2 * mmsize]
    paddw           m6, m3
    pmaddubsw       m3, m8, [r5 + 1 * mmsize]
    paddw           m7, m3
    pmaddubsw       m8, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 17
    punpcklbw       xm9, xm3
    movq            xm10, [r7 + r1 * 2]             ; m10 = row 18
    punpcklbw       xm3, xm10
    vinserti128     m9, m9, xm3, 1
    pmaddubsw       m3, m9, [r5 + 3 * mmsize]
    paddw           m6, m3
    pmaddubsw       m3, m9, [r5 + 2 * mmsize]
    paddw           m7, m3
    pmaddubsw       m3, m9, [r5 + 1 * mmsize]
    paddw           m8, m3
    movq            xm3, [r7 + r4]                  ; m3 = row 19
    punpcklbw       xm10, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm9, [r7]                       ; m9 = row 20
    punpcklbw       xm3, xm9
    vinserti128     m10, m10, xm3, 1
    pmaddubsw       m3, m10, [r5 + 3 * mmsize]
    paddw           m7, m3
    pmaddubsw       m3, m10, [r5 + 2 * mmsize]
    paddw           m8, m3
    movq            xm3, [r7 + r1]                  ; m3 = row 21
    punpcklbw       xm9, xm3
    movq            xm10, [r7 + r1 * 2]             ; m10 = row 22
    punpcklbw       xm3, xm10
    vinserti128     m9, m9, xm3, 1
    pmaddubsw       m3, m9, [r5 + 3 * mmsize]
    paddw           m8, m3
%ifidn %1,pp
    pmulhrsw        m5, m14                         ; m5 = word: row 0, row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2, row 3
    pmulhrsw        m1, m14                         ; m1 = word: row 4, row 5
    pmulhrsw        m4, m14                         ; m4 = word: row 6, row 7
    pmulhrsw        m0, m14                         ; m0 = word: row 8, row 9
    pmulhrsw        m6, m14                         ; m6 = word: row 10, row 11
    pmulhrsw        m7, m14                         ; m7 = word: row 12, row 13
    pmulhrsw        m8, m14                         ; m8 = word: row 14, row 15
    packuswb        m5, m2
    packuswb        m1, m4
    packuswb        m0, m6
    packuswb        m7, m8
    vextracti128    xm2, m5, 1
    vextracti128    xm4, m1, 1
    vextracti128    xm6, m0, 1
    vextracti128    xm8, m7, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm5
    movhps          [r2 + r6], xm2
    lea             r8, [r2 + r3 * 4]
    movq            [r8], xm1
    movq            [r8 + r3], xm4
    movhps          [r8 + r3 * 2], xm1
    movhps          [r8 + r6], xm4
    lea             r8, [r8 + r3 * 4]
    movq            [r8], xm0
    movq            [r8 + r3], xm6
    movhps          [r8 + r3 * 2], xm0
    movhps          [r8 + r6], xm6
    lea             r8, [r8 + r3 * 4]
    movq            [r8], xm7
    movq            [r8 + r3], xm8
    movhps          [r8 + r3 * 2], xm7
    movhps          [r8 + r6], xm8
%else
    psubw           m5, m14                         ; m5 = word: row 0, row 1
    psubw           m2, m14                         ; m2 = word: row 2, row 3
    psubw           m1, m14                         ; m1 = word: row 4, row 5
    psubw           m4, m14                         ; m4 = word: row 6, row 7
    psubw           m0, m14                         ; m0 = word: row 8, row 9
    psubw           m6, m14                         ; m6 = word: row 10, row 11
    psubw           m7, m14                         ; m7 = word: row 12, row 13
    psubw           m8, m14                         ; m8 = word: row 14, row 15
    vextracti128    xm3, m5, 1
    movu            [r2], xm5
    movu            [r2 + r3], xm3
    vextracti128    xm3, m2, 1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    vextracti128    xm3, m1, 1
    lea             r8, [r2 + r3 * 4]
    movu            [r8], xm1
    movu            [r8 + r3], xm3
    vextracti128    xm3, m4, 1
    movu            [r8 + r3 * 2], xm4
    movu            [r8 + r6], xm3
    vextracti128    xm3, m0, 1
    lea             r8, [r8 + r3 * 4]
    movu            [r8], xm0
    movu            [r8 + r3], xm3
    vextracti128    xm3, m6, 1
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm3
    vextracti128    xm3, m7, 1
    lea             r8, [r8 + r3 * 4]
    movu            [r8], xm7
    movu            [r8 + r3], xm3
    vextracti128    xm3, m8, 1
    movu            [r8 + r3 * 2], xm8
    movu            [r8 + r6], xm3
%endif
    sub             r7, r10
    lea             r0, [r7 - 16]
%ifidn %1,pp
    lea             r2, [r8 + r3 * 4 - 16]
%else
    lea             r2, [r8 + r3 * 4 - 32]
%endif
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_24x32 pp
    FILTER_VER_LUMA_AVX2_24x32 ps
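
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Tiling sketch (informative): the %1x%2 block is processed as 16x16 tiles;
; r10d counts 16-column strips and r9d counts 16-row strips. pp writes one
; byte per pixel (dst advances 16 per strip) while ps writes int16 (dst
; advances 32). PROCESS_LUMA_AVX2_W16_16R (defined earlier) is assumed to
; leave r7/r8 pointing into the last processed src/dst rows, which the
; epilogue rewinds to start the next row of tiles.
;-------------------------------------------------------------------------------------------------------------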

%macro FILTER_VER_LUMA_AVX2_32xN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %3,ps
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%else
    mova            m14, [pw_512]
%endif
    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]
    mov             r9d, %2 / 16
.loopH:
    mov             r10d, %1 / 16
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %3
%ifidn %3,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11
    lea             r0, [r7 - 16]
%ifidn %3,pp
    lea             r2, [r8 + r3 * 4 - 16]
%else
    lea             r2, [r8 + r3 * 4 - 32]
%endif
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_32xN 32, 32, pp
    FILTER_VER_LUMA_AVX2_32xN 32, 64, pp
    FILTER_VER_LUMA_AVX2_32xN 32, 32, ps
    FILTER_VER_LUMA_AVX2_32xN 32, 64, ps

%macro FILTER_VER_LUMA_AVX2_32x16 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_32x16, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,ps
    add             r3d, r3d
    vbroadcasti128  m14, [pw_2000]
%else
    mova            m14, [pw_512]
%endif
    lea             r6, [r3 * 3]
    mov             r9d, 2
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_32x16 pp
    FILTER_VER_LUMA_AVX2_32x16 ps

%macro FILTER_VER_LUMA_AVX2_32x24 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_32x24, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,ps
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
%ifidn %1,pp
    mova            m14, [pw_512]
%else
    vbroadcasti128  m14, [pw_2000]
%endif
    mov             r9d, 2
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    lea             r9, [r1 * 4]
    sub             r7, r9
    lea             r0, [r7 - 16]
%ifidn %1,pp
    lea             r2, [r8 + r3 * 4 - 16]
%else
    lea             r2, [r8 + r3 * 4 - 32]
%endif
    mov             r9d, 2
.loop:
    PROCESS_LUMA_AVX2_W16_8R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r9d
    jnz             .loop
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_32x24 pp
    FILTER_VER_LUMA_AVX2_32x24 ps

%macro FILTER_VER_LUMA_AVX2_32x8 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_32x8, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,ps
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
%ifidn %1,pp
    mova            m14, [pw_512]
%else
    vbroadcasti128  m14, [pw_2000]
%endif
    mov             r9d, 2
.loopW:
    PROCESS_LUMA_AVX2_W16_8R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_32x8 pp
    FILTER_VER_LUMA_AVX2_32x8 ps

%macro FILTER_VER_LUMA_AVX2_48x64 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_48x64, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %1,ps
    add             r3d, r3d
%endif

    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]

%ifidn %1,pp
    mova            m14, [pw_512]
%else
    vbroadcasti128  m14, [pw_2000]
%endif

    mov             r9d, 4
.loopH:
    mov             r10d, 3
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11
    lea             r0, [r7 - 32]
%ifidn %1,pp
    lea             r2, [r8 + r3 * 4 - 32]
%else
    lea             r2, [r8 + r3 * 4 - 64]
%endif
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_48x64 pp
    FILTER_VER_LUMA_AVX2_48x64 ps

%macro FILTER_VER_LUMA_AVX2_64xN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %3,ps
    add             r3d, r3d
%endif

    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]

%ifidn %3,pp
    mova            m14, [pw_512]
%else
    vbroadcasti128  m14, [pw_2000]
%endif

    mov             r9d, %2 / 16
.loopH:
    mov             r10d, %1 / 16
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %3
%ifidn %3,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11
    lea             r0, [r7 - 48]
%ifidn %3,pp
    lea             r2, [r8 + r3 * 4 - 48]
%else
    lea             r2, [r8 + r3 * 4 - 96]
%endif
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_64xN 64, 32, pp
    FILTER_VER_LUMA_AVX2_64xN 64, 48, pp
    FILTER_VER_LUMA_AVX2_64xN 64, 64, pp
    FILTER_VER_LUMA_AVX2_64xN 64, 32, ps
    FILTER_VER_LUMA_AVX2_64xN 64, 48, ps
    FILTER_VER_LUMA_AVX2_64xN 64, 64, ps

%macro FILTER_VER_LUMA_AVX2_64x16 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_64x16, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %1,ps
    add             r3d, r3d
%endif

    lea             r6, [r3 * 3]

%ifidn %1,pp
    mova            m14, [pw_512]
%else
    vbroadcasti128  m14, [pw_2000]
%endif

    mov             r9d, 4
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add             r2, 16
%else
    add             r2, 32
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_64x16 pp
    FILTER_VER_LUMA_AVX2_64x16 ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
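; Rounding sketch (informative): the pp path scales with pmulhrsw by pw_512,
; which computes (sum * 512 + 2^14) >> 15 == (sum + 32) >> 6, then clips to
; [0, 255] with packuswb. The ps path keeps the full 6-bit-scaled sum and
; subtracts pw_2000 (= 8192), so the int16 intermediates carry the usual
; -8192 offset expected by the later sp/ss stages.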
%macro FILTER_VER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8, 0-gprsize
    lea       r5, [3 * r1]
    sub       r0, r5
    shl       r4d, 6
%ifidn %3,ps
    add       r3d, r3d
%endif

%ifdef PIC
    lea       r5, [tab_LumaCoeffVer]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova      m3, [pw_512]
%else
    mova      m3, [pw_2000]
%endif
    mov       dword [rsp], %2/4

.loopH:
    mov       r4d, (%1/8)
.loopW:
    PROCESS_LUMA_W8_4R
%ifidn %3,pp
    pmulhrsw  m7, m3
    pmulhrsw  m6, m3
    pmulhrsw  m5, m3
    pmulhrsw  m4, m3

    packuswb  m7, m6
    packuswb  m5, m4

    movlps    [r2], m7
    movhps    [r2 + r3], m7
    lea       r5, [r2 + 2 * r3]
    movlps    [r5], m5
    movhps    [r5 + r3], m5
%else
    psubw     m7, m3
    psubw     m6, m3
    psubw     m5, m3
    psubw     m4, m3

    movu      [r2], m7
    movu      [r2 + r3], m6
    lea       r5, [r2 + 2 * r3]
    movu      [r5], m5
    movu      [r5 + r3], m4
%endif

    lea       r5, [8 * r1 - 8]
    sub       r0, r5
%ifidn %3,pp
    add       r2, 8
%else
    add       r2, 16
%endif
    dec       r4d
    jnz       .loopW

    lea       r0, [r0 + 4 * r1 - %1]
%ifidn %3,pp
    lea       r2, [r2 + 4 * r3 - %1]
%else
    lea       r2, [r2 + 4 * r3 - 2 * %1]
%endif

    dec       dword [rsp]
    jnz       .loopH

    RET
%endmacro

    FILTER_VER_LUMA 16, 4, pp
    FILTER_VER_LUMA 16, 8, pp
    FILTER_VER_LUMA 16, 12, pp
    FILTER_VER_LUMA 16, 16, pp
    FILTER_VER_LUMA 16, 32, pp
    FILTER_VER_LUMA 16, 64, pp
    FILTER_VER_LUMA 24, 32, pp
    FILTER_VER_LUMA 32, 8, pp
    FILTER_VER_LUMA 32, 16, pp
    FILTER_VER_LUMA 32, 24, pp
    FILTER_VER_LUMA 32, 32, pp
    FILTER_VER_LUMA 32, 64, pp
    FILTER_VER_LUMA 48, 64, pp
    FILTER_VER_LUMA 64, 16, pp
    FILTER_VER_LUMA 64, 32, pp
    FILTER_VER_LUMA 64, 48, pp
    FILTER_VER_LUMA 64, 64, pp

    FILTER_VER_LUMA 16, 4, ps
    FILTER_VER_LUMA 16, 8, ps
    FILTER_VER_LUMA 16, 12, ps
    FILTER_VER_LUMA 16, 16, ps
    FILTER_VER_LUMA 16, 32, ps
    FILTER_VER_LUMA 16, 64, ps
    FILTER_VER_LUMA 24, 32, ps
    FILTER_VER_LUMA 32, 8, ps
    FILTER_VER_LUMA 32, 16, ps
    FILTER_VER_LUMA 32, 24, ps
    FILTER_VER_LUMA 32, 32, ps
    FILTER_VER_LUMA 32, 64, ps
    FILTER_VER_LUMA 48, 64, ps
    FILTER_VER_LUMA 64, 16, ps
    FILTER_VER_LUMA 64, 32, ps
    FILTER_VER_LUMA 64, 48, ps
    FILTER_VER_LUMA 64, 64, ps
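
; PROCESS_LUMA_SP_W4_4R sketch (informative): consecutive int16 rows are
; interleaved in pairs with punpcklwd, so one pmaddwd against the
; coefficient pair at [r6 + k * 16] adds c[2k]*row(n+2k) + c[2k+1]*row(n+2k+1)
; to the dword accumulator of output row n. Four pairs (k = 0..3) complete
; the 8-tap sum; m0..m3 accumulate output rows 0..3 of the current 4x4 group.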

%macro PROCESS_LUMA_SP_W4_4R 0
    movq       m0, [r0]
    movq       m1, [r0 + r1]
    punpcklwd  m0, m1                          ;m0=[0 1]
    pmaddwd    m0, [r6 + 0 * 16]               ;m0=[0+1]  Row1

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m1, m4                          ;m1=[1 2]
    pmaddwd    m1, [r6 + 0 * 16]               ;m1=[1+2]  Row2

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[2 3]
    pmaddwd    m2, m4, [r6 + 0 * 16]           ;m2=[2+3]  Row3
    pmaddwd    m4, [r6 + 1 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3]  Row1

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[3 4]
    pmaddwd    m3, m5, [r6 + 0 * 16]           ;m3=[3+4]  Row4
    pmaddwd    m5, [r6 + 1 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4]  Row2

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[4 5]
    pmaddwd    m6, m4, [r6 + 1 * 16]
    paddd      m2, m6                          ;m2=[2+3+4+5]  Row3
    pmaddwd    m4, [r6 + 2 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3+4+5]  Row1

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[5 6]
    pmaddwd    m6, m5, [r6 + 1 * 16]
    paddd      m3, m6                          ;m3=[3+4+5+6]  Row4
    pmaddwd    m5, [r6 + 2 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4+5+6]  Row2

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[6 7]
    pmaddwd    m6, m4, [r6 + 2 * 16]
    paddd      m2, m6                          ;m2=[2+3+4+5+6+7]  Row3
    pmaddwd    m4, [r6 + 3 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3+4+5+6+7]  Row1 end

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[7 8]
    pmaddwd    m6, m5, [r6 + 2 * 16]
    paddd      m3, m6                          ;m3=[3+4+5+6+7+8]  Row4
    pmaddwd    m5, [r6 + 3 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4+5+6+7+8]  Row2 end

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[8 9]
    pmaddwd    m4, [r6 + 3 * 16]
    paddd      m2, m4                          ;m2=[2+3+4+5+6+7+8+9]  Row3 end

    movq       m4, [r0 + 2 * r1]
    punpcklwd  m5, m4                          ;m5=[9 10]
    pmaddwd    m5, [r6 + 3 * 16]
    paddd      m3, m5                          ;m3=[3+4+5+6+7+8+9+10]  Row4 end
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
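; Output math (informative): the int16 inputs carry a -8192 offset and the
; 8-tap coefficients sum to 64, so each dword sum carries -8192*64;
; dst = clip8((sum + 8192*64 + 2048) >> 12), where pd_526336 (= 8192*64 + 2048)
; both restores the offset and rounds the 12-bit shift (6-bit coefficients
; plus the 6 bits of headroom added by filterPixelToShort).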
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8, 0-gprsize

    add       r1d, r1d
    lea       r5, [r1 + 2 * r1]
    sub       r0, r5
    shl       r4d, 6

%ifdef PIC
    lea       r5, [tab_LumaCoeffV]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffV + r4]
%endif

    mova      m7, [pd_526336]

    mov       dword [rsp], %2/4
.loopH:
    mov       r4d, (%1/4)
.loopW:
    PROCESS_LUMA_SP_W4_4R

    paddd     m0, m7
    paddd     m1, m7
    paddd     m2, m7
    paddd     m3, m7

    psrad     m0, 12
    psrad     m1, 12
    psrad     m2, 12
    psrad     m3, 12

    packssdw  m0, m1
    packssdw  m2, m3

    packuswb  m0, m2

    movd      [r2], m0
    pextrd    [r2 + r3], m0, 1
    lea       r5, [r2 + 2 * r3]
    pextrd    [r5], m0, 2
    pextrd    [r5 + r3], m0, 3

    lea       r5, [8 * r1 - 2 * 4]
    sub       r0, r5
    add       r2, 4

    dec       r4d
    jnz       .loopW

    lea       r0, [r0 + 4 * r1 - 2 * %1]
    lea       r2, [r2 + 4 * r3 - %1]

    dec       dword [rsp]
    jnz       .loopH

    RET
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
    FILTER_VER_LUMA_SP 4, 4
    FILTER_VER_LUMA_SP 8, 8
    FILTER_VER_LUMA_SP 8, 4
    FILTER_VER_LUMA_SP 4, 8
    FILTER_VER_LUMA_SP 16, 16
    FILTER_VER_LUMA_SP 16, 8
    FILTER_VER_LUMA_SP 8, 16
    FILTER_VER_LUMA_SP 16, 12
    FILTER_VER_LUMA_SP 12, 16
    FILTER_VER_LUMA_SP 16, 4
    FILTER_VER_LUMA_SP 4, 16
    FILTER_VER_LUMA_SP 32, 32
    FILTER_VER_LUMA_SP 32, 16
    FILTER_VER_LUMA_SP 16, 32
    FILTER_VER_LUMA_SP 32, 24
    FILTER_VER_LUMA_SP 24, 32
    FILTER_VER_LUMA_SP 32, 8
    FILTER_VER_LUMA_SP 8, 32
    FILTER_VER_LUMA_SP 64, 64
    FILTER_VER_LUMA_SP 64, 32
    FILTER_VER_LUMA_SP 32, 64
    FILTER_VER_LUMA_SP 64, 48
    FILTER_VER_LUMA_SP 48, 64
    FILTER_VER_LUMA_SP 64, 16
    FILTER_VER_LUMA_SP 16, 64

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
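; Conversion sketch (informative): dst = (src << 6) - 8192. punpcklbw with
; pb_128 interleaves each pixel with the byte 128, and pmaddubsw against
; tab_c_64_n64 (assumed to hold interleaved {64, -64} byte pairs) computes
; src*64 + 128*(-64) = src*64 - 8192 per output int16.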
INIT_XMM sse4
cglobal filterPixelToShort_4x2, 3, 4, 3
    mov         r3d, r3m
    add         r3d, r3d

    ; load constant
    mova        m1, [pb_128]
    mova        m2, [tab_c_64_n64]

    movd        m0, [r0]
    pinsrd      m0, [r0 + r1], 1
    punpcklbw   m0, m1
    pmaddubsw   m0, m2

    movq        [r2 + r3 * 0], m0
    movhps      [r2 + r3 * 1], m0

    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal filterPixelToShort_8x2, 3, 4, 3
    mov         r3d, r3m
    add         r3d, r3d

    ; load constant
    mova        m1, [pb_128]
    mova        m2, [tab_c_64_n64]

    movh        m0, [r0]
    punpcklbw   m0, m1
    pmaddubsw   m0, m2
    movu        [r2 + r3 * 0], m0

    movh        m0, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m0, m2
    movu        [r2 + r3 * 1], m0

    RET

;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
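; Note (informative): src and dst are both offset int16 here; the truncating
; psrad by 6 preserves that domain because the coefficients sum to 64, so the
; carried -8192*64 offset shifts back to exactly -8192. No rounding bias is
; added at this intermediate stage.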
%macro FILTER_VER_LUMA_SS 2
INIT_XMM sse2
cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7, 0-gprsize

    add        r1d, r1d
    add        r3d, r3d
    lea        r5, [3 * r1]
    sub        r0, r5
    shl        r4d, 6

%ifdef PIC
    lea        r5, [tab_LumaCoeffV]
    lea        r6, [r5 + r4]
%else
    lea        r6, [tab_LumaCoeffV + r4]
%endif

    mov        dword [rsp], %2/4
.loopH:
    mov        r4d, (%1/4)
.loopW:
    movq       m0, [r0]
    movq       m1, [r0 + r1]
    punpcklwd  m0, m1                          ;m0=[0 1]
    pmaddwd    m0, [r6 + 0 * 16]               ;m0=[0+1]  Row1

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m1, m4                          ;m1=[1 2]
    pmaddwd    m1, [r6 + 0 * 16]               ;m1=[1+2]  Row2

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[2 3]
    pmaddwd    m2, m4, [r6 + 0 * 16]           ;m2=[2+3]  Row3
    pmaddwd    m4, [r6 + 1 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3]  Row1

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[3 4]
    pmaddwd    m3, m5, [r6 + 0 * 16]           ;m3=[3+4]  Row4
    pmaddwd    m5, [r6 + 1 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4]  Row2

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[4 5]
    pmaddwd    m6, m4, [r6 + 1 * 16]
    paddd      m2, m6                          ;m2=[2+3+4+5]  Row3
    pmaddwd    m4, [r6 + 2 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3+4+5]  Row1

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[5 6]
    pmaddwd    m6, m5, [r6 + 1 * 16]
    paddd      m3, m6                          ;m3=[3+4+5+6]  Row4
    pmaddwd    m5, [r6 + 2 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4+5+6]  Row2

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[6 7]
    pmaddwd    m6, m4, [r6 + 2 * 16]
    paddd      m2, m6                          ;m2=[2+3+4+5+6+7]  Row3
    pmaddwd    m4, [r6 + 3 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3+4+5+6+7]  Row1 end
    psrad      m0, 6

    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[7 8]
    pmaddwd    m6, m5, [r6 + 2 * 16]
    paddd      m3, m6                          ;m3=[3+4+5+6+7+8]  Row4
    pmaddwd    m5, [r6 + 3 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4+5+6+7+8]  Row2 end
    psrad      m1, 6

    packssdw   m0, m1

    movlps     [r2], m0
    movhps     [r2 + r3], m0

    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[8 9]
    pmaddwd    m4, [r6 + 3 * 16]
    paddd      m2, m4                          ;m2=[2+3+4+5+6+7+8+9]  Row3 end
    psrad      m2, 6

    movq       m4, [r0 + 2 * r1]
    punpcklwd  m5, m4                          ;m5=[9 10]
    pmaddwd    m5, [r6 + 3 * 16]
    paddd      m3, m5                          ;m3=[3+4+5+6+7+8+9+10]  Row4 end
    psrad      m3, 6

    packssdw   m2, m3

    movlps     [r2 + 2 * r3], m2
    lea        r5, [3 * r3]
    movhps     [r2 + r5], m2

    lea        r5, [8 * r1 - 2 * 4]
    sub        r0, r5
    add        r2, 2 * 4

    dec        r4d
    jnz        .loopW

    lea        r0, [r0 + 4 * r1 - 2 * %1]
    lea        r2, [r2 + 4 * r3 - 2 * %1]

    dec        dword [rsp]
    jnz        .loopH

    RET
%endmacro

    FILTER_VER_LUMA_SS 4, 4
    FILTER_VER_LUMA_SS 8, 8
    FILTER_VER_LUMA_SS 8, 4
    FILTER_VER_LUMA_SS 4, 8
    FILTER_VER_LUMA_SS 16, 16
    FILTER_VER_LUMA_SS 16, 8
    FILTER_VER_LUMA_SS 8, 16
    FILTER_VER_LUMA_SS 16, 12
    FILTER_VER_LUMA_SS 12, 16
    FILTER_VER_LUMA_SS 16, 4
    FILTER_VER_LUMA_SS 4, 16
    FILTER_VER_LUMA_SS 32, 32
    FILTER_VER_LUMA_SS 32, 16
    FILTER_VER_LUMA_SS 16, 32
    FILTER_VER_LUMA_SS 32, 24
    FILTER_VER_LUMA_SS 24, 32
    FILTER_VER_LUMA_SS 32, 8
    FILTER_VER_LUMA_SS 8, 32
    FILTER_VER_LUMA_SS 64, 64
    FILTER_VER_LUMA_SS 64, 32
    FILTER_VER_LUMA_SS 32, 64
    FILTER_VER_LUMA_SS 64, 48
    FILTER_VER_LUMA_SS 48, 64
    FILTER_VER_LUMA_SS 64, 16
    FILTER_VER_LUMA_SS 16, 64
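
; AVX2 sp/ss 4xN sketch (informative): coeffIdx is scaled by 128 (shl 7)
; because each filter in pw_LumaCoeffVer is assumed to span four 32-byte rows
; of word coefficient pairs ([r5] .. [r5 + 3 * mmsize]). Each ymm packs two
; interleaved row pairs ("m0 = [2 1 1 0]": rows 0/1 in the low lane, rows 1/2
; in the high lane), so one pmaddwd yields partial sums for two output rows.
; sp adds pd_526336 and shifts right by 12 before clipping; ss shifts by 6.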

%macro FILTER_VER_LUMA_AVX2_4x4 1
INIT_YMM avx2
cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
    mov             r4d, r4m
    add             r1d, r1d
    shl             r4d, 7

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %1,sp
    mova            m6, [pd_526336]
%else
    add             r3d, r3d
%endif

    movq            xm0, [r0]
    movq            xm1, [r0 + r1]
    punpcklwd       xm0, xm1
    movq            xm2, [r0 + r1 * 2]
    punpcklwd       xm1, xm2
    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
    pmaddwd         m0, [r5]
    movq            xm3, [r0 + r4]
    punpcklwd       xm2, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm4, [r0]
    punpcklwd       xm3, xm4
    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
    pmaddwd         m5, m2, [r5 + 1 * mmsize]
    pmaddwd         m2, [r5]
    paddd           m0, m5
    movq            xm3, [r0 + r1]
    punpcklwd       xm4, xm3
    movq            xm1, [r0 + r1 * 2]
    punpcklwd       xm3, xm1
    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
    pmaddwd         m5, m4, [r5 + 2 * mmsize]
    pmaddwd         m4, [r5 + 1 * mmsize]
    paddd           m0, m5
    paddd           m2, m4
    movq            xm3, [r0 + r4]
    punpcklwd       xm1, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm4, [r0]
    punpcklwd       xm3, xm4
    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
    pmaddwd         m5, m1, [r5 + 3 * mmsize]
    pmaddwd         m1, [r5 + 2 * mmsize]
    paddd           m0, m5
    paddd           m2, m1
    movq            xm3, [r0 + r1]
    punpcklwd       xm4, xm3
    movq            xm1, [r0 + 2 * r1]
    punpcklwd       xm3, xm1
    vinserti128     m4, m4, xm3, 1                  ; m4 = [10 9 9 8]
    pmaddwd         m4, [r5 + 3 * mmsize]
    paddd           m2, m4

%ifidn %1,sp
    paddd           m0, m6
    paddd           m2, m6
    psrad           m0, 12
    psrad           m2, 12
%else
    psrad           m0, 6
    psrad           m2, 6
%endif
    packssdw        m0, m2
    vextracti128    xm2, m0, 1
    lea             r4, [r3 * 3]

%ifidn %1,sp
    packuswb        xm0, xm2
    movd            [r2], xm0
    pextrd          [r2 + r3], xm0, 2
    pextrd          [r2 + r3 * 2], xm0, 1
    pextrd          [r2 + r4], xm0, 3
%else
    movq            [r2], xm0
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm0
    movhps          [r2 + r4], xm2
%endif
    RET
%endmacro

    FILTER_VER_LUMA_AVX2_4x4 sp
    FILTER_VER_LUMA_AVX2_4x4 ss

%macro FILTER_VER_LUMA_AVX2_4x8 1
INIT_YMM avx2
cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %1,sp
    mova            m7, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]

    movq            xm0, [r0]
    movq            xm1, [r0 + r1]
    punpcklwd       xm0, xm1
    movq            xm2, [r0 + r1 * 2]
    punpcklwd       xm1, xm2
    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
    pmaddwd         m0, [r5]
    movq            xm3, [r0 + r4]
    punpcklwd       xm2, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm4, [r0]
    punpcklwd       xm3, xm4
    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
    pmaddwd         m5, m2, [r5 + 1 * mmsize]
    pmaddwd         m2, [r5]
    paddd           m0, m5
    movq            xm3, [r0 + r1]
    punpcklwd       xm4, xm3
    movq            xm1, [r0 + r1 * 2]
    punpcklwd       xm3, xm1
    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
    pmaddwd         m5, m4, [r5 + 2 * mmsize]
    paddd           m0, m5
    pmaddwd         m5, m4, [r5 + 1 * mmsize]
    paddd           m2, m5
    pmaddwd         m4, [r5]
    movq            xm3, [r0 + r4]
    punpcklwd       xm1, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm6, [r0]
    punpcklwd       xm3, xm6
    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
    pmaddwd         m5, m1, [r5 + 3 * mmsize]
    paddd           m0, m5
    pmaddwd         m5, m1, [r5 + 2 * mmsize]
    paddd           m2, m5
    pmaddwd         m5, m1, [r5 + 1 * mmsize]
    paddd           m4, m5
    pmaddwd         m1, [r5]
    movq            xm3, [r0 + r1]
    punpcklwd       xm6, xm3
    movq            xm5, [r0 + 2 * r1]
    punpcklwd       xm3, xm5
    vinserti128     m6, m6, xm3, 1                  ; m6 = [10 9 9 8]
    pmaddwd         m3, m6, [r5 + 3 * mmsize]
    paddd           m2, m3
    pmaddwd         m3, m6, [r5 + 2 * mmsize]
    paddd           m4, m3
    pmaddwd         m6, [r5 + 1 * mmsize]
    paddd           m1, m6

%ifidn %1,sp
    paddd           m0, m7
    paddd           m2, m7
    psrad           m0, 12
    psrad           m2, 12
%else
    psrad           m0, 6
    psrad           m2, 6
%endif
    packssdw        m0, m2

    movq            xm3, [r0 + r4]
    punpcklwd       xm5, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm2, [r0]
    punpcklwd       xm3, xm2
    vinserti128     m5, m5, xm3, 1                  ; m5 = [12 11 11 10]
    pmaddwd         m3, m5, [r5 + 3 * mmsize]
    paddd           m4, m3
    pmaddwd         m5, [r5 + 2 * mmsize]
    paddd           m1, m5
    movq            xm3, [r0 + r1]
    punpcklwd       xm2, xm3
    movq            xm5, [r0 + 2 * r1]
    punpcklwd       xm3, xm5
    vinserti128     m2, m2, xm3, 1                  ; m2 = [14 13 13 12]
    pmaddwd         m2, [r5 + 3 * mmsize]
    paddd           m1, m2

%ifidn %1,sp
    paddd           m4, m7
    paddd           m1, m7
    psrad           m4, 12
    psrad           m1, 12
%else
    psrad           m4, 6
    psrad           m1, 6
%endif
    packssdw        m4, m1

%ifidn %1,sp
    packuswb        m0, m4
    vextracti128    xm2, m0, 1
    movd            [r2], xm0
    movd            [r2 + r3], xm2
    pextrd          [r2 + r3 * 2], xm0, 1
    pextrd          [r2 + r6], xm2, 1
    lea             r2, [r2 + r3 * 4]
    pextrd          [r2], xm0, 2
    pextrd          [r2 + r3], xm2, 2
    pextrd          [r2 + r3 * 2], xm0, 3
    pextrd          [r2 + r6], xm2, 3
%else
    vextracti128    xm2, m0, 1
    vextracti128    xm1, m4, 1
    movq            [r2], xm0
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm0
    movhps          [r2 + r6], xm2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm4
    movq            [r2 + r3], xm1
    movhps          [r2 + r3 * 2], xm4
    movhps          [r2 + r6], xm1
%endif
    RET
%endmacro

    FILTER_VER_LUMA_AVX2_4x8 sp
    FILTER_VER_LUMA_AVX2_4x8 ss
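
; PROCESS_LUMA_AVX2_W4_16R (informative): the same two-output-rows-per-ymm
; scheme as above, extended to 16 output rows; finished accumulators are
; rounded, packed and stored every four rows while the partial sums of later
; rows stay live in registers.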

%macro PROCESS_LUMA_AVX2_W4_16R 1
    movq            xm0, [r0]
    movq            xm1, [r0 + r1]
    punpcklwd       xm0, xm1
    movq            xm2, [r0 + r1 * 2]
    punpcklwd       xm1, xm2
    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
    pmaddwd         m0, [r5]
    movq            xm3, [r0 + r4]
    punpcklwd       xm2, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm4, [r0]
    punpcklwd       xm3, xm4
    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
    pmaddwd         m5, m2, [r5 + 1 * mmsize]
    pmaddwd         m2, [r5]
    paddd           m0, m5
    movq            xm3, [r0 + r1]
    punpcklwd       xm4, xm3
    movq            xm1, [r0 + r1 * 2]
    punpcklwd       xm3, xm1
    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
    pmaddwd         m5, m4, [r5 + 2 * mmsize]
    paddd           m0, m5
    pmaddwd         m5, m4, [r5 + 1 * mmsize]
    paddd           m2, m5
    pmaddwd         m4, [r5]
    movq            xm3, [r0 + r4]
    punpcklwd       xm1, xm3
    lea             r0, [r0 + 4 * r1]
    movq            xm6, [r0]
    punpcklwd       xm3, xm6
    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
    pmaddwd         m5, m1, [r5 + 3 * mmsize]
    paddd           m0, m5
    pmaddwd         m5, m1, [r5 + 2 * mmsize]
    paddd           m2, m5
    pmaddwd         m5, m1, [r5 + 1 * mmsize]
    paddd           m4, m5
    pmaddwd         m1, [r5]
    movq            xm3, [r0 + r1]
    punpcklwd       xm6, xm3
    movq            xm5, [r0 + 2 * r1]
    punpcklwd       xm3, xm5
    vinserti128     m6, m6, xm3, 1                  ; m6 = [10 9 9 8]
    pmaddwd         m3, m6, [r5 + 3 * mmsize]
    paddd           m2, m3
    pmaddwd         m3, m6, [r5 + 2 * mmsize]
    paddd           m4, m3
    pmaddwd         m3, m6, [r5 + 1 * mmsize]
    paddd           m1, m3
    pmaddwd         m6, [r5]

%ifidn %1,sp
    paddd           m0, m7
    paddd           m2, m7
    psrad           m0, 12
    psrad           m2, 12
%else
    psrad           m0, 6
    psrad           m2, 6
%endif
    packssdw        m0, m2
    vextracti128    xm2, m0, 1
%ifidn %1,sp
    packuswb        xm0, xm2
    movd            [r2], xm0
    pextrd          [r2 + r3], xm0, 2
    pextrd          [r2 + r3 * 2], xm0, 1
    pextrd          [r2 + r6], xm0, 3
%else
    movq            [r2], xm0
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm0
    movhps          [r2 + r6], xm2
%endif

    movq            xm2, [r0 + r4]
    punpcklwd       xm5, xm2
    lea             r0, [r0 + 4 * r1]
    movq            xm0, [r0]
    punpcklwd       xm2, xm0
    vinserti128     m5, m5, xm2, 1                  ; m5 = [12 11 11 10]
    pmaddwd         m2, m5, [r5 + 3 * mmsize]
    paddd           m4, m2
    pmaddwd         m2, m5, [r5 + 2 * mmsize]
    paddd           m1, m2
    pmaddwd         m2, m5, [r5 + 1 * mmsize]
    paddd           m6, m2
    pmaddwd         m5, [r5]
    movq            xm2, [r0 + r1]
    punpcklwd       xm0, xm2
    movq            xm3, [r0 + 2 * r1]
    punpcklwd       xm2, xm3
    vinserti128     m0, m0, xm2, 1                  ; m0 = [14 13 13 12]
    pmaddwd         m2, m0, [r5 + 3 * mmsize]
    paddd           m1, m2
    pmaddwd         m2, m0, [r5 + 2 * mmsize]
    paddd           m6, m2
    pmaddwd         m2, m0, [r5 + 1 * mmsize]
    paddd           m5, m2
    pmaddwd         m0, [r5]

%ifidn %1,sp
    paddd           m4, m7
    paddd           m1, m7
    psrad           m4, 12
    psrad           m1, 12
%else
    psrad           m4, 6
    psrad           m1, 6
%endif
    packssdw        m4, m1
    vextracti128    xm1, m4, 1
    lea             r2, [r2 + r3 * 4]
%ifidn %1,sp
    packuswb        xm4, xm1
    movd            [r2], xm4
    pextrd          [r2 + r3], xm4, 2
    pextrd          [r2 + r3 * 2], xm4, 1
    pextrd          [r2 + r6], xm4, 3
%else
    movq            [r2], xm4
    movq            [r2 + r3], xm1
    movhps          [r2 + r3 * 2], xm4
    movhps          [r2 + r6], xm1
%endif

    movq            xm4, [r0 + r4]
    punpcklwd       xm3, xm4
    lea             r0, [r0 + 4 * r1]
    movq            xm1, [r0]
    punpcklwd       xm4, xm1
    vinserti128     m3, m3, xm4, 1                  ; m3 = [16 15 15 14]
    pmaddwd         m4, m3, [r5 + 3 * mmsize]
    paddd           m6, m4
    pmaddwd         m4, m3, [r5 + 2 * mmsize]
    paddd           m5, m4
    pmaddwd         m4, m3, [r5 + 1 * mmsize]
    paddd           m0, m4
    pmaddwd         m3, [r5]
    movq            xm4, [r0 + r1]
    punpcklwd       xm1, xm4
    movq            xm2, [r0 + 2 * r1]
    punpcklwd       xm4, xm2
    vinserti128     m1, m1, xm4, 1                  ; m1 = [18 17 17 16]
    pmaddwd         m4, m1, [r5 + 3 * mmsize]
    paddd           m5, m4
    pmaddwd         m4, m1, [r5 + 2 * mmsize]
    paddd           m0, m4
    pmaddwd         m1, [r5 + 1 * mmsize]
    paddd           m3, m1
    movq            xm4, [r0 + r4]
    punpcklwd       xm2, xm4
    lea             r0, [r0 + 4 * r1]
    movq            xm1, [r0]
    punpcklwd       xm4, xm1
    vinserti128     m2, m2, xm4, 1                  ; m2 = [20 19 19 18]
    pmaddwd         m4, m2, [r5 + 3 * mmsize]
    paddd           m0, m4
    pmaddwd         m2, [r5 + 2 * mmsize]
    paddd           m3, m2
    movq            xm4, [r0 + r1]
    punpcklwd       xm1, xm4
    movq            xm2, [r0 + 2 * r1]
    punpcklwd       xm4, xm2
    vinserti128     m1, m1, xm4, 1                  ; m1 = [22 21 21 20]
    pmaddwd         m1, [r5 + 3 * mmsize]
    paddd           m3, m1

%ifidn %1,sp
    paddd           m6, m7
    paddd           m5, m7
    paddd           m0, m7
    paddd           m3, m7
    psrad           m6, 12
    psrad           m5, 12
    psrad           m0, 12
    psrad           m3, 12
%else
    psrad           m6, 6
    psrad           m5, 6
    psrad           m0, 6
    psrad           m3, 6
%endif
    packssdw        m6, m5
    packssdw        m0, m3
    lea             r2, [r2 + r3 * 4]

%ifidn %1,sp
    packuswb        m6, m0
    vextracti128    xm0, m6, 1
    movd            [r2], xm6
    movd            [r2 + r3], xm0
    pextrd          [r2 + r3 * 2], xm6, 1
    pextrd          [r2 + r6], xm0, 1
    lea             r2, [r2 + r3 * 4]
    pextrd          [r2], xm6, 2
    pextrd          [r2 + r3], xm0, 2
    pextrd          [r2 + r3 * 2], xm6, 3
    pextrd          [r2 + r6], xm0, 3
%else
    vextracti128    xm5, m6, 1
    vextracti128    xm3, m0, 1
    movq            [r2], xm6
    movq            [r2 + r3], xm5
    movhps          [r2 + r3 * 2], xm6
    movhps          [r2 + r6], xm5
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm0
    movq            [r2 + r3], xm3
    movhps          [r2 + r3 * 2], xm0
    movhps          [r2 + r6], xm3
%endif
%endmacro

%macro FILTER_VER_LUMA_AVX2_4x16 1
INIT_YMM avx2
cglobal interp_8tap_vert_%1_4x16, 4, 7, 8
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m7, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    PROCESS_LUMA_AVX2_W4_16R %1
    RET
%endmacro

    FILTER_VER_LUMA_AVX2_4x16 sp
    FILTER_VER_LUMA_AVX2_4x16 ss
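
; 8-wide sp/ss sketch (informative): punpcklwd/punpckhwd split each pair of
; 8-lane int16 rows into low/high column halves placed in the two ymm lanes,
; so a single pmaddwd covers all eight columns of one output row. In the sp
; path packuswb packs per lane, so interp8_hps_shuf (defined earlier) is
; assumed to be the vpermd pattern that restores row order before the
; movq/movhps stores.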

%macro FILTER_VER_LUMA_S_AVX2_8x8 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_8x8, 4, 6, 12
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %1,sp
    mova            m11, [pd_526336]
%else
    add             r3d, r3d
%endif

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 1 * mmsize]
    pmaddwd         m2, [r5]
    paddd           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhwd       xm5, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddwd         m5, m3, [r5 + 1 * mmsize]
    pmaddwd         m3, [r5]
    paddd           m1, m5
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 2 * mmsize]
    paddd           m0, m6
    pmaddwd         m6, m4, [r5 + 1 * mmsize]
    paddd           m2, m6
    pmaddwd         m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhwd       xm7, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddwd         m7, m5, [r5 + 2 * mmsize]
    paddd           m1, m7
    pmaddwd         m7, m5, [r5 + 1 * mmsize]
    pmaddwd         m5, [r5]
    paddd           m3, m7
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhwd       xm8, xm6, xm7
    punpcklwd       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddwd         m8, m6, [r5 + 3 * mmsize]
    paddd           m0, m8
    pmaddwd         m8, m6, [r5 + 2 * mmsize]
    paddd           m2, m8
    pmaddwd         m8, m6, [r5 + 1 * mmsize]
    pmaddwd         m6, [r5]
    paddd           m4, m8
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhwd       xm9, xm7, xm8
    punpcklwd       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddwd         m9, m7, [r5 + 3 * mmsize]
    paddd           m1, m9
    pmaddwd         m9, m7, [r5 + 2 * mmsize]
    paddd           m3, m9
    pmaddwd         m9, m7, [r5 + 1 * mmsize]
    pmaddwd         m7, [r5]
    paddd           m5, m9
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhwd       xm10, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddwd         m10, m8, [r5 + 3 * mmsize]
    paddd           m2, m10
    pmaddwd         m10, m8, [r5 + 2 * mmsize]
    pmaddwd         m8, [r5 + 1 * mmsize]
    paddd           m4, m10
    paddd           m6, m8
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhwd       xm8, xm9, xm10
    punpcklwd       xm9, xm10
    vinserti128     m9, m9, xm8, 1
    pmaddwd         m8, m9, [r5 + 3 * mmsize]
    paddd           m3, m8
    pmaddwd         m8, m9, [r5 + 2 * mmsize]
    pmaddwd         m9, [r5 + 1 * mmsize]
    paddd           m5, m8
    paddd           m7, m9
    movu            xm8, [r0 + r4]                  ; m8 = row 11
    punpckhwd       xm9, xm10, xm8
    punpcklwd       xm10, xm8
    vinserti128     m10, m10, xm9, 1
    pmaddwd         m9, m10, [r5 + 3 * mmsize]
    pmaddwd         m10, [r5 + 2 * mmsize]
    paddd           m4, m9
    paddd           m6, m10

    lea             r4, [r3 * 3]
%ifidn %1,sp
    paddd           m0, m11
    paddd           m1, m11
    paddd           m2, m11
    paddd           m3, m11
    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12
%else
    psrad           m0, 6
    psrad           m1, 6
    psrad           m2, 6
    psrad           m3, 6
%endif
    packssdw        m0, m1
    packssdw        m2, m3
%ifidn %1,sp
    packuswb        m0, m2
    mova            m1, [interp8_hps_shuf]
    vpermd          m0, m1, m0
    vextracti128    xm2, m0, 1
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r4], xm2
%else
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r4], xm3
%endif

    lea             r0, [r0 + r1 * 4]
    movu            xm9, [r0]                       ; m9 = row 12
    punpckhwd       xm3, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm3, 1
    pmaddwd         m3, m8, [r5 + 3 * mmsize]
    pmaddwd         m8, [r5 + 2 * mmsize]
    paddd           m5, m3
    paddd           m7, m8
    movu            xm3, [r0 + r1]                  ; m3 = row 13
    punpckhwd       xm0, xm9, xm3
    punpcklwd       xm9, xm3
    vinserti128     m9, m9, xm0, 1
    pmaddwd         m9, [r5 + 3 * mmsize]
    paddd           m6, m9
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhwd       xm9, xm3, xm0
    punpcklwd       xm3, xm0
    vinserti128     m3, m3, xm9, 1
    pmaddwd         m3, [r5 + 3 * mmsize]
    paddd           m7, m3

%ifidn %1,sp
    paddd           m4, m11
    paddd           m5, m11
    paddd           m6, m11
    paddd           m7, m11
    psrad           m4, 12
    psrad           m5, 12
    psrad           m6, 12
    psrad           m7, 12
%else
    psrad           m4, 6
    psrad           m5, 6
    psrad           m6, 6
    psrad           m7, 6
%endif
    packssdw        m4, m5
    packssdw        m6, m7
    lea             r2, [r2 + r3 * 4]
%ifidn %1,sp
    packuswb        m4, m6
    vpermd          m4, m1, m4
    vextracti128    xm6, m4, 1
    movq            [r2], xm4
    movhps          [r2 + r3], xm4
    movq            [r2 + r3 * 2], xm6
    movhps          [r2 + r4], xm6
%else
    vpermq          m4, m4, 11011000b
    vpermq          m6, m6, 11011000b
    vextracti128    xm5, m4, 1
    vextracti128    xm7, m6, 1
    movu            [r2], xm4
    movu            [r2 + r3], xm5
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r4], xm7
%endif
    RET
%endif
%endmacro

    FILTER_VER_LUMA_S_AVX2_8x8 sp
    FILTER_VER_LUMA_S_AVX2_8x8 ss

%macro FILTER_VER_LUMA_S_AVX2_8xN 2
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m14, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    lea             r7, [r1 * 4]
    mov             r8d, %2 / 16
.loopH:
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 1 * mmsize]
    paddd           m0, m4
    pmaddwd         m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhwd       xm5, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddwd         m5, m3, [r5 + 1 * mmsize]
    paddd           m1, m5
    pmaddwd         m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 2 * mmsize]
    paddd           m0, m6
    pmaddwd         m6, m4, [r5 + 1 * mmsize]
    paddd           m2, m6
    pmaddwd         m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhwd       xm7, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddwd         m7, m5, [r5 + 2 * mmsize]
    paddd           m1, m7
    pmaddwd         m7, m5, [r5 + 1 * mmsize]
    paddd           m3, m7
    pmaddwd         m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhwd       xm8, xm6, xm7
    punpcklwd       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddwd         m8, m6, [r5 + 3 * mmsize]
    paddd           m0, m8
    pmaddwd         m8, m6, [r5 + 2 * mmsize]
    paddd           m2, m8
    pmaddwd         m8, m6, [r5 + 1 * mmsize]
    paddd           m4, m8
    pmaddwd         m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhwd       xm9, xm7, xm8
    punpcklwd       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddwd         m9, m7, [r5 + 3 * mmsize]
    paddd           m1, m9
    pmaddwd         m9, m7, [r5 + 2 * mmsize]
    paddd           m3, m9
    pmaddwd         m9, m7, [r5 + 1 * mmsize]
    paddd           m5, m9
    pmaddwd         m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhwd       xm10, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddwd         m10, m8, [r5 + 3 * mmsize]
    paddd           m2, m10
    pmaddwd         m10, m8, [r5 + 2 * mmsize]
    paddd           m4, m10
    pmaddwd         m10, m8, [r5 + 1 * mmsize]
    paddd           m6, m10
    pmaddwd         m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhwd       xm11, xm9, xm10
    punpcklwd       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddwd         m11, m9, [r5 + 3 * mmsize]
    paddd           m3, m11
    pmaddwd         m11, m9, [r5 + 2 * mmsize]
    paddd           m5, m11
    pmaddwd         m11, m9, [r5 + 1 * mmsize]
    paddd           m7, m11
    pmaddwd         m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhwd       xm12, xm10, xm11
    punpcklwd       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddwd         m12, m10, [r5 + 3 * mmsize]
    paddd           m4, m12
    pmaddwd         m12, m10, [r5 + 2 * mmsize]
    paddd           m6, m12
    pmaddwd         m12, m10, [r5 + 1 * mmsize]
    paddd           m8, m12
    pmaddwd         m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhwd       xm13, xm11, xm12
    punpcklwd       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddwd         m13, m11, [r5 + 3 * mmsize]
    paddd           m5, m13
    pmaddwd         m13, m11, [r5 + 2 * mmsize]
    paddd           m7, m13
    pmaddwd         m13, m11, [r5 + 1 * mmsize]
    paddd           m9, m13
    pmaddwd         m11, [r5]

%ifidn %1,sp
    paddd           m0, m14
    paddd           m1, m14
    paddd           m2, m14
    paddd           m3, m14
    paddd           m4, m14
    paddd           m5, m14
    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12
    psrad           m4, 12
    psrad           m5, 12
%else
    psrad           m0, 6
    psrad           m1, 6
    psrad           m2, 6
    psrad           m3, 6
    psrad           m4, 6
    psrad           m5, 6
%endif
    packssdw        m0, m1
    packssdw        m2, m3
    packssdw        m4, m5
%ifidn %1,sp
    packuswb        m0, m2
    mova            m1, [interp8_hps_shuf]
    vpermd          m0, m1, m0
    vextracti128    xm2, m0, 1
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r6], xm2
%else
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
%endif

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhwd       xm0, xm12, xm13
    punpcklwd       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddwd         m0, m12, [r5 + 3 * mmsize]
    paddd           m6, m0
    pmaddwd         m0, m12, [r5 + 2 * mmsize]
    paddd           m8, m0
    pmaddwd         m0, m12, [r5 + 1 * mmsize]
    paddd           m10, m0
    pmaddwd         m12, [r5]
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhwd       xm2, xm13, xm0
    punpcklwd       xm13, xm0
    vinserti128     m13, m13, xm2, 1
    pmaddwd         m2, m13, [r5 + 3 * mmsize]
    paddd           m7, m2
    pmaddwd         m2, m13, [r5 + 2 * mmsize]
    paddd           m9, m2
    pmaddwd         m2, m13, [r5 + 1 * mmsize]
    paddd           m11, m2
    pmaddwd         m13, [r5]

%ifidn %1,sp
    paddd           m6, m14
    paddd           m7, m14
    psrad           m6, 12
    psrad           m7, 12
%else
    psrad           m6, 6
    psrad           m7, 6
%endif
    packssdw        m6, m7
    lea             r2, [r2 + r3 * 4]

%ifidn %1,sp
    packuswb        m4, m6
    vpermd          m4, m1, m4
    vextracti128    xm6, m4, 1
    movq            [r2], xm4
    movhps          [r2 + r3], xm4
    movq            [r2 + r3 * 2], xm6
    movhps          [r2 + r6], xm6
%else
    vpermq          m6, m6, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m4, 1
    vextracti128    xm7, m6, 1
    movu            [r2], xm4
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r6], xm7
%endif

    movu            xm6, [r0 + r4]                  ; m6 = row 15
    punpckhwd       xm5, xm0, xm6
    punpcklwd       xm0, xm6
    vinserti128     m0, m0, xm5, 1
    pmaddwd         m5, m0, [r5 + 3 * mmsize]
    paddd           m8, m5
    pmaddwd         m5, m0, [r5 + 2 * mmsize]
    paddd           m10, m5
    pmaddwd         m5, m0, [r5 + 1 * mmsize]
    paddd           m12, m5
    pmaddwd         m0, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhwd       xm3, xm6, xm2
    punpcklwd       xm6, xm2
    vinserti128     m6, m6, xm3, 1
    pmaddwd         m3, m6, [r5 + 3 * mmsize]
    paddd           m9, m3
    pmaddwd         m3, m6, [r5 + 2 * mmsize]
    paddd           m11, m3
    pmaddwd         m3, m6, [r5 + 1 * mmsize]
    paddd           m13, m3
    pmaddwd         m6, [r5]
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 3 * mmsize]
    paddd           m10, m4
    pmaddwd         m4, m2, [r5 + 2 * mmsize]
    paddd           m12, m4
    pmaddwd         m2, [r5 + 1 * mmsize]
    paddd           m0, m2
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhwd       xm2, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm2, 1
    pmaddwd         m2, m3, [r5 + 3 * mmsize]
    paddd           m11, m2
    pmaddwd         m2, m3, [r5 + 2 * mmsize]
    paddd           m13, m2
    pmaddwd         m3, [r5 + 1 * mmsize]
    paddd           m6, m3
    movu            xm2, [r0 + r4]                  ; m2 = row 19
    punpckhwd       xm7, xm4, xm2
    punpcklwd       xm4, xm2
    vinserti128     m4, m4, xm7, 1
    pmaddwd         m7, m4, [r5 + 3 * mmsize]
    paddd           m12, m7
    pmaddwd         m4, [r5 + 2 * mmsize]
    paddd           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm7, [r0]                       ; m7 = row 20
    punpckhwd       xm3, xm2, xm7
    punpcklwd       xm2, xm7
    vinserti128     m2, m2, xm3, 1
    pmaddwd         m3, m2, [r5 + 3 * mmsize]
    paddd           m13, m3
    pmaddwd         m2, [r5 + 2 * mmsize]
    paddd           m6, m2
    movu            xm3, [r0 + r1]                  ; m3 = row 21
    punpckhwd       xm2, xm7, xm3
    punpcklwd       xm7, xm3
    vinserti128     m7, m7, xm2, 1
    pmaddwd         m7, [r5 + 3 * mmsize]
    paddd           m0, m7
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
    punpckhwd       xm7, xm3, xm2
    punpcklwd       xm3, xm2
    vinserti128     m3, m3, xm7, 1
    pmaddwd         m3, [r5 + 3 * mmsize]
    paddd           m6, m3

%ifidn %1,sp
    paddd           m8, m14
    paddd           m9, m14
    paddd           m10, m14
    paddd           m11, m14
    paddd           m12, m14
    paddd           m13, m14
    paddd           m0, m14
    paddd           m6, m14
    psrad           m8, 12
    psrad           m9, 12
    psrad           m10, 12
    psrad           m11, 12
    psrad           m12, 12
    psrad           m13, 12
    psrad           m0, 12
    psrad           m6, 12
%else
    psrad           m8, 6
    psrad           m9, 6
    psrad           m10, 6
    psrad           m11, 6
    psrad           m12, 6
    psrad           m13, 6
    psrad           m0, 6
    psrad           m6, 6
%endif
    packssdw        m8, m9
    packssdw        m10, m11
    packssdw        m12, m13
    packssdw        m0, m6
    lea             r2, [r2 + r3 * 4]

%ifidn %1,sp
    packuswb        m8, m10
    packuswb        m12, m0
    vpermd          m8, m1, m8
    vpermd          m12, m1, m12
    vextracti128    xm10, m8, 1
    vextracti128    xm0, m12, 1
    movq            [r2], xm8
    movhps          [r2 + r3], xm8
    movq            [r2 + r3 * 2], xm10
    movhps          [r2 + r6], xm10
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm12
    movhps          [r2 + r3], xm12
    movq            [r2 + r3 * 2], xm0
    movhps          [r2 + r6], xm0
%else
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm6, m0, 1
    movu            [r2], xm8
    movu            [r2 + r3], xm9
    movu            [r2 + r3 * 2], xm10
    movu            [r2 + r6], xm11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm12
    movu            [r2 + r3], xm13
    movu            [r2 + r3 * 2], xm0
    movu            [r2 + r6], xm6
%endif

    lea             r2, [r2 + r3 * 4]
    sub             r0, r7
    dec             r8d
    jnz             .loopH
    RET
%endif
%endmacro

    FILTER_VER_LUMA_S_AVX2_8xN sp, 16
    FILTER_VER_LUMA_S_AVX2_8xN sp, 32
    FILTER_VER_LUMA_S_AVX2_8xN ss, 16
    FILTER_VER_LUMA_S_AVX2_8xN ss, 32

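; Compute four rows of the vertical 8-tap luma filter over an 8-pixel-wide
; column of 16-bit intermediates (eleven input rows feed four output rows).
; Expects r0 = src, r1 = srcStride in bytes, r4 = 3 * r1 and r5 = the
; coefficient table, one mmsize row per interleaved tap pair.  For the sp
; path m7 must hold pd_526336 (8192*64 + 2048): 8192*64 cancels the offset
; carried by the intermediate format and 2048 rounds the 12-bit shift; the
; ss path only shifts right by 6.  Results are left for the caller to
; store: packed bytes in xm0/xm2 (sp) or 16-bit rows in xm0-xm3 (ss).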
%macro PROCESS_LUMA_S_AVX2_W8_4R 1
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 1 * mmsize]
    paddd           m0, m4
    pmaddwd         m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhwd       xm5, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddwd         m5, m3, [r5 + 1 * mmsize]
    paddd           m1, m5
    pmaddwd         m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 2 * mmsize]
    paddd           m0, m6
    pmaddwd         m4, [r5 + 1 * mmsize]
    paddd           m2, m4
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhwd       xm4, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm4, 1
    pmaddwd         m4, m5, [r5 + 2 * mmsize]
    paddd           m1, m4
    pmaddwd         m5, [r5 + 1 * mmsize]
    paddd           m3, m5
    movu            xm4, [r0 + r4]                  ; m4 = row 7
    punpckhwd       xm5, xm6, xm4
    punpcklwd       xm6, xm4
    vinserti128     m6, m6, xm5, 1
    pmaddwd         m5, m6, [r5 + 3 * mmsize]
    paddd           m0, m5
    pmaddwd         m6, [r5 + 2 * mmsize]
    paddd           m2, m6
    lea             r0, [r0 + r1 * 4]
    movu            xm5, [r0]                       ; m5 = row 8
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 3 * mmsize]
    paddd           m1, m6
    pmaddwd         m4, [r5 + 2 * mmsize]
    paddd           m3, m4
    movu            xm6, [r0 + r1]                  ; m6 = row 9
    punpckhwd       xm4, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm4, 1
    pmaddwd         m5, [r5 + 3 * mmsize]
    paddd           m2, m5
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 10
    punpckhwd       xm5, xm6, xm4
    punpcklwd       xm6, xm4
    vinserti128     m6, m6, xm5, 1
    pmaddwd         m6, [r5 + 3 * mmsize]
    paddd           m3, m6

%ifidn %1,sp
    paddd           m0, m7
    paddd           m1, m7
    paddd           m2, m7
    paddd           m3, m7
    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12
%else
    psrad           m0, 6
    psrad           m1, 6
    psrad           m2, 6
    psrad           m3, 6
%endif
    packssdw        m0, m1
    packssdw        m2, m3
%ifidn %1,sp
    packuswb        m0, m2
    mova            m4, [interp8_hps_shuf]
    vpermd          m0, m4, m0
    vextracti128    xm2, m0, 1
%else
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
%endif
%endmacro

%macro FILTER_VER_LUMA_S_AVX2_8x4 1
INIT_YMM avx2
cglobal interp_8tap_vert_%1_8x4, 4, 6, 8
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d
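    ; r4 = coeffIdx * 128: each filter occupies four mmsize rows of
    ; interleaved tap pairs in pw_LumaCoeffVer; the source stride is
    ; doubled because the input is 16-bit intermediate data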

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m7, [pd_526336]
%else
    add             r3d, r3d
%endif

    PROCESS_LUMA_S_AVX2_W8_4R %1
    lea             r4, [r3 * 3]
%ifidn %1,sp
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r4], xm2
%else
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r4], xm3
%endif
    RET
%endmacro

    FILTER_VER_LUMA_S_AVX2_8x4 sp
    FILTER_VER_LUMA_S_AVX2_8x4 ss

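; Filter and store a full 8x16 column: sixteen rows of the vertical 8-tap
; luma filter from 16-bit intermediate input.  Expects r0/r1/r4 = src,
; srcStride, 3 * srcStride; r5 = coefficient table; r2/r3/r6 = dst,
; dstStride, 3 * dstStride; m14 = pd_526336 for the sp path.  r7 and r8
; serve as walking src/dst row pointers, and the wrappers below rely on
; their final values when stepping between columns and 16-row blocks.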
%macro PROCESS_LUMA_AVX2_W8_16R 1
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 1 * mmsize]
    paddd           m0, m4
    pmaddwd         m2, [r5]
    lea             r7, [r0 + r1 * 4]
    movu            xm4, [r7]                       ; m4 = row 4
    punpckhwd       xm5, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddwd         m5, m3, [r5 + 1 * mmsize]
    paddd           m1, m5
    pmaddwd         m3, [r5]
    movu            xm5, [r7 + r1]                  ; m5 = row 5
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 2 * mmsize]
    paddd           m0, m6
    pmaddwd         m6, m4, [r5 + 1 * mmsize]
    paddd           m2, m6
    pmaddwd         m4, [r5]
    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
    punpckhwd       xm7, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddwd         m7, m5, [r5 + 2 * mmsize]
    paddd           m1, m7
    pmaddwd         m7, m5, [r5 + 1 * mmsize]
    paddd           m3, m7
    pmaddwd         m5, [r5]
    movu            xm7, [r7 + r4]                  ; m7 = row 7
    punpckhwd       xm8, xm6, xm7
    punpcklwd       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddwd         m8, m6, [r5 + 3 * mmsize]
    paddd           m0, m8
    pmaddwd         m8, m6, [r5 + 2 * mmsize]
    paddd           m2, m8
    pmaddwd         m8, m6, [r5 + 1 * mmsize]
    paddd           m4, m8
    pmaddwd         m6, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm8, [r7]                       ; m8 = row 8
    punpckhwd       xm9, xm7, xm8
    punpcklwd       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddwd         m9, m7, [r5 + 3 * mmsize]
    paddd           m1, m9
    pmaddwd         m9, m7, [r5 + 2 * mmsize]
    paddd           m3, m9
    pmaddwd         m9, m7, [r5 + 1 * mmsize]
    paddd           m5, m9
    pmaddwd         m7, [r5]
    movu            xm9, [r7 + r1]                  ; m9 = row 9
    punpckhwd       xm10, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddwd         m10, m8, [r5 + 3 * mmsize]
    paddd           m2, m10
    pmaddwd         m10, m8, [r5 + 2 * mmsize]
    paddd           m4, m10
    pmaddwd         m10, m8, [r5 + 1 * mmsize]
    paddd           m6, m10
    pmaddwd         m8, [r5]
    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
    punpckhwd       xm11, xm9, xm10
    punpcklwd       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddwd         m11, m9, [r5 + 3 * mmsize]
    paddd           m3, m11
    pmaddwd         m11, m9, [r5 + 2 * mmsize]
    paddd           m5, m11
    pmaddwd         m11, m9, [r5 + 1 * mmsize]
    paddd           m7, m11
    pmaddwd         m9, [r5]
    movu            xm11, [r7 + r4]                 ; m11 = row 11
    punpckhwd       xm12, xm10, xm11
    punpcklwd       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddwd         m12, m10, [r5 + 3 * mmsize]
    paddd           m4, m12
    pmaddwd         m12, m10, [r5 + 2 * mmsize]
    paddd           m6, m12
    pmaddwd         m12, m10, [r5 + 1 * mmsize]
    paddd           m8, m12
    pmaddwd         m10, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm12, [r7]                      ; m12 = row 12
    punpckhwd       xm13, xm11, xm12
    punpcklwd       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddwd         m13, m11, [r5 + 3 * mmsize]
    paddd           m5, m13
    pmaddwd         m13, m11, [r5 + 2 * mmsize]
    paddd           m7, m13
    pmaddwd         m13, m11, [r5 + 1 * mmsize]
    paddd           m9, m13
    pmaddwd         m11, [r5]

%ifidn %1,sp
    paddd           m0, m14
    paddd           m1, m14
    paddd           m2, m14
    paddd           m3, m14
    paddd           m4, m14
    paddd           m5, m14
    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12
    psrad           m4, 12
    psrad           m5, 12
%else
    psrad           m0, 6
    psrad           m1, 6
    psrad           m2, 6
    psrad           m3, 6
    psrad           m4, 6
    psrad           m5, 6
%endif
    packssdw        m0, m1
    packssdw        m2, m3
    packssdw        m4, m5
%ifidn %1,sp
    packuswb        m0, m2
    mova            m5, [interp8_hps_shuf]
    vpermd          m0, m5, m0
    vextracti128    xm2, m0, 1
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r6], xm2
%else
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
%endif

    movu            xm13, [r7 + r1]                 ; m13 = row 13
    punpckhwd       xm0, xm12, xm13
    punpcklwd       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddwd         m0, m12, [r5 + 3 * mmsize]
    paddd           m6, m0
    pmaddwd         m0, m12, [r5 + 2 * mmsize]
    paddd           m8, m0
    pmaddwd         m0, m12, [r5 + 1 * mmsize]
    paddd           m10, m0
    pmaddwd         m12, [r5]
    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
    punpckhwd       xm1, xm13, xm0
    punpcklwd       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddwd         m1, m13, [r5 + 3 * mmsize]
    paddd           m7, m1
    pmaddwd         m1, m13, [r5 + 2 * mmsize]
    paddd           m9, m1
    pmaddwd         m1, m13, [r5 + 1 * mmsize]
    paddd           m11, m1
    pmaddwd         m13, [r5]

%ifidn %1,sp
    paddd           m6, m14
    paddd           m7, m14
    psrad           m6, 12
    psrad           m7, 12
%else
    psrad           m6, 6
    psrad           m7, 6
%endif
    packssdw        m6, m7
    lea             r8, [r2 + r3 * 4]

%ifidn %1,sp
    packuswb        m4, m6
    vpermd          m4, m5, m4
    vextracti128    xm6, m4, 1
    movq            [r8], xm4
    movhps          [r8 + r3], xm4
    movq            [r8 + r3 * 2], xm6
    movhps          [r8 + r6], xm6
%else
    vpermq          m4, m4, 11011000b
    vpermq          m6, m6, 11011000b
    vextracti128    xm1, m4, 1
    vextracti128    xm7, m6, 1
    movu            [r8], xm4
    movu            [r8 + r3], xm1
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm7
%endif

    movu            xm1, [r7 + r4]                  ; m1 = row 15
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m2, m0, [r5 + 3 * mmsize]
    paddd           m8, m2
    pmaddwd         m2, m0, [r5 + 2 * mmsize]
    paddd           m10, m2
    pmaddwd         m2, m0, [r5 + 1 * mmsize]
    paddd           m12, m2
    pmaddwd         m0, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm2, [r7]                       ; m2 = row 16
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m3, m1, [r5 + 3 * mmsize]
    paddd           m9, m3
    pmaddwd         m3, m1, [r5 + 2 * mmsize]
    paddd           m11, m3
    pmaddwd         m3, m1, [r5 + 1 * mmsize]
    paddd           m13, m3
    pmaddwd         m1, [r5]
    movu            xm3, [r7 + r1]                  ; m3 = row 17
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 3 * mmsize]
    paddd           m10, m4
    pmaddwd         m4, m2, [r5 + 2 * mmsize]
    paddd           m12, m4
    pmaddwd         m2, [r5 + 1 * mmsize]
    paddd           m0, m2
    movu            xm4, [r7 + r1 * 2]              ; m4 = row 18
    punpckhwd       xm2, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm2, 1
    pmaddwd         m2, m3, [r5 + 3 * mmsize]
    paddd           m11, m2
    pmaddwd         m2, m3, [r5 + 2 * mmsize]
    paddd           m13, m2
    pmaddwd         m3, [r5 + 1 * mmsize]
    paddd           m1, m3
    movu            xm2, [r7 + r4]                  ; m2 = row 19
    punpckhwd       xm6, xm4, xm2
    punpcklwd       xm4, xm2
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 3 * mmsize]
    paddd           m12, m6
    pmaddwd         m4, [r5 + 2 * mmsize]
    paddd           m0, m4
    lea             r7, [r7 + r1 * 4]
    movu            xm6, [r7]                       ; m6 = row 20
    punpckhwd       xm7, xm2, xm6
    punpcklwd       xm2, xm6
    vinserti128     m2, m2, xm7, 1
    pmaddwd         m7, m2, [r5 + 3 * mmsize]
    paddd           m13, m7
    pmaddwd         m2, [r5 + 2 * mmsize]
    paddd           m1, m2
    movu            xm7, [r7 + r1]                  ; m7 = row 21
    punpckhwd       xm2, xm6, xm7
    punpcklwd       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddwd         m6, [r5 + 3 * mmsize]
    paddd           m0, m6
    movu            xm2, [r7 + r1 * 2]              ; m2 = row 22
    punpckhwd       xm3, xm7, xm2
    punpcklwd       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddwd         m7, [r5 + 3 * mmsize]
    paddd           m1, m7

%ifidn %1,sp
    paddd           m8, m14
    paddd           m9, m14
    paddd           m10, m14
    paddd           m11, m14
    paddd           m12, m14
    paddd           m13, m14
    paddd           m0, m14
    paddd           m1, m14
    psrad           m8, 12
    psrad           m9, 12
    psrad           m10, 12
    psrad           m11, 12
    psrad           m12, 12
    psrad           m13, 12
    psrad           m0, 12
    psrad           m1, 12
%else
    psrad           m8, 6
    psrad           m9, 6
    psrad           m10, 6
    psrad           m11, 6
    psrad           m12, 6
    psrad           m13, 6
    psrad           m0, 6
    psrad           m1, 6
%endif
    packssdw        m8, m9
    packssdw        m10, m11
    packssdw        m12, m13
    packssdw        m0, m1
    lea             r8, [r8 + r3 * 4]

%ifidn %1,sp
    packuswb        m8, m10
    packuswb        m12, m0
    vpermd          m8, m5, m8
    vpermd          m12, m5, m12
    vextracti128    xm10, m8, 1
    vextracti128    xm0, m12, 1
    movq            [r8], xm8
    movhps          [r8 + r3], xm8
    movq            [r8 + r3 * 2], xm10
    movhps          [r8 + r6], xm10
    lea             r8, [r8 + r3 * 4]
    movq            [r8], xm12
    movhps          [r8 + r3], xm12
    movq            [r8 + r3 * 2], xm0
    movhps          [r8 + r6], xm0
%else
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movu            [r8], xm8
    movu            [r8 + r3], xm9
    movu            [r8 + r3 * 2], xm10
    movu            [r8 + r6], xm11
    lea             r8, [r8 + r3 * 4]
    movu            [r8], xm12
    movu            [r8 + r3], xm13
    movu            [r8 + r3 * 2], xm0
    movu            [r8 + r6], xm1
%endif
%endmacro

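; 8-tap vertical luma filter for %2x16 blocks: one PROCESS_LUMA_AVX2_W8_16R
; call per 8-wide column.  dst advances 8 bytes per column for sp (byte
; output) and 16 bytes for ss (16-bit output); src always advances 16 bytes
; because the intermediate input is 16 bits per pixel.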
%macro FILTER_VER_LUMA_AVX2_Nx16 2
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m14, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    mov             r9d, %2 / 8
.loopW:
    PROCESS_LUMA_AVX2_W8_16R %1
%ifidn %1,sp
    add             r2, 8
%else
    add             r2, 16
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_Nx16 sp, 16
    FILTER_VER_LUMA_AVX2_Nx16 sp, 32
    FILTER_VER_LUMA_AVX2_Nx16 sp, 64
    FILTER_VER_LUMA_AVX2_Nx16 ss, 16
    FILTER_VER_LUMA_AVX2_Nx16 ss, 32
    FILTER_VER_LUMA_AVX2_Nx16 ss, 64

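; General %1x%2 version: r10d walks %1/8 columns and r9d walks %2/16 row
; blocks.  The pointer arithmetic after .loopW undoes the per-column
; advances (16 src bytes, 8 or 16 dst bytes per column) and drops r0/r2
; sixteen rows further down for the next block, reusing r7/r8 as left by
; PROCESS_LUMA_AVX2_W8_16R.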
%macro FILTER_VER_LUMA_AVX2_NxN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4

%ifidn %3,sp
    mova            m14, [pd_526336]
%else
    add             r3d, r3d
%endif

    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]
    mov             r9d, %2 / 16
.loopH:
    mov             r10d, %1 / 8
.loopW:
    PROCESS_LUMA_AVX2_W8_16R %3
%ifidn %3,sp
    add             r2, 8
%else
    add             r2, 16
%endif
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11
    lea             r0, [r7 - 2 * %1 + 16]
%ifidn %3,sp
    lea             r2, [r8 + r3 * 4 - %1 + 8]
%else
    lea             r2, [r8 + r3 * 4 - 2 * %1 + 16]
%endif
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_NxN 16, 32, sp
    FILTER_VER_LUMA_AVX2_NxN 16, 64, sp
    FILTER_VER_LUMA_AVX2_NxN 24, 32, sp
    FILTER_VER_LUMA_AVX2_NxN 32, 32, sp
    FILTER_VER_LUMA_AVX2_NxN 32, 64, sp
    FILTER_VER_LUMA_AVX2_NxN 48, 64, sp
    FILTER_VER_LUMA_AVX2_NxN 64, 32, sp
    FILTER_VER_LUMA_AVX2_NxN 64, 48, sp
    FILTER_VER_LUMA_AVX2_NxN 64, 64, sp
    FILTER_VER_LUMA_AVX2_NxN 16, 32, ss
    FILTER_VER_LUMA_AVX2_NxN 16, 64, ss
    FILTER_VER_LUMA_AVX2_NxN 24, 32, ss
    FILTER_VER_LUMA_AVX2_NxN 32, 32, ss
    FILTER_VER_LUMA_AVX2_NxN 32, 64, ss
    FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
    FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
    FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
    FILTER_VER_LUMA_AVX2_NxN 64, 64, ss

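; A 12-wide column is split into one 8-wide block (PROCESS_LUMA_AVX2_W8_16R)
; plus one 4-wide block (PROCESS_LUMA_AVX2_W4_16R, defined earlier in this
; file); the W4 macro expects the sp rounding constant in m7, hence the
; copy from m14.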
%macro FILTER_VER_LUMA_S_AVX2_12x16 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_12x16, 4, 9, 15
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m14, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    PROCESS_LUMA_AVX2_W8_16R %1
%ifidn %1,sp
    add             r2, 8
%else
    add             r2, 16
%endif
    add             r0, 16
    mova            m7, m14
    PROCESS_LUMA_AVX2_W4_16R %1
    RET
%endif
%endmacro

    FILTER_VER_LUMA_S_AVX2_12x16 sp
    FILTER_VER_LUMA_S_AVX2_12x16 ss

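; 16x12 inlines a 12-row variant of PROCESS_LUMA_AVX2_W8_16R: rows 13 and
; 14 only feed the remaining taps (no new [r5] products are started) and
; the tail stops after row 18, yielding twelve output rows per 8-wide
; column across r9d = 2 columns.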
%macro FILTER_VER_LUMA_S_AVX2_16x12 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_16x12, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m14, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    mov             r9d, 2
.loopW:
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 1 * mmsize]
    paddd           m0, m4
    pmaddwd         m2, [r5]
    lea             r7, [r0 + r1 * 4]
    movu            xm4, [r7]                       ; m4 = row 4
    punpckhwd       xm5, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddwd         m5, m3, [r5 + 1 * mmsize]
    paddd           m1, m5
    pmaddwd         m3, [r5]
    movu            xm5, [r7 + r1]                  ; m5 = row 5
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 2 * mmsize]
    paddd           m0, m6
    pmaddwd         m6, m4, [r5 + 1 * mmsize]
    paddd           m2, m6
    pmaddwd         m4, [r5]
    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
    punpckhwd       xm7, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddwd         m7, m5, [r5 + 2 * mmsize]
    paddd           m1, m7
    pmaddwd         m7, m5, [r5 + 1 * mmsize]
    paddd           m3, m7
    pmaddwd         m5, [r5]
    movu            xm7, [r7 + r4]                  ; m7 = row 7
    punpckhwd       xm8, xm6, xm7
    punpcklwd       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddwd         m8, m6, [r5 + 3 * mmsize]
    paddd           m0, m8
    pmaddwd         m8, m6, [r5 + 2 * mmsize]
    paddd           m2, m8
    pmaddwd         m8, m6, [r5 + 1 * mmsize]
    paddd           m4, m8
    pmaddwd         m6, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm8, [r7]                       ; m8 = row 8
    punpckhwd       xm9, xm7, xm8
    punpcklwd       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddwd         m9, m7, [r5 + 3 * mmsize]
    paddd           m1, m9
    pmaddwd         m9, m7, [r5 + 2 * mmsize]
    paddd           m3, m9
    pmaddwd         m9, m7, [r5 + 1 * mmsize]
    paddd           m5, m9
    pmaddwd         m7, [r5]
    movu            xm9, [r7 + r1]                  ; m9 = row 9
    punpckhwd       xm10, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddwd         m10, m8, [r5 + 3 * mmsize]
    paddd           m2, m10
    pmaddwd         m10, m8, [r5 + 2 * mmsize]
    paddd           m4, m10
    pmaddwd         m10, m8, [r5 + 1 * mmsize]
    paddd           m6, m10
    pmaddwd         m8, [r5]
    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
    punpckhwd       xm11, xm9, xm10
    punpcklwd       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddwd         m11, m9, [r5 + 3 * mmsize]
    paddd           m3, m11
    pmaddwd         m11, m9, [r5 + 2 * mmsize]
    paddd           m5, m11
    pmaddwd         m11, m9, [r5 + 1 * mmsize]
    paddd           m7, m11
    pmaddwd         m9, [r5]
    movu            xm11, [r7 + r4]                 ; m11 = row 11
    punpckhwd       xm12, xm10, xm11
    punpcklwd       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddwd         m12, m10, [r5 + 3 * mmsize]
    paddd           m4, m12
    pmaddwd         m12, m10, [r5 + 2 * mmsize]
    paddd           m6, m12
    pmaddwd         m12, m10, [r5 + 1 * mmsize]
    paddd           m8, m12
    pmaddwd         m10, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm12, [r7]                      ; m12 = row 12
    punpckhwd       xm13, xm11, xm12
    punpcklwd       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddwd         m13, m11, [r5 + 3 * mmsize]
    paddd           m5, m13
    pmaddwd         m13, m11, [r5 + 2 * mmsize]
    paddd           m7, m13
    pmaddwd         m13, m11, [r5 + 1 * mmsize]
    paddd           m9, m13
    pmaddwd         m11, [r5]

%ifidn %1,sp
    paddd           m0, m14
    paddd           m1, m14
    paddd           m2, m14
    paddd           m3, m14
    paddd           m4, m14
    paddd           m5, m14
    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12
    psrad           m4, 12
    psrad           m5, 12
%else
    psrad           m0, 6
    psrad           m1, 6
    psrad           m2, 6
    psrad           m3, 6
    psrad           m4, 6
    psrad           m5, 6
%endif
    packssdw        m0, m1
    packssdw        m2, m3
    packssdw        m4, m5

%ifidn %1,sp
    packuswb        m0, m2
    mova            m5, [interp8_hps_shuf]
    vpermd          m0, m5, m0
    vextracti128    xm2, m0, 1
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r6], xm2
%else
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
%endif

    movu            xm13, [r7 + r1]                 ; m13 = row 13
    punpckhwd       xm0, xm12, xm13
    punpcklwd       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddwd         m0, m12, [r5 + 3 * mmsize]
    paddd           m6, m0
    pmaddwd         m0, m12, [r5 + 2 * mmsize]
    paddd           m8, m0
    pmaddwd         m12, [r5 + 1 * mmsize]
    paddd           m10, m12
    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
    punpckhwd       xm1, xm13, xm0
    punpcklwd       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddwd         m1, m13, [r5 + 3 * mmsize]
    paddd           m7, m1
    pmaddwd         m1, m13, [r5 + 2 * mmsize]
    paddd           m9, m1
    pmaddwd         m13, [r5 + 1 * mmsize]
    paddd           m11, m13

%ifidn %1,sp
    paddd           m6, m14
    paddd           m7, m14
    psrad           m6, 12
    psrad           m7, 12
%else
    psrad           m6, 6
    psrad           m7, 6
%endif
    packssdw        m6, m7
    lea             r8, [r2 + r3 * 4]

%ifidn %1,sp
    packuswb        m4, m6
    vpermd          m4, m5, m4
    vextracti128    xm6, m4, 1
    movq            [r8], xm4
    movhps          [r8 + r3], xm4
    movq            [r8 + r3 * 2], xm6
    movhps          [r8 + r6], xm6
%else
    vpermq          m4, m4, 11011000b
    vpermq          m6, m6, 11011000b
    vextracti128    xm1, m4, 1
    vextracti128    xm7, m6, 1
    movu            [r8], xm4
    movu            [r8 + r3], xm1
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm7
%endif

    movu            xm1, [r7 + r4]                  ; m1 = row 15
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m2, m0, [r5 + 3 * mmsize]
    paddd           m8, m2
    pmaddwd         m0, [r5 + 2 * mmsize]
    paddd           m10, m0
    lea             r7, [r7 + r1 * 4]
    movu            xm2, [r7]                       ; m2 = row 16
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m3, m1, [r5 + 3 * mmsize]
    paddd           m9, m3
    pmaddwd         m1, [r5 + 2 * mmsize]
    paddd           m11, m1
    movu            xm3, [r7 + r1]                  ; m3 = row 17
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m2, [r5 + 3 * mmsize]
    paddd           m10, m2
    movu            xm4, [r7 + r1 * 2]              ; m4 = row 18
    punpckhwd       xm2, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm2, 1
    pmaddwd         m3, [r5 + 3 * mmsize]
    paddd           m11, m3

%ifidn %1,sp
    paddd           m8, m14
    paddd           m9, m14
    paddd           m10, m14
    paddd           m11, m14
    psrad           m8, 12
    psrad           m9, 12
    psrad           m10, 12
    psrad           m11, 12
%else
    psrad           m8, 6
    psrad           m9, 6
    psrad           m10, 6
    psrad           m11, 6
%endif
    packssdw        m8, m9
    packssdw        m10, m11
    lea             r8, [r8 + r3 * 4]

%ifidn %1,sp
    packuswb        m8, m10
    vpermd          m8, m5, m8
    vextracti128    xm10, m8, 1
    movq            [r8], xm8
    movhps          [r8 + r3], xm8
    movq            [r8 + r3 * 2], xm10
    movhps          [r8 + r6], xm10
    add             r2, 8
%else
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    movu            [r8], xm8
    movu            [r8 + r3], xm9
    movu            [r8 + r3 * 2], xm10
    movu            [r8 + r6], xm11
    add             r2, 16
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
%endmacro

    FILTER_VER_LUMA_S_AVX2_16x12 sp
    FILTER_VER_LUMA_S_AVX2_16x12 ss

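; 16x4 runs PROCESS_LUMA_S_AVX2_W8_4R over two 8-wide columns.  This
; variant is also built for x86-32, so the column counter lives in the
; gprsize of stack space reserved by cglobal rather than in a spare GPR;
; "lea r6, [8 * r1 - 16]" rewinds the eight rows the macro consumed and
; steps 16 bytes to the next column.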
%macro FILTER_VER_LUMA_S_AVX2_16x4 1
INIT_YMM avx2
cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m7, [pd_526336]
%else
    add             r3d, r3d
%endif
    mov             dword [rsp], 2
.loopW:
    PROCESS_LUMA_S_AVX2_W8_4R %1
    lea             r6, [r3 * 3]
%ifidn %1,sp
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r6], xm2
    add             r2, 8
%else
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    add             r2, 16
%endif
    lea             r6, [8 * r1 - 16]
    sub             r0, r6
    dec             dword [rsp]
    jnz             .loopW
    RET
%endmacro

    FILTER_VER_LUMA_S_AVX2_16x4 sp
    FILTER_VER_LUMA_S_AVX2_16x4 ss

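; Eight output rows of the vertical 8-tap luma filter for one 8-wide
; column of 16-bit intermediates, stores included.  Same register layout
; as PROCESS_LUMA_AVX2_W8_16R except the sp rounding constant is taken
; from m11; r7/r8 are again left inside the column for the caller's
; pointer arithmetic.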
%macro PROCESS_LUMA_S_AVX2_W8_8R 1
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhwd       xm2, xm0, xm1
    punpcklwd       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddwd         m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhwd       xm3, xm1, xm2
    punpcklwd       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddwd         m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhwd       xm4, xm2, xm3
    punpcklwd       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddwd         m4, m2, [r5 + 1 * mmsize]
    paddd           m0, m4
    pmaddwd         m2, [r5]
    lea             r7, [r0 + r1 * 4]
    movu            xm4, [r7]                       ; m4 = row 4
    punpckhwd       xm5, xm3, xm4
    punpcklwd       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddwd         m5, m3, [r5 + 1 * mmsize]
    paddd           m1, m5
    pmaddwd         m3, [r5]
    movu            xm5, [r7 + r1]                  ; m5 = row 5
    punpckhwd       xm6, xm4, xm5
    punpcklwd       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddwd         m6, m4, [r5 + 2 * mmsize]
    paddd           m0, m6
    pmaddwd         m6, m4, [r5 + 1 * mmsize]
    paddd           m2, m6
    pmaddwd         m4, [r5]
    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
    punpckhwd       xm7, xm5, xm6
    punpcklwd       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddwd         m7, m5, [r5 + 2 * mmsize]
    paddd           m1, m7
    pmaddwd         m7, m5, [r5 + 1 * mmsize]
    paddd           m3, m7
    pmaddwd         m5, [r5]
    movu            xm7, [r7 + r4]                  ; m7 = row 7
    punpckhwd       xm8, xm6, xm7
    punpcklwd       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddwd         m8, m6, [r5 + 3 * mmsize]
    paddd           m0, m8
    pmaddwd         m8, m6, [r5 + 2 * mmsize]
    paddd           m2, m8
    pmaddwd         m8, m6, [r5 + 1 * mmsize]
    paddd           m4, m8
    pmaddwd         m6, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm8, [r7]                       ; m8 = row 8
    punpckhwd       xm9, xm7, xm8
    punpcklwd       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddwd         m9, m7, [r5 + 3 * mmsize]
    paddd           m1, m9
    pmaddwd         m9, m7, [r5 + 2 * mmsize]
    paddd           m3, m9
    pmaddwd         m9, m7, [r5 + 1 * mmsize]
    paddd           m5, m9
    pmaddwd         m7, [r5]
    movu            xm9, [r7 + r1]                  ; m9 = row 9
    punpckhwd       xm10, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddwd         m10, m8, [r5 + 3 * mmsize]
    paddd           m2, m10
    pmaddwd         m10, m8, [r5 + 2 * mmsize]
    paddd           m4, m10
    pmaddwd         m8, [r5 + 1 * mmsize]
    paddd           m6, m8
    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
    punpckhwd       xm8, xm9, xm10
    punpcklwd       xm9, xm10
    vinserti128     m9, m9, xm8, 1
    pmaddwd         m8, m9, [r5 + 3 * mmsize]
    paddd           m3, m8
    pmaddwd         m8, m9, [r5 + 2 * mmsize]
    paddd           m5, m8
    pmaddwd         m9, [r5 + 1 * mmsize]
    paddd           m7, m9
    movu            xm8, [r7 + r4]                  ; m8 = row 11
    punpckhwd       xm9, xm10, xm8
    punpcklwd       xm10, xm8
    vinserti128     m10, m10, xm9, 1
    pmaddwd         m9, m10, [r5 + 3 * mmsize]
    paddd           m4, m9
    pmaddwd         m10, [r5 + 2 * mmsize]
    paddd           m6, m10
    lea             r7, [r7 + r1 * 4]
    movu            xm9, [r7]                       ; m9 = row 12
    punpckhwd       xm10, xm8, xm9
    punpcklwd       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddwd         m10, m8, [r5 + 3 * mmsize]
    paddd           m5, m10
    pmaddwd         m8, [r5 + 2 * mmsize]
    paddd           m7, m8

%ifidn %1,sp
    paddd           m0, m11
    paddd           m1, m11
    paddd           m2, m11
    paddd           m3, m11
    paddd           m4, m11
    paddd           m5, m11
    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12
    psrad           m4, 12
    psrad           m5, 12
%else
    psrad           m0, 6
    psrad           m1, 6
    psrad           m2, 6
    psrad           m3, 6
    psrad           m4, 6
    psrad           m5, 6
%endif
    packssdw        m0, m1
    packssdw        m2, m3
    packssdw        m4, m5

%ifidn %1,sp
    packuswb        m0, m2
    mova            m5, [interp8_hps_shuf]
    vpermd          m0, m5, m0
    vextracti128    xm2, m0, 1
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r6], xm2
%else
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
%endif

    movu            xm10, [r7 + r1]                 ; m10 = row 13
    punpckhwd       xm0, xm9, xm10
    punpcklwd       xm9, xm10
    vinserti128     m9, m9, xm0, 1
    pmaddwd         m9, [r5 + 3 * mmsize]
    paddd           m6, m9
    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
    punpckhwd       xm1, xm10, xm0
    punpcklwd       xm10, xm0
    vinserti128     m10, m10, xm1, 1
    pmaddwd         m10, [r5 + 3 * mmsize]
    paddd           m7, m10

%ifidn %1,sp
    paddd           m6, m11
    paddd           m7, m11
    psrad           m6, 12
    psrad           m7, 12
%else
    psrad           m6, 6
    psrad           m7, 6
%endif
    packssdw        m6, m7
    lea             r8, [r2 + r3 * 4]

%ifidn %1,sp
    packuswb        m4, m6
    vpermd          m4, m5, m4
    vextracti128    xm6, m4, 1
    movq            [r8], xm4
    movhps          [r8 + r3], xm4
    movq            [r8 + r3 * 2], xm6
    movhps          [r8 + r6], xm6
%else
    vpermq          m4, m4, 11011000b
    vpermq          m6, m6, 11011000b
    vextracti128    xm5, m4, 1
    vextracti128    xm7, m6, 1
    movu            [r8], xm4
    movu            [r8 + r3], xm5
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm7
%endif
%endmacro

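; %2x8 sizes loop PROCESS_LUMA_S_AVX2_W8_8R across %2/8 columns with the
; same 8-byte (sp) / 16-byte (ss) dst stepping as the Nx16 wrapper.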
%macro FILTER_VER_LUMA_AVX2_Nx8 2
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m11, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    mov             r9d, %2 / 8
.loopW:
    PROCESS_LUMA_S_AVX2_W8_8R %1
%ifidn %1,sp
    add             r2, 8
%else
    add             r2, 16
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
%endmacro

    FILTER_VER_LUMA_AVX2_Nx8 sp, 32
    FILTER_VER_LUMA_AVX2_Nx8 sp, 16
    FILTER_VER_LUMA_AVX2_Nx8 ss, 32
    FILTER_VER_LUMA_AVX2_Nx8 ss, 16

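; 32x24 = a 16-row pass of PROCESS_LUMA_AVX2_W8_16R over four columns
; followed by an 8-row pass of PROCESS_LUMA_S_AVX2_W8_8R over the same
; columns.  "mova m11, m14" hands the sp rounding constant to the 8-row
; macro, and the r7/r8 adjustments land r0/r2 sixteen rows below the
; block start before the second pass.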
%macro FILTER_VER_LUMA_S_AVX2_32x24 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_32x24, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7
    add             r1d, r1d

%ifdef PIC
    lea             r5, [pw_LumaCoeffVer]
    add             r5, r4
%else
    lea             r5, [pw_LumaCoeffVer + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4
%ifidn %1,sp
    mova            m14, [pd_526336]
%else
    add             r3d, r3d
%endif
    lea             r6, [r3 * 3]
    mov             r9d, 4
.loopW:
    PROCESS_LUMA_AVX2_W8_16R %1
%ifidn %1,sp
    add             r2, 8
%else
    add             r2, 16
%endif
    add             r0, 16
    dec             r9d
    jnz             .loopW
    lea             r9, [r1 * 4]
    sub             r7, r9
    lea             r0, [r7 - 48]
%ifidn %1,sp
    lea             r2, [r8 + r3 * 4 - 24]
%else
    lea             r2, [r8 + r3 * 4 - 48]
%endif
    mova            m11, m14
    mov             r9d, 4
.loop:
    PROCESS_LUMA_S_AVX2_W8_8R %1
%ifidn %1,sp
    add             r2, 8
%else
    add             r2, 16
%endif
    add             r0, 16
    dec             r9d
    jnz             .loop
    RET
%endif
%endmacro

    FILTER_VER_LUMA_S_AVX2_32x24 sp
    FILTER_VER_LUMA_S_AVX2_32x24 ss
;-------------------------------------------------------------------------------------------------------------
; ipfilter_chroma_avx512 code start
;-------------------------------------------------------------------------------------------------------------
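; The pp kernels below work in four steps: pshufb with the two
; interp4_horiz_shuf_load tables spreads each source row into the two
; 4-tap phase orders, pmaddubsw applies the byte coefficients, pmaddwd
; against pw_1 folds the word pairs into full 4-tap sums, and pmulhrsw
; against pw_512 performs the rounded shift
; (x * 512 + 16384) >> 15 == (x + 32) >> 6 before packuswb clamps back
; to bytes.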
%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0
    ; register map
    ; m0 - interpolate coeff
    ; m1, m2 - shuffle order table
    ; m3 - constant word 1
    ; m4 - constant word 512

    movu               m5,           [r0]
    pshufb             m6,           m5,       m2
    pshufb             m5,           m5,       m1
    pmaddubsw          m5,           m0
    pmaddubsw          m6,           m0
    pmaddwd            m5,           m3
    pmaddwd            m6,           m3

    movu               m7,           [r0 + 4]
    pshufb             m8,           m7,       m2
    pshufb             m7,           m7,       m1
    pmaddubsw          m7,           m0
    pmaddubsw          m8,           m0
    pmaddwd            m7,           m3
    pmaddwd            m8,           m3

    packssdw           m5,           m7
    packssdw           m6,           m8
    pmulhrsw           m5,           m4
    pmulhrsw           m6,           m4
    packuswb           m5,           m6
    movu              [r2],          m5
%endmacro

%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
    ; register map
    ; m0 - interpolate coeff
    ; m1, m2 - shuffle order table
    ; m3 - constant word 1
    ; m4 - constant word 512

    movu              ym5,           [r0]
    vinserti32x8       m5,           [r0 + r1], 1
    movu              ym7,           [r0 + 4]
    vinserti32x8       m7,           [r0 + r1 + 4], 1

    pshufb             m6,           m5,       m2
    pshufb             m5,           m1
    pshufb             m8,           m7,       m2
    pshufb             m7,           m1

    pmaddubsw          m5,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m5,           m3
    pmaddwd            m7,           m3

    pmaddubsw          m6,           m0
    pmaddubsw          m8,           m0
    pmaddwd            m6,           m3
    pmaddwd            m8,           m3

    packssdw           m5,           m7
    packssdw           m6,           m8
    pmulhrsw           m5,           m4
    pmulhrsw           m6,           m4
    packuswb           m5,           m6
    movu             [r2],          ym5
    vextracti32x8    [r2 + r3],      m5,            1
%endmacro

%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0
    ; register map
    ; m0 - interpolate coeff
    ; m1, m2 - shuffle order table
    ; m3 - constant word 1
    ; m4 - constant word 512

    movu              xm5,           [r0]
    vinserti32x4       m5,           [r0 + r1],            1
    vinserti32x4       m5,           [r0 + 2 * r1],        2
    vinserti32x4       m5,           [r0 + r6],            3
    pshufb             m6,           m5,       m2
    pshufb             m5,           m1

    movu              xm7,           [r0 + 4]
    vinserti32x4       m7,           [r0 + r1 + 4],        1
    vinserti32x4       m7,           [r0 + 2 * r1 + 4],    2
    vinserti32x4       m7,           [r0 + r6 + 4],        3
    pshufb             m8,           m7,       m2
    pshufb             m7,           m1

    pmaddubsw          m5,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m5,           m3
    pmaddwd            m7,           m3

    pmaddubsw          m6,           m0
    pmaddubsw          m8,           m0
    pmaddwd            m6,           m3
    pmaddwd            m8,           m3

    packssdw           m5,           m7
    packssdw           m6,           m8
    pmulhrsw           m5,           m4
    pmulhrsw           m6,           m4
    packuswb           m5,           m6
    movu              [r2],          xm5
    vextracti32x4     [r2 + r3],     m5,       1
    vextracti32x4     [r2 + 2 * r3], m5,       2
    vextracti32x4     [r2 + r7],     m5,       3
%endmacro

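; A 48-wide row is split as 32 + 16: the first 32 pixels of four rows are
; handled as two 32x2 passes (two ymm rows per zmm) and the remaining 16
; pixels of all four rows are gathered into one zmm with vinserti32x4,
; exactly as in the 16x4 macro above.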
%macro PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512 0
    ; register map
    ; m0 - interpolate coeff
    ; m1, m2 - shuffle order table
    ; m3 - constant word 1
    ; m4 - constant word 512
    movu              ym5,           [r0]
    vinserti32x8       m5,           [r0 + r1], 1
    movu              ym7,           [r0 + 4]
    vinserti32x8       m7,           [r0 + r1 + 4], 1

    pshufb             m6,           m5,           m2
    pshufb             m5,           m1
    pshufb             m8,           m7,           m2
    pshufb             m7,           m1

    pmaddubsw          m5,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m5,           m3
    pmaddwd            m7,           m3

    pmaddubsw          m6,           m0
    pmaddubsw          m8,           m0
    pmaddwd            m6,           m3
    pmaddwd            m8,           m3

    packssdw           m5,           m7
    packssdw           m6,           m8
    pmulhrsw           m5,           m4
    pmulhrsw           m6,           m4
    packuswb           m5,           m6
    movu             [r2],          ym5
    vextracti32x8    [r2 + r3],      m5,            1

    movu              ym5,           [r0 + 2 * r1]
    vinserti32x8       m5,           [r0 + r6], 1
    movu              ym7,           [r0 + 2 * r1 + 4]
    vinserti32x8       m7,           [r0 + r6 + 4], 1

    pshufb             m6,           m5,           m2
    pshufb             m5,           m1
    pshufb             m8,           m7,           m2
    pshufb             m7,           m1

    pmaddubsw          m5,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m5,           m3
    pmaddwd            m7,           m3

    pmaddubsw          m6,           m0
    pmaddubsw          m8,           m0
    pmaddwd            m6,           m3
    pmaddwd            m8,           m3

    packssdw           m5,           m7
    packssdw           m6,           m8
    pmulhrsw           m5,           m4
    pmulhrsw           m6,           m4
    packuswb           m5,           m6
    movu             [r2 + 2 * r3], ym5
    vextracti32x8    [r2 + r7],      m5,            1

    movu              xm5,           [r0 + mmsize/2]
    vinserti32x4       m5,           [r0 + r1 + mmsize/2],            1
    vinserti32x4       m5,           [r0 + 2 * r1 + mmsize/2],        2
    vinserti32x4       m5,           [r0 + r6 + mmsize/2],            3
    pshufb             m6,           m5,       m2
    pshufb             m5,           m1

    movu              xm7,           [r0 + mmsize/2 + 4]
    vinserti32x4       m7,           [r0 + r1 + mmsize/2 + 4],        1
    vinserti32x4       m7,           [r0 + 2 * r1 + mmsize/2 + 4],    2
    vinserti32x4       m7,           [r0 + r6 + mmsize/2 + 4],        3
    pshufb             m8,           m7,       m2
    pshufb             m7,           m1

    pmaddubsw          m5,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m5,           m3
    pmaddwd            m7,           m3

    pmaddubsw          m6,           m0
    pmaddubsw          m8,           m0
    pmaddwd            m6,           m3
    pmaddwd            m8,           m3

    packssdw           m5,           m7
    packssdw           m6,           m8
    pmulhrsw           m5,           m4
    pmulhrsw           m6,           m4
    packuswb           m5,           m6
    movu              [r2 + mmsize/2],          xm5
    vextracti32x4     [r2 + r3 + mmsize/2],     m5,       1
    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m5,       2
    vextracti32x4     [r2 + r7 + mmsize/2],     m5,       3
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_64xN(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PP_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_64x%1, 4,6,9
    mov               r4d,               r4m

%ifdef PIC
    lea               r5,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r5 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8   m3,           [pw_1]
    vbroadcasti32x8   m4,           [pw_512]
    dec               r0

%rep %1 - 1
    PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
    lea               r2,           [r2 + r3]
    lea               r0,           [r0 + r1]
%endrep
    PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PP_64xN_AVX512  64
    IPFILTER_CHROMA_PP_64xN_AVX512  32
    IPFILTER_CHROMA_PP_64xN_AVX512  48
    IPFILTER_CHROMA_PP_64xN_AVX512  16
%endif

%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_32x%1, 4,6,9
    mov               r4d,               r4m

%ifdef PIC
    lea               r5,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r5 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8   m3,           [pw_1]
    vbroadcasti32x8   m4,           [pw_512]
    dec               r0

%rep %1/2 - 1
    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
    lea               r2,           [r2 + 2 * r3]
    lea               r0,           [r0 + 2 * r1]
%endrep
    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PP_32xN_AVX512 16
    IPFILTER_CHROMA_PP_32xN_AVX512 24
    IPFILTER_CHROMA_PP_32xN_AVX512 8
    IPFILTER_CHROMA_PP_32xN_AVX512 32
    IPFILTER_CHROMA_PP_32xN_AVX512 64
    IPFILTER_CHROMA_PP_32xN_AVX512 48
%endif

%macro IPFILTER_CHROMA_PP_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_16x%1, 4,8,9
    mov               r4d,          r4m
    lea               r6,           [3 * r1]
    lea               r7,           [3 * r3]
%ifdef PIC
    lea               r5,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r5 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8   m3,           [pw_1]
    vbroadcasti32x8   m4,           [pw_512]
    dec               r0

%rep %1/4 - 1
    PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
    lea               r2,           [r2 + 4 * r3]
    lea               r0,           [r0 + 4 * r1]
%endrep
    PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PP_16xN_AVX512 4
    IPFILTER_CHROMA_PP_16xN_AVX512 8
    IPFILTER_CHROMA_PP_16xN_AVX512 12
    IPFILTER_CHROMA_PP_16xN_AVX512 16
    IPFILTER_CHROMA_PP_16xN_AVX512 24
    IPFILTER_CHROMA_PP_16xN_AVX512 32
    IPFILTER_CHROMA_PP_16xN_AVX512 64
%endif

%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_4tap_horiz_pp_48x64, 4,8,9
    mov               r4d,          r4m
    lea               r6,           [3 * r1]
    lea               r7,           [3 * r3]
%ifdef PIC
    lea               r5,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r5 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8   m3,           [pw_1]
    vbroadcasti32x8   m4,           [pw_512]
    dec               r0

%rep 15
    PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
    lea               r2,           [r2 + 4 * r3]
    lea               r0,           [r0 + 4 * r1]
%endrep
    PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
    RET
%endif

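; The ps kernels keep 16-bit output: after the same madd chain as the pp
; path they subtract pw_2000 (0x2000 = 8192, the intermediate-format
; offset) instead of rounding and packing to bytes, and a vpermq with a
; stored shuffle table puts the cross-lane packssdw results back into
; linear pixel order.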
%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
    movu               ym6,          [r0]
    vinserti32x8       m6,           [r0 + 4], 1
    pshufb             m7,           m6,       m2
    pshufb             m6,           m1
    pmaddubsw          m6,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m6,           m3
    pmaddwd            m7,           m3

    movu               ym8,          [r0 + 32]
    vinserti32x8       m8,           [r0 + 36], 1
    pshufb             m9,           m8,       m2
    pshufb             m8,           m1
    pmaddubsw          m8,           m0
    pmaddubsw          m9,           m0
    pmaddwd            m8,           m3
    pmaddwd            m9,           m3

    packssdw           m6,           m7
    packssdw           m8,           m9
    psubw              m6,           m4
    psubw              m8,           m4
    vpermq             m6,           m10,       m6
    vpermq             m8,           m10,       m8
    movu               [r2],         m6
    movu               [r2 + mmsize],m8
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PS_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_64x%1, 4,7,11
    mov             r4d, r4m
    mov             r5d, r5m

%ifdef PIC
    lea               r6,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r6 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8    m3,           [pw_1]
    vbroadcasti32x8    m4,           [pw_2000]
    mova               m10,          [interp4_hps_shuf_avx512]

    ; register map
    ; m0    - interpolate coeff
    ; m1,m2 - load shuffle order table
    ; m3    - constant word 1
    ; m4    - constant word 0x2000
    ; m10   - store shuffle order table

    mov               r6d,         %1
    dec               r0
    test              r5d,         r5d
    je                .loop
    sub               r0,          r1
    add               r6d,         3

.loop:
    PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512
    lea               r2,           [r2 + 2 * r3]
    lea               r0,           [r0 + r1]
    dec               r6d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PS_64xN_AVX512 64
    IPFILTER_CHROMA_PS_64xN_AVX512 32
    IPFILTER_CHROMA_PS_64xN_AVX512 48
    IPFILTER_CHROMA_PS_64xN_AVX512 16
%endif

%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
    movu               ym6,          [r0]
    vinserti32x8       m6,           [r0 + 4], 1
    pshufb             m7,           m6,       m2
    pshufb             m6,           m6,       m1
    pmaddubsw          m6,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m6,           m3
    pmaddwd            m7,           m3

    packssdw           m6,           m7
    psubw              m6,           m4
    vpermq             m6,           m8,       m6
    movu               [r2],         m6
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PS_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
    mov             r4d, r4m
    mov             r5d, r5m

%ifdef PIC
    lea               r6,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r6 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8    m3,           [pw_1]
    vbroadcasti32x8    m4,           [pw_2000]
    mova               m8,           [interp4_hps_shuf_avx512]

    ; register map
    ; m0    - interpolate coeff
    ; m1,m2 - load shuffle order table
    ; m3    - constant word 1
    ; m4    - constant word 0x2000
    ; m8    - store shuffle order table

    mov               r6d,         %1
    dec               r0
    test              r5d,         r5d
    je                .loop
    sub               r0,          r1
    add               r6d,         3

.loop:
    PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512
    lea               r2,           [r2 + 2 * r3]
    lea               r0,           [r0 + r1]
    dec               r6d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PS_32xN_AVX512 64
    IPFILTER_CHROMA_PS_32xN_AVX512 48
    IPFILTER_CHROMA_PS_32xN_AVX512 32
    IPFILTER_CHROMA_PS_32xN_AVX512 24
    IPFILTER_CHROMA_PS_32xN_AVX512 16
    IPFILTER_CHROMA_PS_32xN_AVX512 8
%endif

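; Process two 16-pixel rows per iteration: each row occupies two 128-bit lanes
; (loaded at x-1 and x+3), so one zmm carries both rows through the filter and
; the two 256-bit halves are stored to consecutive output rows.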
%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
    movu               xm6,         [r0]
    vinserti32x4       m6,          [r0 + 4],      1
    vinserti32x4       m6,          [r0 + r1],     2
    vinserti32x4       m6,          [r0 + r1 + 4], 3

    pshufb             m7,          m6,            m2
    pshufb             m6,          m6,            m1
    pmaddubsw          m6,          m0
    pmaddubsw          m7,          m0
    pmaddwd            m6,          m3
    pmaddwd            m7,          m3

    packssdw           m6,          m7
    psubw              m6,          m4
    vpermq             m6,          m8,            m6
    movu               [r2],        ym6
    vextracti32x8      [r2 + r3],   m6,            1
%endmacro

%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
    movu              xm6,          [r0]
    vinserti32x4      m6,           [r0 + 4],  1

    pshufb            ym7,          ym6,       ym2
    pshufb            ym6,          ym6,       ym1
    pmaddubsw         ym6,          ym0
    pmaddubsw         ym7,          ym0
    pmaddwd           ym6,          ym3
    pmaddwd           ym7,          ym3

    packssdw          ym6,          ym7
    psubw             ym6,          ym4
    vpermq            ym6,          ym8,       ym6
    movu              [r2],         ym6
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PS_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
    mov             r4d, r4m
    mov             r5d, r5m
    add             r3,  r3

%ifdef PIC
    lea               r6,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r6 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8    m1,          [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8    m2,          [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8    m3,          [pw_1]
    vbroadcasti32x8    m4,          [pw_2000]
    mova               m8,          [interp4_hps_store_16xN_avx512]

    ; register map
    ; m0    - interpolate coeff
    ; m1,m2 - load shuffle order table
    ; m3    - constant word 1
    ; m4    - constant word 2000
    ; m8    - store shuffle order table

    mov               r6d,          %1
    dec               r0
    test              r5d,          r5d
    je                .loop
    sub               r0,           r1
    add               r6d,          3
    PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512
    lea               r2,           [r2 + r3]
    lea               r0,           [r0 + r1]
    dec               r6d

.loop:
    PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512
    lea               r2,           [r2 + 2 * r3]
    lea               r0,           [r0 + 2 * r1]
    sub               r6d,          2
    jnz               .loop

    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PS_16xN_AVX512 64
    IPFILTER_CHROMA_PS_16xN_AVX512 32
    IPFILTER_CHROMA_PS_16xN_AVX512 24
    IPFILTER_CHROMA_PS_16xN_AVX512 16
    IPFILTER_CHROMA_PS_16xN_AVX512 12
    IPFILTER_CHROMA_PS_16xN_AVX512 8
    IPFILTER_CHROMA_PS_16xN_AVX512 4
%endif

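; 48-pixel rows split into a 32-pixel body that follows the full-zmm path
; above and a 16-pixel tail filtered in ymm registers; each part has its own
; store-permute table (m8 for the body, m9 for the tail).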
%macro PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512 0
    movu               ym6,          [r0]
    vinserti32x8       m6,           [r0 + 4], 1
    pshufb             m7,           m6,       m2
    pshufb             m6,           m6,       m1
    pmaddubsw          m6,           m0
    pmaddubsw          m7,           m0
    pmaddwd            m6,           m3
    pmaddwd            m7,           m3

    packssdw           m6,           m7
    psubw              m6,           m4
    vpermq             m6,           m8,       m6
    movu               [r2],         m6

    movu              xm6,          [r0 + 32]
    vinserti32x4      m6,           [r0 + 36], 1
    pshufb            ym7,          ym6,       ym2
    pshufb            ym6,          ym6,       ym1
    pmaddubsw         ym6,          ym0
    pmaddubsw         ym7,          ym0
    pmaddwd           ym6,          ym3
    pmaddwd           ym7,          ym3

    packssdw          ym6,          ym7
    psubw             ym6,          ym4
    vpermq            ym6,          ym9,       ym6
    movu              [r2 + mmsize], ym6
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PS_48xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_48x%1, 4,7,10
    mov             r4d, r4m
    mov             r5d, r5m

%ifdef PIC
    lea               r6,           [tab_ChromaCoeff]
    vpbroadcastd      m0,           [r6 + r4 * 4]
%else
    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
    vbroadcasti32x8    m3,           [pw_1]
    vbroadcasti32x8    m4,           [pw_2000]
    mova               m8,           [interp4_hps_shuf_avx512]
    mova               m9,           [interp4_hps_store_16xN_avx512]

    ; register map
    ; m0    - interpolate coeff
    ; m1,m2 - load shuffle order table
    ; m3    - constant word 1
    ; m4    - constant word 2000
    ; m8    - store shuffle order table (32-pixel body)
    ; m9    - store shuffle order table (16-pixel tail)

    mov               r6d,         %1
    dec               r0
    test              r5d,         r5d
    je                .loop
    sub               r0,          r1
    add               r6d,         3

.loop:
    PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512
    lea               r2,           [r2 + 2 * r3]
    lea               r0,           [r0 + r1]
    dec               r6d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64
    IPFILTER_CHROMA_PS_48xN_AVX512 64
%endif

;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vpp and chroma_vps code start
;-------------------------------------------------------------------------------------------------------------
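; Vertical 4-tap scheme: r4d << 7 indexes tab_ChromaCoeffVer_32_avx512, where
; each filter occupies two zmm constants - taps (c0,c1) interleaved in the
; first, (c2,c3) in the second. punpcklbw/punpckhbw interleave adjacent source
; rows so one pmaddubsw yields c0*row[n] + c1*row[n+1] (likewise for c2/c3),
; and paddw merges the two partial sums. pp rounds with pmulhrsw by pw_512,
; i.e. (sum + 32) >> 6, and packs to bytes; ps doubles the destination stride,
; subtracts pw_2000 and reorders the packed lanes back into row order with the
; vpermi2q store tables.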
%macro PROCESS_CHROMA_VERT_16x4_AVX512 1
    lea                   r5,                 [r0 + 4 * r1]
    movu                  xm1,                [r0]
    movu                  xm3,                [r0 + r1]
    vinserti32x4          m1,                 [r0 + r1],           1
    vinserti32x4          m3,                 [r0 + 2 * r1],       1
    vinserti32x4          m1,                 [r0 + 2 * r1],       2
    vinserti32x4          m3,                 [r0 + r6],           2
    vinserti32x4          m1,                 [r0 + r6],           3
    vinserti32x4          m3,                 [r0 + 4 * r1],       3

    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  xm4,                [r0 + 2 * r1]
    movu                  xm5,                [r0 + r6]
    vinserti32x4          m4,                 [r0 + r6],           1
    vinserti32x4          m5,                 [r5],                1
    vinserti32x4          m4,                 [r5],                2
    vinserti32x4          m5,                 [r5 + r1],           2
    vinserti32x4          m4,                 [r5 + r1],           3
    vinserti32x4          m5,                 [r5 + 2 * r1],       3

    punpcklbw             m3,                 m4,                  m5
    pmaddubsw             m3,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9

    paddw                 m0,                 m3
    paddw                 m1,                 m4
%ifidn %1,pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    packuswb              m0,                 m1
    movu                  [r2],               xm0
    vextracti32x4         [r2 + r3],          m0,                  1
    vextracti32x4         [r2 + 2 * r3],      m0,                  2
    vextracti32x4         [r2 + r7],          m0,                  3
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m2,                 m10
    mova                  m3,                 m11

    vpermi2q              m2,                 m0,                  m1
    vpermi2q              m3,                 m0,                  m1

    movu                  [r2],               ym2
    vextracti32x8         [r2 + r3],          m2,                  1
    movu                  [r2 + 2 * r3],      ym3
    vextracti32x8         [r2 + r7],          m3,                  1
%endif
%endmacro

;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_pp(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_ps(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VERT_CHROMA_16xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_16x%2, 4, 10, 12
    mov                   r4d,                r4m
    shl                   r4d,                7
    sub                   r0,                 r1

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    mova                  m8,                 [tab_ChromaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
    mova                  m10,                [interp4_vps_store1_avx512]
    mova                  m11,                [interp4_vps_store2_avx512]
%endif
    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_16x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_16x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 4
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 8
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 12
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 16
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 24
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 32
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 64

    FILTER_VERT_CHROMA_16xN_AVX512 ps, 4
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 8
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 12
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 16
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 24
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 32
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 64
%endif
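; Process four 32-pixel rows: rows n and n+2 travel together in one zmm (one
; row per 256-bit half), so the seven source rows feeding four outputs are
; filtered two output rows at a time.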
%macro PROCESS_CHROMA_VERT_32x4_AVX512 1
    movu                  ym1,                [r0]
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m1,                 [r0 + 2 * r1],       1
    vinserti32x8          m3,                 [r0 + r6],           1
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    lea                   r0,                 [r0 + 2 * r1]

    movu                  ym5,                [r0 + r1]
    vinserti32x8          m5,                 [r0 + r6],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    paddw                 m0,                 m6
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9
    paddw                 m1,                 m4

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    paddw                 m2,                 m6
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9
    paddw                 m3,                 m5

%ifidn %1,pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7
    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r7],          m2,                  1
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    psubw                 m2,                 m7
    psubw                 m3,                 m7

    mova                  m4,                 m10
    mova                  m5,                 m11
    vpermi2q              m4,                 m0,                  m1
    vpermi2q              m5,                 m0,                  m1
    mova                  m6,                 m10
    mova                  m12,                m11
    vpermi2q              m6,                 m2,                  m3
    vpermi2q              m12,                m2,                  m3

    movu                  [r2],               m4
    movu                  [r2 + r3],          m6
    movu                  [r2 + 2 * r3],      m5
    movu                  [r2 + r7],          m12
%endif
%endmacro

;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_pp(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_ps(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VERT_CHROMA_32xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_32x%2, 4, 8, 13
    mov                   r4d,                r4m
    shl                   r4d,                7
    sub                   r0,                 r1

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    mova                  m8,                 [tab_ChromaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1,pp
    vbroadcasti32x8       m7,                [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                [pw_2000]
    mova                  m10,                [interp4_vps_store1_avx512]
    mova                  m11,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_32x4_AVX512 %1
    lea                   r0,                 [r0 + 2 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_32x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 8
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 16
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 24
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 32
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 48
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 64

    FILTER_VERT_CHROMA_32xN_AVX512 ps, 8
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 16
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 24
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 32
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 48
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 64
%endif
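; 48-pixel rows: the first 32 columns use the zmm row-pairing above; the
; remaining 16 columns are gathered at byte offset mmsize/2, four rows per zmm
; (one 16-byte row per 128-bit lane), and stored separately.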
%macro PROCESS_CHROMA_VERT_48x4_AVX512 1
    movu                  ym1,                [r0]
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m1,                 [r0 + 2 * r1],       1
    vinserti32x8          m3,                 [r0 + r6],           1
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    lea                   r5,                 [r0 + 4 * r1]

    movu                  ym5,                [r0 + r6]
    vinserti32x8          m5,                 [r5 + r1],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    paddw                 m0,                 m6
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9
    paddw                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1]
    vinserti32x8          m4,                 [r5 + 2 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    paddw                 m2,                 m6
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9
    paddw                 m3,                 m5
%ifidn %1, pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7

    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r7],          m2,                  1
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    psubw                 m2,                 m7
    psubw                 m3,                 m7

    mova                  m4,                 m10
    mova                  m5,                 m11
    vpermi2q              m4,                 m0,                m1
    vpermi2q              m5,                 m0,                m1
    mova                  m6,                 m10
    mova                  m12,                m11
    vpermi2q              m6,                 m2,                m3
    vpermi2q              m12,                m2,                m3

    movu                  [r2],               m4
    movu                  [r2 + r3],          m6
    movu                  [r2 + 2 * r3],      m5
    movu                  [r2 + r7],          m12
%endif
    movu                  xm1,                [r0 + mmsize/2]
    movu                  xm3,                [r0 + r1 + mmsize/2]
    vinserti32x4          m1,                 [r0 + r1 + mmsize/2],           1
    vinserti32x4          m3,                 [r0 + 2 * r1 + mmsize/2],       1
    vinserti32x4          m1,                 [r0 + 2 * r1 + mmsize/2],       2
    vinserti32x4          m3,                 [r0 + r6 + mmsize/2],           2
    vinserti32x4          m1,                 [r0 + r6 + mmsize/2],           3
    vinserti32x4          m3,                 [r0 + 4 * r1 + mmsize/2],       3

    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
    movu                  xm5,                [r0 + r6 + mmsize/2]
    vinserti32x4          m4,                 [r0 + r6 + mmsize/2],           1
    vinserti32x4          m5,                 [r5 + mmsize/2],                1
    vinserti32x4          m4,                 [r5 + mmsize/2],                2
    vinserti32x4          m5,                 [r5 + r1 + mmsize/2],           2
    vinserti32x4          m4,                 [r5 + r1 + mmsize/2],           3
    vinserti32x4          m5,                 [r5 + 2 * r1 + mmsize/2],       3

    punpcklbw             m3,                 m4,                  m5
    pmaddubsw             m3,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9
    paddw                 m0,                 m3
    paddw                 m1,                 m4
%ifidn %1, pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    packuswb              m0,                 m1
    movu                  [r2 + mmsize/2],               xm0
    vextracti32x4         [r2 + r3 + mmsize/2],          m0,                  1
    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  2
    vextracti32x4         [r2 + r7 + mmsize/2],          m0,                  3
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m2,                 m10
    mova                  m3,                 m11

    vpermi2q              m2,                 m0,                  m1
    vpermi2q              m3,                 m0,                  m1

    movu                  [r2 + mmsize],               ym2
    vextracti32x8         [r2 + r3 + mmsize],          m2,                  1
    movu                  [r2 + 2 * r3 + mmsize],      ym3
    vextracti32x8         [r2 + r7 + mmsize],          m3,                  1
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_pp(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_ps(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VERT_CHROMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_48x64, 4, 8, 13
    mov                   r4d,                r4m
    shl                   r4d,                7
    sub                   r0,                 r1

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    mova                  m8,                 [tab_ChromaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
    mova                  m10,                [interp4_vps_store1_avx512]
    mova                  m11,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]
%rep 15
    PROCESS_CHROMA_VERT_48x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_48x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VERT_CHROMA_48x64_AVX512 pp
    FILTER_VERT_CHROMA_48x64_AVX512 ps
%endif
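; Process four 64-pixel rows: each source row fills a whole zmm, and a sliding
; window over seven row loads produces four output rows - every row-pair
; interleave is multiplied once by (c0,c1) for one output and reused with
; (c2,c3) for the output two rows earlier.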
%macro PROCESS_CHROMA_VERT_64x4_AVX512 1
    movu              m0,              [r0]                        ; m0 = row 0
    movu              m1,              [r0 + r1]                   ; m1 = row 1
    punpcklbw         m2,              m0,                m1
    punpckhbw         m3,              m0,                m1
    pmaddubsw         m2,              m10
    pmaddubsw         m3,              m10
    movu              m0,              [r0 + r1 * 2]               ; m0 = row 2
    punpcklbw         m4,              m1,                m0
    punpckhbw         m5,              m1,                m0
    pmaddubsw         m4,              m10
    pmaddubsw         m5,              m10
    movu              m1,              [r0 + r4]                   ; m1 = row 3
    punpcklbw         m6,              m0,                m1
    punpckhbw         m7,              m0,                m1
    pmaddubsw         m8,              m6,                m11
    pmaddubsw         m9,              m7,                m11
    pmaddubsw         m6,              m10
    pmaddubsw         m7,              m10
    paddw             m2,              m8
    paddw             m3,              m9

%ifidn %1,pp
    pmulhrsw          m2,              m12
    pmulhrsw          m3,              m12
    packuswb          m2,              m3
    movu              [r2],            m2
%else
    psubw             m2, m12
    psubw             m3, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m2, m3
    vpermi2q          m9, m2, m3
    movu              [r2], m8
    movu              [r2 + mmsize], m9
%endif

    lea               r0,              [r0 + r1 * 4]
    movu              m0,              [r0]                        ; m0 = row 4
    punpcklbw         m2,              m1,                m0
    punpckhbw         m3,              m1,                m0
    pmaddubsw         m8,              m2,                m11
    pmaddubsw         m9,              m3,                m11
    pmaddubsw         m2,              m10
    pmaddubsw         m3,              m10
    paddw             m4,              m8
    paddw             m5,              m9

%ifidn %1,pp
    pmulhrsw          m4,              m12
    pmulhrsw          m5,              m12
    packuswb          m4,              m5
    movu              [r2 + r3],       m4
%else
    psubw             m4, m12
    psubw             m5, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m4, m5
    vpermi2q          m9, m4, m5
    movu              [r2 + r3], m8
    movu              [r2 + r3 + mmsize], m9
%endif

    movu              m1,              [r0 + r1]                   ; m1 = row 5
    punpcklbw         m4,              m0,                m1
    punpckhbw         m5,              m0,                m1
    pmaddubsw         m4,              m11
    pmaddubsw         m5,              m11
    paddw             m6,              m4
    paddw             m7,              m5

%ifidn %1,pp
    pmulhrsw          m6,              m12
    pmulhrsw          m7,              m12
    packuswb          m6,              m7
    movu              [r2 + r3 * 2],   m6
%else
    psubw             m6, m12
    psubw             m7, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m6, m7
    vpermi2q          m9, m6, m7
    movu              [r2 + 2 * r3], m8
    movu              [r2 + 2 * r3 + mmsize], m9
%endif
    movu              m0,              [r0 + r1 * 2]               ; m0 = row 6
    punpcklbw         m6,              m1,                m0
    punpckhbw         m7,              m1,                m0
    pmaddubsw         m6,              m11
    pmaddubsw         m7,              m11
    paddw             m2,              m6
    paddw             m3,              m7

%ifidn %1,pp
    pmulhrsw          m2,              m12
    pmulhrsw          m3,              m12
    packuswb          m2,              m3
    movu              [r2 + r5],       m2
%else
    psubw             m2, m12
    psubw             m3, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m2, m3
    vpermi2q          m9, m2, m3
    movu              [r2 + r5], m8
    movu              [r2 + r5 + mmsize], m9
%endif
%endmacro

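;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_pp(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_ps(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------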
%macro FILTER_VER_CHROMA_AVX512_64xN 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_64x%2, 4, 6, 15
    mov               r4d,             r4m
    shl               r4d,             7

%ifdef PIC
    lea               r5,              [tab_ChromaCoeffVer_32_avx512]
    mova              m10,             [r5 + r4]
    mova              m11,             [r5 + r4 + mmsize]
%else
    mova              m10,             [tab_ChromaCoeffVer_32_avx512 + r4]
    mova              m11,             [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1,pp
    vbroadcasti32x8   m12,             [pw_512]
%else
    shl               r3d,             1
    vbroadcasti32x8   m12,             [pw_2000]
    mova              m13,             [interp4_vps_store1_avx512]
    mova              m14,             [interp4_vps_store2_avx512]
%endif
    lea               r4,              [r1 * 3]
    sub               r0,              r1
    lea               r5,              [r3 * 3]

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_64x4_AVX512 %1
    lea               r2, [r2 + r3 * 4]
%endrep
    PROCESS_CHROMA_VERT_64x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_CHROMA_AVX512_64xN pp, 64
    FILTER_VER_CHROMA_AVX512_64xN pp, 48
    FILTER_VER_CHROMA_AVX512_64xN pp, 32
    FILTER_VER_CHROMA_AVX512_64xN pp, 16

    FILTER_VER_CHROMA_AVX512_64xN ps, 64
    FILTER_VER_CHROMA_AVX512_64xN ps, 48
    FILTER_VER_CHROMA_AVX512_64xN ps, 32
    FILTER_VER_CHROMA_AVX512_64xN ps, 16
%endif
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vpp and chroma_vps code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vss code start
;-------------------------------------------------------------------------------------------------------------
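; Word-domain vertical filters: sources are 16-bit intermediates, so the
; 16-bit strides are doubled up front (for sp only the source stride, since it
; writes bytes) and the taps come from pw_ChromaCoeffVer_32_avx512 as word
; pairs. punpcklwd/punpckhwd interleave adjacent rows, pmaddwd forms
; c0*row[n] + c1*row[n+1] (and c2/c3) as 32-bit sums, and paddd merges them.
; ss stores (sum >> 6) via psrad/packssdw; sp instead adds pd_526336
; (8192*64 + 2048: it cancels the scaled intermediate offset and rounds),
; shifts right by 12 and packs to bytes.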
%macro PROCESS_CHROMA_VERT_SS_8x4_AVX512 0
    lea                   r5,                 [r0 + 4 * r1]
    movu                  xm1,                [r0]
    movu                  xm3,                [r0 + r1]
    vinserti32x4          m1,                 [r0 + r1],           1
    vinserti32x4          m3,                 [r0 + 2 * r1],       1
    vinserti32x4          m1,                 [r0 + 2 * r1],       2
    vinserti32x4          m3,                 [r0 + r6],           2
    vinserti32x4          m1,                 [r0 + r6],           3
    vinserti32x4          m3,                 [r0 + 4 * r1],       3

    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m8
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m8

    movu                  xm4,                [r0 + 2 * r1]
    movu                  xm5,                [r0 + r6]
    vinserti32x4          m4,                 [r0 + r6],           1
    vinserti32x4          m5,                 [r5],                1
    vinserti32x4          m4,                 [r5],                2
    vinserti32x4          m5,                 [r5 + r1],           2
    vinserti32x4          m4,                 [r5 + r1],           3
    vinserti32x4          m5,                 [r5 + 2 * r1],       3

    punpcklwd             m3,                 m4,                  m5
    pmaddwd               m3,                 m9
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m9

    paddd                 m0,                 m3
    paddd                 m1,                 m4

    psrad                 m0,                 6
    psrad                 m1,                 6
    packssdw              m0,                 m1
    movu                  [r2],               xm0
    vextracti32x4         [r2 + r3],          m0,                  1
    vextracti32x4         [r2 + 2 * r3],      m0,                  2
    vextracti32x4         [r2 + r7],          m0,                  3
%endmacro

;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_ss_8x%1, 5, 8, 10
    add                   r1d,                r1d
    add                   r3d,                r3d
    sub                   r0,                 r1
    shl                   r4d,                7
%ifdef PIC
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512 + r4]
    mova                  m8,                 [r5]
    mova                  m9,                 [r5 + mmsize]
%endif
    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]

%rep %1/4 - 1
    PROCESS_CHROMA_VERT_SS_8x4_AVX512
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_SS_8x4_AVX512
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_SS_CHROMA_8xN_AVX512 4
    FILTER_VER_SS_CHROMA_8xN_AVX512 8
    FILTER_VER_SS_CHROMA_8xN_AVX512 12
    FILTER_VER_SS_CHROMA_8xN_AVX512 16
    FILTER_VER_SS_CHROMA_8xN_AVX512 32
    FILTER_VER_SS_CHROMA_8xN_AVX512 64
%endif

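; Process four 16-column word rows: rows n and n+2 share a zmm (one row per
; 256-bit half); for sp the bytes are packed and reordered through the vpermq
; table in m10, which is why lanes 1 and 2 swap at the stores.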
%macro PROCESS_CHROMA_VERT_S_16x4_AVX512 1
    movu                  ym1,                [r0]
    lea                   r6,                 [r0 + 2 * r1]
    vinserti32x8          m1,                 [r6],                1
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m3,                 [r6 + r1],           1
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m7
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m7

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r6 + 2 * r1],       1
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 m7
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m7

    movu                  ym5,                [r0 + r4]
    vinserti32x8          m5,                 [r6 + r4],           1
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 m8
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m8
    paddd                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1]
    vinserti32x8          m4,                 [r6 + 4 * r1],       1
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 m8
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m8
    paddd                 m3,                 m5

%ifidn %1, sp
    paddd                m0,                  m9
    paddd                m1,                  m9
    paddd                m2,                  m9
    paddd                m3,                  m9

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m10,                 m0
    movu                 [r2],                xm0
    vextracti32x4        [r2 + r3],           m0,                  2
    vextracti32x4        [r2 + 2 * r3],       m0,                  1
    vextracti32x4        [r2 + r5],           m0,                  3
%else
    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6
    packssdw              m0,                 m1
    packssdw              m2,                 m3

    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r5],          m2,                  1
%endif
%endmacro

;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_S_CHROMA_16xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_16x%2, 4, 7, 11
    mov                   r4d,                r4m
    shl                   r4d,                7

%ifdef PIC
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
    mova                  m7,                 [r5 + r4]
    mova                  m8,                 [r5 + r4 + mmsize]
%else
    mova                  m7,                 [pw_ChromaCoeffVer_32_avx512 + r4]
    mova                  m8,                 [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, sp
    vbroadcasti32x4       m9,                 [pd_526336]
    mova                  m10,                [interp8_vsp_store_avx512]
%else
    add                   r3d,                r3d
%endif
    add                   r1d,                r1d
    sub                   r0,                 r1
    lea                   r4,                 [r1 * 3]
    lea                   r5,                 [r3 * 3]

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 4
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 8
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 12
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 16
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 24
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 32
    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 64
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 4
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 8
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 12
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 16
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 24
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 32
    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 64
%endif

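; Process eight 24-column word rows: the 16-column body runs two 4-row blocks
; in parallel (row pointers r0/r6 and r8/r9), while the 8-column tail at byte
; offset mmsize/2 gathers one 8-word row per 128-bit lane.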
%macro PROCESS_CHROMA_VERT_SS_24x8_AVX512 0
    movu                  ym1,                [r0]
    lea                   r6,                 [r0 + 2 * r1]
    lea                   r8,                 [r0 + 4 * r1]
    lea                   r9,                 [r8 + 2 * r1]

    movu                  ym10,               [r8]
    movu                  ym3,                [r0 + r1]
    movu                  ym12,               [r8 + r1]
    vinserti32x8          m1,                 [r6],                1
    vinserti32x8          m10,                [r9],                1
    vinserti32x8          m3,                 [r6 + r1],           1
    vinserti32x8          m12,                [r9 + r1],           1

    punpcklwd             m0,                 m1,                  m3
    punpcklwd             m9,                 m10,                 m12
    pmaddwd               m0,                 m16
    pmaddwd               m9,                 m16
    punpckhwd             m1,                 m3
    punpckhwd             m10,                m12
    pmaddwd               m1,                 m16
    pmaddwd               m10,                m16

    movu                  ym4,                [r0 + 2 * r1]
    movu                  ym13,               [r8 + 2 * r1]
    vinserti32x8          m4,                 [r6 + 2 * r1],       1
    vinserti32x8          m13,                [r9 + 2 * r1],       1
    punpcklwd             m2,                 m3,                  m4
    punpcklwd             m11,                m12,                 m13
    pmaddwd               m2,                 m16
    pmaddwd               m11,                m16
    punpckhwd             m3,                 m4
    punpckhwd             m12,                m13
    pmaddwd               m3,                 m16
    pmaddwd               m12,                m16

    movu                  ym5,                [r0 + r10]
    vinserti32x8          m5,                 [r6 + r10],          1
    movu                  ym14,               [r8 + r10]
    vinserti32x8          m14,                [r9 + r10],          1
    punpcklwd             m6,                 m4,                  m5
    punpcklwd             m15,                m13,                 m14
    pmaddwd               m6,                 m17
    pmaddwd               m15,                m17
    paddd                 m0,                 m6
    paddd                 m9,                 m15
    punpckhwd             m4,                 m5
    punpckhwd             m13,                m14
    pmaddwd               m4,                 m17
    pmaddwd               m13,                m17
    paddd                 m1,                 m4
    paddd                 m10,                m13

    movu                  ym4,                [r0 + 4 * r1]
    vinserti32x8          m4,                 [r6 + 4 * r1],       1
    movu                  ym13,               [r8 + 4 * r1]
    vinserti32x8          m13,                [r9 + 4 * r1],       1
    punpcklwd             m6,                 m5,                  m4
    punpcklwd             m15,                m14,                 m13
    pmaddwd               m6,                 m17
    pmaddwd               m15,                m17
    paddd                 m2,                 m6
    paddd                 m11,                m15
    punpckhwd             m5,                 m4
    punpckhwd             m14,                m13
    pmaddwd               m5,                 m17
    pmaddwd               m14,                m17
    paddd                 m3,                 m5
    paddd                 m12,                m14

    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6
    psrad                 m9,                 6
    psrad                 m10,                6
    psrad                 m11,                6
    psrad                 m12,                6

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    packssdw              m9,                 m10
    packssdw              m11,                m12

    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r7],          m2,                  1
    lea                   r11,                [r2 + 4 * r3]
    movu                  [r11],              ym9
    movu                  [r11 + r3],         ym11
    vextracti32x8         [r11 + 2 * r3],     m9,                  1
    vextracti32x8         [r11 + r7],         m11,                 1

    movu                  xm1,                [r0 + mmsize/2]
    vinserti32x4          m1,                 [r6 + mmsize/2],                1
    vinserti32x4          m1,                 [r8 + mmsize/2],                2
    vinserti32x4          m1,                 [r9 + mmsize/2],                3
    movu                  xm3,                [r0 + r1 + mmsize/2]
    vinserti32x4          m3,                 [r6 + r1 + mmsize/2],           1
    vinserti32x4          m3,                 [r8 + r1 + mmsize/2],           2
    vinserti32x4          m3,                 [r9 + r1 + mmsize/2],           3
    punpcklwd             m0,                 m1,                             m3
    pmaddwd               m0,                 m16
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m16

    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
    vinserti32x4          m4,                 [r6 + 2 * r1 + mmsize/2],       1
    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       2
    vinserti32x4          m4,                 [r9 + 2 * r1 + mmsize/2],       3
    punpcklwd             m2,                 m3,                             m4
    pmaddwd               m2,                 m16
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m16

    movu                  xm5,                [r0 + r10 + mmsize/2]
    vinserti32x4          m5,                 [r6 + r10 + mmsize/2],          1
    vinserti32x4          m5,                 [r8 + r10 + mmsize/2],          2
    vinserti32x4          m5,                 [r9 + r10 + mmsize/2],          3
    punpcklwd             m6,                 m4,                             m5
    pmaddwd               m6,                 m17
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m17
    paddd                 m1,                 m4

    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
    vinserti32x4          m4,                 [r6 + 4 * r1 + mmsize/2],       1
    vinserti32x4          m4,                 [r8 + 4 * r1 + mmsize/2],       2
    vinserti32x4          m4,                 [r9 + 4 * r1 + mmsize/2],       3
    punpcklwd             m6,                 m5,                             m4
    pmaddwd               m6,                 m17
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m17
    paddd                 m3,                 m5

    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6

    packssdw              m0,                 m1
    packssdw              m2,                 m3

    movu                  [r2 + mmsize/2],               xm0
    movu                  [r2 + r3 + mmsize/2],          xm2
    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
    lea                   r2,                            [r2 + 4 * r3]
    vextracti32x4         [r2 + mmsize/2],               m0,                  2
    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3
%endmacro

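;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------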
%macro FILTER_VER_SS_CHROMA_24xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_ss_24x%1, 5, 12, 18
    add                   r1d,                r1d
    add                   r3d,                r3d
    sub                   r0,                 r1
    shl                   r4d,                7
%ifdef PIC
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
    mova                  m16,                [r5 + r4]
    mova                  m17,                [r5 + r4 + mmsize]
%else
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512 + r4]
    mova                  m16,                [r5]
    mova                  m17,                [r5 + mmsize]
%endif
    lea                   r10,                [3 * r1]
    lea                   r7,                 [3 * r3]
%rep %1/8 - 1
    PROCESS_CHROMA_VERT_SS_24x8_AVX512
    lea                   r0,                 [r8 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_SS_24x8_AVX512
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_SS_CHROMA_24xN_AVX512 32
    FILTER_VER_SS_CHROMA_24xN_AVX512 64
%endif
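; Process two 32-column word rows: one full zmm per source row, five row loads
; per pair of outputs; ss stores whole zmm rows, sp packs to bytes and
; permutes before the two ymm stores.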
%macro PROCESS_CHROMA_VERT_S_32x2_AVX512 1
    movu                  m1,                 [r0]
    movu                  m3,                 [r0 + r1]
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m7
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m7
    movu                  m4,                 [r0 + 2 * r1]
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 m7
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m7
    movu                  m5,                 [r0 + r4]
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 m8
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m8
    paddd                 m1,                 m4
    movu                  m4,                 [r0 + 4 * r1]
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 m8
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m8
    paddd                 m3,                 m5
%ifidn %1, sp
    paddd                 m0,                 m9
    paddd                 m1,                 m9
    paddd                 m2,                 m9
    paddd                 m3,                 m9

    psrad                 m0,                 12
    psrad                 m1,                 12
    psrad                 m2,                 12
    psrad                 m3,                 12

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    packuswb              m0,                 m2
    vpermq                m0,                 m10,                   m0
    movu                  [r2],               ym0
    vextracti32x8         [r2 + r3],          m0,                    1
%else
    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6
    packssdw              m0,                 m1
    packssdw              m2,                 m3
    movu                  [r2],               m0
    movu                  [r2 + r3],          m2
%endif
%endmacro

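;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------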
%macro FILTER_VER_S_CHROMA_32xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_32x%2, 4, 6, 11
    mov               r4d,             r4m
    shl               r4d,             7
%ifdef PIC
    lea               r5,              [pw_ChromaCoeffVer_32_avx512]
    mova              m7,              [r5 + r4]
    mova              m8,              [r5 + r4 + mmsize]
%else
    mova              m7,              [pw_ChromaCoeffVer_32_avx512 + r4]
    mova              m8,              [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4   m9,              [pd_526336]
    mova              m10,             [interp8_vsp_store_avx512]
%else
    add               r3d,             r3d
%endif
    add               r1d,             r1d
    sub               r0,              r1
    lea               r4,              [r1 * 3]
    lea               r5,              [r3 * 3]
%rep %2/2 - 1
    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
    lea               r0,              [r0 + r1 * 2]
    lea               r2,              [r2 + r3 * 2]
%endrep
    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 8
    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 16
    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 24
    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 32
    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 48
    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 64
    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 8
    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 16
    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 24
    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 32
    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48
    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64
%endif

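; Process four 48-column word rows: the first 32 columns reuse the 32x2 macro
; for rows 0-1 and repeat its body inline for rows 2-3; the 16-column tail at
; byte offset mmsize pairs rows n and n+2 per zmm as in the 16xN path.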
%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
    lea                   r6,                 [r0 + 2 * r1]

    movu                  m1,                 [r6]
    movu                  m3,                 [r6 + r1]
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m7
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m7
    movu                  m4,                 [r6 + 2 * r1]
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 m7
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m7

    movu                  m5,                 [r6 + r4]
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 m8
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m8
    paddd                 m1,                 m4

    movu                  m4,                 [r6 + 4 * r1]
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 m8
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m8
    paddd                 m3,                 m5

%ifidn %1, sp
    paddd                 m0,                 m9
    paddd                 m1,                 m9
    paddd                 m2,                 m9
    paddd                 m3,                 m9

    psrad                 m0,                 12
    psrad                 m1,                 12
    psrad                 m2,                 12
    psrad                 m3,                 12

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    packuswb              m0,                 m2
    vpermq                m0,                 m10,                   m0
    movu                  [r2 + 2 * r3],      ym0
    vextracti32x8         [r2 + r5],          m0,                    1
%else
    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    movu                  [r2 + 2 * r3],      m0
    movu                  [r2 + r5],          m2
%endif

    movu                  ym1,                [r0 + mmsize]
    vinserti32x8          m1,                 [r6 + mmsize],                1
    movu                  ym3,                [r0 + r1 + mmsize]
    vinserti32x8          m3,                 [r6 + r1 + mmsize],           1
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m7
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m7

    movu                  ym4,                [r0 + 2 * r1 + mmsize]
    vinserti32x8          m4,                 [r6 + 2 * r1 + mmsize],       1
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 m7
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m7

    movu                  ym5,                [r0 + r4 + mmsize]
    vinserti32x8          m5,                 [r6 + r4 + mmsize],           1
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 m8
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m8
    paddd                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1 + mmsize]
    vinserti32x8          m4,                 [r6 + 4 * r1 + mmsize],       1
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 m8
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m8
    paddd                 m3,                 m5

%ifidn %1, sp
    paddd                m0,                  m9
    paddd                m1,                  m9
    paddd                m2,                  m9
    paddd                m3,                  m9

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m10,                 m0
    movu                 [r2 + mmsize/2],                xm0
    vextracti32x4        [r2 + r3 + mmsize/2],           m0,                  2
    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  1
    vextracti32x4        [r2 + r5 + mmsize/2],           m0,                  3
%else
    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6
    packssdw              m0,                 m1
    packssdw              m2,                 m3

    movu                  [r2 + mmsize],               ym0
    movu                  [r2 + r3 + mmsize],          ym2
    vextracti32x8         [r2 + 2 * r3 + mmsize],      m0,                  1
    vextracti32x8         [r2 + r5 + mmsize],          m2,                  1
%endif
%endmacro

%macro FILTER_VER_S_CHROMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_48x64, 4, 7, 11
    mov                   r4d,                r4m
    shl                   r4d,                7

%ifdef PIC
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
    mova                  m7,                 [r5 + r4]
    mova                  m8,                 [r5 + r4 + mmsize]
%else
    mova                  m7,                 [pw_ChromaCoeffVer_32_avx512 + r4]
    mova                  m8,                 [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, sp
    vbroadcasti32x4       m9,                 [pd_526336]
    mova                  m10,                [interp8_vsp_store_avx512]
%else
    add                   r3d,                r3d
%endif
    add                   r1d,                r1d
    sub                   r0,                 r1
    lea                   r4,                 [r1 * 3]
    lea                   r5,                 [r3 * 3]

%rep 15
    PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_CHROMA_48x64_AVX512 ss
    FILTER_VER_S_CHROMA_48x64_AVX512 sp
%endif

%macro PROCESS_CHROMA_VERT_S_64x2_AVX512 1
    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
    movu                  m1,                 [r0 + mmsize]
    movu                  m3,                 [r0 + r1 + mmsize]
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m7
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m7
    movu                  m4,                 [r0 + 2 * r1 + mmsize]
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 m7
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m7

    movu                  m5,                 [r0 + r4 + mmsize]
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 m8
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m8
    paddd                 m1,                 m4

    movu                  m4,                 [r0 + 4 * r1 + mmsize]
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 m8
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m8
    paddd                 m3,                 m5

%ifidn %1, sp
    paddd                 m0,                 m9
    paddd                 m1,                 m9
    paddd                 m2,                 m9
    paddd                 m3,                 m9

    psrad                 m0,                 12
    psrad                 m1,                 12
    psrad                 m2,                 12
    psrad                 m3,                 12

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    packuswb              m0,                 m2
    vpermq                m0,                 m10,                   m0
    movu                  [r2 + mmsize/2],    ym0
    vextracti32x8         [r2 + r3 + mmsize/2], m0,                    1
%else
    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    movu                  [r2 + mmsize],      m0
    movu                  [r2 + r3 + mmsize], m2
%endif
%endmacro

%macro FILTER_VER_S_CHROMA_64xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_64x%2, 4, 6, 11
    mov               r4d,             r4m
    shl               r4d,             7
%ifdef PIC
    lea               r5,              [pw_ChromaCoeffVer_32_avx512]
    mova              m7,              [r5 + r4]
    mova              m8,              [r5 + r4 + mmsize]
%else
    mova              m7,              [pw_ChromaCoeffVer_32_avx512 + r4]
    mova              m8,              [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, sp
    vbroadcasti32x4   m9,              [pd_526336]
    mova              m10,             [interp8_vsp_store_avx512]
%else
    add               r3d,             r3d
%endif
    add               r1d,             r1d
    sub               r0,              r1
    lea               r4,              [r1 * 3]
    lea               r5,              [r3 * 3]

%rep %2/2 - 1
    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
    lea               r0,              [r0 + r1 * 2]
    lea               r2,              [r2 + r3 * 2]
%endrep
    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 16
    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 32
    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 48
    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 64
    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 16
    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 32
    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 48
    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 64
%endif
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vss code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;ipfilter_chroma_avx512 code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;ipfilter_luma_avx512 code start
;-------------------------------------------------------------------------------------------------------------
%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - shuffle order table
    ; m5          - pw_1
    ; m6          - pw_512

    movu              m7,        [r0]
    movu              m9,        [r0 + 8]

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],     m7
%endmacro
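
; The pp macros above never widen past dwords: pmaddubsw multiplies the shuffled
; unsigned source bytes by the signed coefficient bytes, pmaddwd against pw_1
; finishes the horizontal add, and pmulhrsw against pw_512 computes
; (x * 512 + (1 << 14)) >> 15, i.e. the final (sum + 32) >> 6 descale. A scalar C
; sketch of one output pixel (illustration only, not built; the helper name is ours):
;
;   #include <stdint.h>
;   static inline uint8_t luma_horiz_pp_pixel(const uint8_t *src, const int8_t coeff[8])
;   {
;       int sum = 0;
;       for (int t = 0; t < 8; t++)
;           sum += src[t] * coeff[t];      /* pmaddubsw + pmaddwd(pw_1) + paddd  */
;       sum = (sum + 32) >> 6;             /* pmulhrsw with pw_512               */
;       return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum)); /* packuswb clip */
;   }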

%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - shuffle order table
    ; m5          - pw_1
    ; m6          - pw_512

    movu             ym7,        [r0]
    vinserti32x8      m7,        [r0 + r1], 1
    movu             ym9,        [r0 + 8]
    vinserti32x8      m9,        [r0 + r1 + 8], 1

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],     ym7
    vextracti32x8     [r2 + r3], m7, 1
%endmacro

%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - shuffle order table
    ; m5          - pw_1
    ; m6          - pw_512

    movu             xm7,        [r0]
    vinserti32x4      m7,        [r0 + r1],          1
    vinserti32x4      m7,        [r0 + 2 * r1],      2
    vinserti32x4      m7,        [r0 + r6],          3

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2

    movu             xm9,        [r0 + 8]
    vinserti32x4      m9,        [r0 + r1 + 8],      1
    vinserti32x4      m9,        [r0 + 2 * r1 + 8],  2
    vinserti32x4      m9,        [r0 + r6 + 8],      3

    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],         xm7
    vextracti32x4     [r2 + r3],     m7,    1
    vextracti32x4     [r2 + 2 * r3], m7,    2
    vextracti32x4     [r2 + r7],     m7,    3
%endmacro

%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - shuffle order table
    ; m5          - pw_1
    ; m6          - pw_512

    movu             ym7,        [r0]
    vinserti32x8      m7,        [r0 + r1], 1
    movu             ym9,        [r0 + 8]
    vinserti32x8      m9,        [r0 + r1 + 8], 1

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],     ym7
    vextracti32x8     [r2 + r3], m7, 1

    movu             ym7,        [r0 + 2 * r1]
    vinserti32x8      m7,        [r0 + r6],          1
    movu             ym9,        [r0 + 2 * r1 + 8]
    vinserti32x8      m9,        [r0 + r6 + 8],      1

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2 + 2 * r3],     ym7
    vextracti32x8     [r2 + r7],          m7,    1

    movu             xm7,        [r0 + mmsize/2]
    vinserti32x4      m7,        [r0 + r1 + mmsize/2],          1
    vinserti32x4      m7,        [r0 + 2 * r1 + mmsize/2],      2
    vinserti32x4      m7,        [r0 + r6 + mmsize/2],          3

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2

    movu             xm9,        [r0 + 40]
    vinserti32x4      m9,        [r0 + r1 + 40],      1
    vinserti32x4      m9,        [r0 + 2 * r1 + 40],  2
    vinserti32x4      m9,        [r0 + r6 + 40],      3

    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2 + mmsize/2],         xm7
    vextracti32x4     [r2 + r3 + mmsize/2],     m7,    1
    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m7,    2
    vextracti32x4     [r2 + r7 + mmsize/2],     m7,    3
%endmacro

%macro IPFILTER_LUMA_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
    sub               r0,    3
    mov               r4d,   r4m
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

%rep %1 - 1
    PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
    RET
%endmacro

%if ARCH_X86_64
IPFILTER_LUMA_64xN_AVX512 16
IPFILTER_LUMA_64xN_AVX512 32
IPFILTER_LUMA_64xN_AVX512 48
IPFILTER_LUMA_64xN_AVX512 64
%endif

%macro IPFILTER_LUMA_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_32x%1, 4,6,13
    sub               r0,    3
    mov               r4d,   r4m
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

%rep %1/2 - 1
    PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
    lea               r0,        [r0 + 2 * r1]
    lea               r2,        [r2 + 2 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
    RET
%endmacro

%if ARCH_X86_64
IPFILTER_LUMA_32xN_AVX512 8
IPFILTER_LUMA_32xN_AVX512 16
IPFILTER_LUMA_32xN_AVX512 24
IPFILTER_LUMA_32xN_AVX512 32
IPFILTER_LUMA_32xN_AVX512 64
%endif

%macro IPFILTER_LUMA_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_16x%1, 4,8,14
    sub               r0,    3
    mov               r4d,   r4m
    lea               r6,    [3 * r1]
    lea               r7,    [3 * r3]
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

%rep %1/4 - 1
    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
    lea               r0,        [r0 + 4 * r1]
    lea               r2,        [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
    RET
%endmacro

%if ARCH_X86_64
IPFILTER_LUMA_16xN_AVX512 4
IPFILTER_LUMA_16xN_AVX512 8
IPFILTER_LUMA_16xN_AVX512 12
IPFILTER_LUMA_16xN_AVX512 16
IPFILTER_LUMA_16xN_AVX512 32
IPFILTER_LUMA_16xN_AVX512 64
%endif

%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_48x64, 4,8,14
    sub               r0,    3
    mov               r4d,   r4m
    lea               r6,    [3 * r1]
    lea               r7,    [3 * r3]
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

%rep 15
    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
    lea               r0,        [r0 + 4 * r1]
    lea               r2,        [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
    RET
%endif

%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000
    ; m7          - store shuffle order table

    movu              ym8,           [r0]
    vinserti32x8      m8,            [r0 + 8],            1
    pshufb            m9,            m8,                  m3
    pshufb            m10,           m8,                  m4
    pshufb            m8,            m2

    movu              ym11,          [r0 + mmsize/2]
    vinserti32x8      m11,           [r0 + mmsize/2 + 8], 1
    pshufb            m12,           m11,                 m3
    pshufb            m13,           m11,                 m4
    pshufb            m11,           m2

    pmaddubsw         m8,            m0
    pmaddubsw         m14,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m14,           m5
    paddd             m8,            m14

    pmaddubsw         m9,            m0
    pmaddubsw         m14,           m10,                 m1
    pmaddwd           m9,            m5
    pmaddwd           m14,           m5
    paddd             m9,            m14

    pmaddubsw         m11,           m0
    pmaddubsw         m14,           m12,                 m1
    pmaddwd           m11,           m5
    pmaddwd           m14,           m5
    paddd             m11,           m14

    pmaddubsw         m12,           m0
    pmaddubsw         m14,           m13,                 m1
    pmaddwd           m12,           m5
    pmaddwd           m14,           m5
    paddd             m12,           m14

    packssdw          m8,            m9
    packssdw          m11,           m12
    psubw             m8,            m6
    psubw             m11,           m6
    vpermq            m8,            m7,                  m8
    vpermq            m11,           m7,                  m11
    movu              [r2],          m8
    movu              [r2 + mmsize], m11
%endmacro
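
; The ps variants emit 16-bit intermediates instead of pixels: at 8-bit depth no
; descaling shift is needed, only a psubw of pw_2000 (IF_INTERNAL_OFFS = 8192);
; m7 (interp8_hps_store_avx512) then restores linear pixel order after the
; interleaved-lane processing. When isRowExt is set, filtering starts 3 rows above
; src and runs blkheight + 7 rows so the following vertical pass has all its taps.
; A scalar C sketch of one intermediate (illustration only, not built; the helper
; name is ours):
;
;   #include <stdint.h>
;   static inline int16_t luma_horiz_ps_pixel(const uint8_t *src, const int8_t coeff[8])
;   {
;       int sum = 0;
;       for (int t = 0; t < 8; t++)
;           sum += src[t] * coeff[t];      /* pmaddubsw + pmaddwd(pw_1) + paddd */
;       return (int16_t)(sum - 8192);      /* psubw pw_2000                     */
;   }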

%macro IPFILTER_LUMA_PS_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_64x%1, 4,7,15
    mov               r4d,   r4m
    mov               r5d,   r5m

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]
    mova              m7,        [interp8_hps_store_avx512]

    mov               r4d,       %1
    sub               r0,        3
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; src -= 3 * srcStride (row extension)
    add               r4d,       7                            ; blkheight += NTAPS - 1 = 7

.loop:
    PROCESS_IPFILTER_LUMA_PS_64x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + 2 * r3]
    dec               r4d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_64xN_AVX512 16
    IPFILTER_LUMA_PS_64xN_AVX512 32
    IPFILTER_LUMA_PS_64xN_AVX512 48
    IPFILTER_LUMA_PS_64xN_AVX512 64
%endif

%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000
    ; m7          - store shuffle order table

    movu              ym8,           [r0]
    vinserti32x8      m8,            [r0 + 8],            1
    pshufb            m9,            m8,                  m3
    pshufb            m10,           m8,                  m4
    pshufb            m8,             m2

    pmaddubsw         m8,            m0
    pmaddubsw         m11,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m11,           m5
    paddd             m8,            m11

    pmaddubsw         m9,            m0
    pmaddubsw         m11,           m10,                 m1
    pmaddwd           m9,            m5
    pmaddwd           m11,           m5
    paddd             m9,            m11

    packssdw          m8,            m9
    psubw             m8,            m6
    vpermq            m8,            m7,                  m8
    movu              [r2],          m8
%endmacro

%macro IPFILTER_LUMA_PS_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_32x%1, 4,7,12
    mov               r4d,   r4m
    mov               r5d,   r5m

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]
    mova              m7,        [interp8_hps_store_avx512]

    mov               r4d,       %1
    sub               r0,        3
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; src -= 3 * srcStride (row extension)
    add               r4d,       7                            ; blkheight += NTAPS - 1 = 7

.loop:
    PROCESS_IPFILTER_LUMA_PS_32x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + 2 * r3]
    dec               r4d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_32xN_AVX512 8
    IPFILTER_LUMA_PS_32xN_AVX512 16
    IPFILTER_LUMA_PS_32xN_AVX512 24
    IPFILTER_LUMA_PS_32xN_AVX512 32
    IPFILTER_LUMA_PS_32xN_AVX512 64
%endif

%macro PROCESS_IPFILTER_LUMA_PS_8TAP_16x2_AVX512 0
    movu              xm7,           [r0]
    vinserti32x4      m7,            [r0 + 8],            1
    vinserti32x4      m7,            [r0 + r1],           2
    vinserti32x4      m7,            [r0 + r1 + 8],       3
    pshufb            m8,            m7,                  m3
    pshufb            m9,            m7,                  m4
    pshufb            m7,            m2

    pmaddubsw         m7,            m0
    pmaddubsw         m10,           m8,                  m1
    pmaddwd           m7,            m5
    pmaddwd           m10,           m5
    paddd             m7,            m10

    pmaddubsw         m8,            m0
    pmaddubsw         m10,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m10,           m5
    paddd             m8,            m10

    packssdw          m7,            m8
    psubw             m7,            m6
    movu              [r2],          ym7
    vextracti32x8     [r2 + r3],     m7,                  1
%endmacro

%macro PROCESS_IPFILTER_LUMA_PS_8TAP_16x1_AVX512 0
    movu              xm7,            [r0]
    vinserti32x4      m7,             [r0 + 8],             1
    pshufb            ym8,            ym7,                  ym3
    pshufb            ym9,            ym7,                  ym4
    pshufb            ym7,            ym2

    pmaddubsw         ym7,            ym0
    pmaddubsw         ym10,           ym8,                  ym1
    pmaddwd           ym7,            ym5
    pmaddwd           ym10,           ym5
    paddd             ym7,            ym10

    pmaddubsw         ym8,            ym0
    pmaddubsw         ym10,           ym9,                  ym1
    pmaddwd           ym8,            ym5
    pmaddwd           ym10,           ym5
    paddd             ym8,            ym10

    packssdw          ym7,            ym8
    psubw             ym7,            ym6
    movu              [r2],           ym7
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_8TAP_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_16x%1, 4,7,11
    mov               r4d,   r4m
    mov               r5d,   r5m
    add               r3,    r3

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]

    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000

    mov               r4d,       %1
    sub               r0,        3
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; src -= 3 * srcStride (row extension)
    add               r4d,       7                            ; blkheight += NTAPS - 1 = 7
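    ; row extension makes the row count odd (blkheight + 7), so filter one row
    ; with the 16x1 macro before entering the two-rows-per-iteration loop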
    PROCESS_IPFILTER_LUMA_PS_8TAP_16x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + r3]
    dec               r4d

.loop:
    PROCESS_IPFILTER_LUMA_PS_8TAP_16x2_AVX512
    lea               r0,        [r0 + 2 * r1]
    lea               r2,        [r2 + 2 * r3]
    sub               r4d,       2
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 4
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 8
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 12
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 16
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 32
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 64
%endif

%macro PROCESS_IPFILTER_LUMA_PS_48x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000
    ; m7          - store shuffle order table

    movu              ym8,           [r0]
    vinserti32x8      m8,            [r0 + 8],            1
    pshufb            m9,            m8,                  m3
    pshufb            m10,           m8,                  m4
    pshufb            m8,            m2

    pmaddubsw         m8,            m0
    pmaddubsw         m11,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m11,           m5
    paddd             m8,            m11

    pmaddubsw         m9,            m0
    pmaddubsw         m11,           m10,                 m1
    pmaddwd           m9,            m5
    pmaddwd           m11,           m5
    paddd             m9,            m11

    packssdw          m8,            m9
    psubw             m8,            m6
    vpermq            m8,            m7,                  m8
    movu              [r2],          m8

    movu              ym8,           [r0 + 32]
    vinserti32x4      m8,            [r0 + 40],           1
    pshufb            ym9,            ym8,                 ym3
    pshufb            ym10,           ym8,                 ym4
    pshufb            ym8,            ym2

    pmaddubsw         ym8,            ym0
    pmaddubsw         ym11,           ym9,                ym1
    pmaddwd           ym8,            ym5
    pmaddwd           ym11,           ym5
    paddd             ym8,            ym11

    pmaddubsw         ym9,            ym0
    pmaddubsw         ym11,           ym10,               ym1
    pmaddwd           ym9,            ym5
    pmaddwd           ym11,           ym5
    paddd             ym9,            ym11

    packssdw          ym8,            ym9
    psubw             ym8,            ym6
    movu              [r2 + mmsize],  ym8
%endmacro

;-------------------------------------------------------------------------------------------------------------
; void interp_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_48xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_48x%1, 4,7,12
    mov               r4d,   r4m
    mov               r5d,   r5m

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]
    mova              m7,        [interp8_hps_store_avx512]

    mov               r4d,       %1
    sub               r0,        3
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; src -= 3 * srcStride (row extension)
    add               r4d,       7                            ; blkheight += NTAPS - 1 = 7

.loop:
    PROCESS_IPFILTER_LUMA_PS_48x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + 2 * r3]
    dec               r4d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_48xN_AVX512 64
%endif

;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vss code start
;-------------------------------------------------------------------------------------------------------------
%macro PROCESS_LUMA_VERT_SS_8x8_AVX512 0
    lea                  r6,                  [r0 + 4 * r1]
    movu                 xm1,                 [r0]                           ;0 row
    vinserti32x4         m1,                  [r0 + 2 * r1],          1
    vinserti32x4         m1,                  [r0 + 4 * r1],          2
    vinserti32x4         m1,                  [r6 + 2 * r1],          3
    movu                 xm3,                 [r0 + r1]                      ;1 row
    vinserti32x4         m3,                  [r0 + r7],              1
    vinserti32x4         m3,                  [r6 + r1],              2
    vinserti32x4         m3,                  [r6 + r7],              3
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 xm4,                 [r0 + 2 * r1]                  ;2 row
    vinserti32x4         m4,                  [r0 + 4 * r1],          1
    vinserti32x4         m4,                  [r6 + 2 * r1],          2
    vinserti32x4         m4,                  [r6 + 4 * r1],          3
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    lea                  r4,                  [r6 + 4 * r1]
    movu                 xm5,                 [r0 + r7]                      ;3 row
    vinserti32x4         m5,                  [r6 + r1],              1
    vinserti32x4         m5,                  [r6 + r7],              2
    vinserti32x4         m5,                  [r4 + r1],              3
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 xm4,                 [r0 + 4 * r1]                  ;4 row
    vinserti32x4         m4,                  [r6 + 2 * r1],              1
    vinserti32x4         m4,                  [r6 + 4 * r1],              2
    vinserti32x4         m4,                  [r4 + 2 * r1],              3
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 xm11,                [r6 + r1]                      ;5 row
    vinserti32x4         m11,                 [r6 + r7],              1
    vinserti32x4         m11,                 [r4 + r1],              2
    vinserti32x4         m11,                 [r4 + r7],              3
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 xm12,                [r6 + 2 * r1]                  ;6 row
    vinserti32x4         m12,                 [r6 + 4 * r1],          1
    vinserti32x4         m12,                 [r4 + 2 * r1],          2
    vinserti32x4         m12,                 [r4 + 4 * r1],          3
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    lea                  r8,                  [r4 + 4 * r1]
    movu                 xm13,                [r6 + r7]                      ;7 row
    vinserti32x4         m13,                 [r4 + r1],              1
    vinserti32x4         m13,                 [r4 + r7],              2
    vinserti32x4         m13,                 [r8 + r1],              3
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 xm12,                [r6 + 4 * r1]                 ; 8 row
    vinserti32x4         m12,                 [r4 + 2 * r1],          1
    vinserti32x4         m12,                 [r4 + 4 * r1],          2
    vinserti32x4         m12,                 [r8 + 2 * r1],          3
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3

    movu                 [r2],                xm0
    movu                 [r2 + r3],           xm2
    vextracti32x4        [r2 + 2 * r3],       m0,                  1
    vextracti32x4        [r2 + r5],           m2,                  1
    lea                  r2,                  [r2 + 4 * r3]
    vextracti32x4        [r2],                m0,                  2
    vextracti32x4        [r2 + r3],           m2,                  2
    vextracti32x4        [r2 + 2 * r3],       m0,                  3
    vextracti32x4        [r2 + r5],           m2,                  3
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
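; A scalar C sketch of one 'ss' output word (illustration only, not built; the
; helper name is ours - the macros below gather multiple rows per zmm instead):
;
;   #include <stdint.h>
;   static inline int16_t luma_vert_ss_pixel(const int16_t *src, intptr_t srcStride,
;                                            const int16_t coeff[8])
;   {
;       int sum = 0;
;       for (int t = 0; t < 8; t++)
;           sum += src[t * srcStride] * coeff[t];   /* punpck+pmaddwd+paddd chain  */
;       return (int16_t)(sum >> 6);                 /* psrad 6; packssdw saturates */
;   }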
%macro FILTER_VER_SS_LUMA_8xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_vert_ss_8x%1, 5, 9, 19
    add                   r1d,                r1d
    add                   r3d,                r3d
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [pw_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif

    lea                   r5,                 [3 * r3]
%rep %1/8 - 1
    PROCESS_LUMA_VERT_SS_8x8_AVX512
    lea                   r0,                 [r4]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_SS_8x8_AVX512
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_SS_LUMA_8xN_AVX512 8
    FILTER_VER_SS_LUMA_8xN_AVX512 16
    FILTER_VER_SS_LUMA_8xN_AVX512 32
%endif
%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1
    movu                 ym1,                 [r0]
    movu                 ym3,                 [r0 + r1]
    vinserti32x8         m1,                  [r0 + 2 * r1],          1
    vinserti32x8         m3,                  [r0 + r7],              1
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    lea                  r6,                  [r0 + 4 * r1]
    movu                 ym4,                 [r0 + 2 * r1]
    vinserti32x8         m4,                  [r6],                   1
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 ym5,                 [r0 + r7]
    vinserti32x8         m5,                  [r6 + r1],              1
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 ym4,                 [r6]
    vinserti32x8         m4,                  [r6 + 2 * r1],          1
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 ym11,                [r6 + r1]
    vinserti32x8         m11,                 [r6 + r7],              1
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 ym12,                [r6 + 2 * r1]
    vinserti32x8         m12,                 [r6 + 4 * r1],          1
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    lea                  r4,                  [r6 + 4 * r1]
    movu                 ym13,                [r6 + r7]
    vinserti32x8         m13,                 [r4 + r1],              1
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 ym12,                [r6 + 4 * r1]
    vinserti32x8         m12,                 [r4 + 2 * r1],          1
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11
%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m20,                   m0
    movu                 [r2],                xm0
    vextracti32x4        [r2 + r3],           m0,                    2
    vextracti32x4        [r2 + 2 * r3],       m0,                    1
    vextracti32x4        [r2 + r5],           m0,                    3
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3

    movu                 [r2],                ym0
    movu                 [r2 + r3],           ym2
    vextracti32x8        [r2 + 2 * r3],       m0,                1
    vextracti32x8        [r2 + r5],           m2,                1
%endif
%endmacro
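
; The 'sp' branch of the macro above descales exactly like the chroma 'sp' code:
; paddd with pd_526336 (m19), psrad by 12, then pack and permute down to pixels,
; while the 'ss' branch shifts by 6 and stores words.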
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_S_LUMA_16xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_16x%2, 5, 8, 21
    add                   r1d,                r1d
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [pw_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4       m19,                [pd_526336]
    mova                  m20,                [interp8_vsp_store_avx512]
%else
    add                   r3d,                r3d
%endif

    lea                   r5,                 [3 * r3]
%rep %2/4 - 1
    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 4
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 8
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 12
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 4
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 8
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 12
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 64
%endif
%macro PROCESS_LUMA_VERT_SS_24x8_AVX512 0
    PROCESS_LUMA_VERT_S_16x4_AVX512 ss
    lea                  r4,                  [r6 + 4 * r1]
    lea                  r8,                  [r4 + 4 * r1]
    movu                 ym1,                 [r6]
    movu                 ym3,                 [r6 + r1]
    vinserti32x8         m1,                  [r6 + 2 * r1],          1
    vinserti32x8         m3,                  [r6 + r7],              1
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 ym4,                 [r6 + 2 * r1]
    vinserti32x8         m4,                  [r4],                   1
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 ym5,                 [r6 + r7]
    vinserti32x8         m5,                  [r4 + r1],              1
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 ym4,                 [r4]
    vinserti32x8         m4,                  [r4 + 2 * r1],          1
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 ym11,                [r4 + r1]
    vinserti32x8         m11,                 [r4 + r7],              1
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 ym12,                [r4 + 2 * r1]
    vinserti32x8         m12,                 [r4 + 4 * r1],          1
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 ym13,                [r4 + r7]
    vinserti32x8         m13,                 [r8 + r1],              1
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 ym12,                [r4 + 4 * r1]
    vinserti32x8         m12,                 [r8 + 2 * r1],          1
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3

    lea                  r9,                  [r2 + 4 * r3]
    movu                 [r9],                ym0
    movu                 [r9 + r3],           ym2
    vextracti32x8        [r9 + 2 * r3],       m0,                1
    vextracti32x8        [r9 + r5],           m2,                1

    movu                 xm1,                 [r0 + mmsize/2]
    vinserti32x4         m1,                  [r0 + 2 * r1 + mmsize/2],          1
    vinserti32x4         m1,                  [r0 + 4 * r1 + mmsize/2],          2
    vinserti32x4         m1,                  [r6 + 2 * r1 + mmsize/2],          3
    movu                 xm3,                 [r0 + r1 + mmsize/2]
    vinserti32x4         m3,                  [r0 + r7 + mmsize/2],              1
    vinserti32x4         m3,                  [r6 + r1 + mmsize/2],              2
    vinserti32x4         m3,                  [r6 + r7 + mmsize/2],              3
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 xm4,                 [r0 + 2 * r1 + mmsize/2]
    vinserti32x4         m4,                  [r0 + 4 * r1 + mmsize/2],          1
    vinserti32x4         m4,                  [r6 + 2 * r1 + mmsize/2],          2
    vinserti32x4         m4,                  [r6 + 4 * r1 + mmsize/2],          3
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 xm5,                 [r0 + r7 + mmsize/2]
    vinserti32x4         m5,                  [r6 + r1 + mmsize/2],              1
    vinserti32x4         m5,                  [r6 + r7 + mmsize/2],              2
    vinserti32x4         m5,                  [r4 + r1 + mmsize/2],              3
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 xm4,                 [r0 + 4 * r1 + mmsize/2]
    vinserti32x4         m4,                  [r6 + 2 * r1 + mmsize/2],              1
    vinserti32x4         m4,                  [r6 + 4 * r1 + mmsize/2],              2
    vinserti32x4         m4,                  [r4 + 2 * r1 + mmsize/2],              3
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 xm11,                [r6 + r1 + mmsize/2]
    vinserti32x4         m11,                 [r6 + r7 + mmsize/2],              1
    vinserti32x4         m11,                 [r4 + r1 + mmsize/2],              2
    vinserti32x4         m11,                 [r4 + r7 + mmsize/2],              3
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 xm12,                [r6 + 2 * r1 + mmsize/2]
    vinserti32x4         m12,                 [r6 + 4 * r1 + mmsize/2],          1
    vinserti32x4         m12,                 [r4 + 2 * r1 + mmsize/2],          2
    vinserti32x4         m12,                 [r4 + 4 * r1 + mmsize/2],          3
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 xm13,                [r6 + r7 + mmsize/2]
    vinserti32x4         m13,                 [r4 + r1 + mmsize/2],              1
    vinserti32x4         m13,                 [r4 + r7 + mmsize/2],              2
    vinserti32x4         m13,                 [r8 + r1 + mmsize/2],              3
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 xm12,                [r6 + 4 * r1 + mmsize/2]
    vinserti32x4         m12,                 [r4 + 2 * r1 + mmsize/2],          1
    vinserti32x4         m12,                 [r4 + 4 * r1 + mmsize/2],          2
    vinserti32x4         m12,                 [r8 + 2 * r1 + mmsize/2],          3
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3

    movu                 [r2 + mmsize/2],                xm0
    movu                 [r2 + r3 + mmsize/2],           xm2
    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  1
    vextracti32x4        [r2 + r5 + mmsize/2],           m2,                  1
    lea                  r2,                             [r2 + 4 * r3]
    vextracti32x4        [r2 + mmsize/2],                m0,                  2
    vextracti32x4        [r2 + r3 + mmsize/2],           m2,                  2
    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  3
    vextracti32x4        [r2 + r5 + mmsize/2],           m2,                  3
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
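; In scalar terms (an illustrative model only, not part of the build) the ss
; path computes for every output sample, with N = srcStride and c[] the eight
; filter taps selected from pw_LumaCoeffVer_avx512:
;     sum    = c[0]*src[x-3*N] + c[1]*src[x-2*N] + ... + c[7]*src[x+4*N]
;     dst[x] = clip_int16(sum >> 6)
; The code realizes this with punpcklwd/punpckhwd + pmaddwd over the
; coefficient pairs held in m15-m18, then psrad 6 and a saturating packssdw.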
%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_8tap_vert_ss_24x32, 5, 10, 19
    add                   r1d,                r1d
    add                   r3d,                r3d
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    shl                   r4d,                8                   ; coeff table holds 4 ZMM rows (4 * 64 bytes) per filter index
%ifdef PIC
    lea                   r5,                 [pw_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif

    lea                   r5,                 [3 * r3]
%rep 3
    PROCESS_LUMA_VERT_SS_24x8_AVX512
    lea                   r0,                 [r4]                ; r4 points 8 rows below the old src: step to the next 24x8 block
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_SS_24x8_AVX512
    RET
%endif
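
; PROCESS_LUMA_VERT_S_32x2_AVX512 produces two 32-sample output rows from
; nine consecutive input rows. The 8-tap filter is split into four 2-tap
; stages: each stage interleaves two neighbouring rows with
; punpcklwd/punpckhwd, applies one coefficient pair (m15-m18) with pmaddwd
; and accumulates the 32-bit partial sums in m0-m3.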

%macro PROCESS_LUMA_VERT_S_32x2_AVX512 1
    movu                 m1,                  [r0]                           ;0 row
    movu                 m3,                  [r0 + r1]                      ;1 row
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 m4,                  [r0 + 2 * r1]                  ;2 row
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 m5,                  [r0 + r7]                      ;3 row
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 m4,                  [r0 + 4 * r1]                  ;4 row
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    lea                  r6,                  [r0 + 4 * r1]

    movu                 m11,                 [r6 + r1]                      ;5 row
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 m12,                 [r6 + 2 * r1]                  ;6 row
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 m13,                 [r6 + r7]                      ;7 row
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18
    paddd                m8,                  m14
    paddd                m4,                  m12
    movu                 m12,                 [r6 + 4 * r1]                 ; 8 row
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18
    paddd                m10,                 m14
    paddd                m11,                 m13

    paddd                m0,                  m8
    paddd                m1,                  m4
    paddd                m2,                  m10
    paddd                m3,                  m11
%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m20,                   m0
    movu                 [r2],                ym0
    vextracti32x8        [r2 + r3],           m0,                    1
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    movu                 [r2],                m0
    movu                 [r2 + r3],           m2
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
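; The sp variant rounds back to pixels: in effect
;     dst[x] = clip_uint8((sum + 526336) >> 12), with 526336 = (8192 << 6) + 2048
; folding the intermediate-sample offset and the rounding term into one
; constant (pd_526336); the words are then packed to bytes and reordered with
; the interp8_vsp_store_avx512 qword permute before storing.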
%macro FILTER_VER_S_LUMA_32xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_32x%2, 5, 8, 21
    add                   r1d,                r1d
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [pw_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4       m19,                [pd_526336]
    mova                  m20,                [interp8_vsp_store_avx512]
%else
    add                   r3d,                r3d
%endif

%rep %2/2 - 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    lea                   r0,                 [r0 + 2 * r1]
    lea                   r2,                 [r2 + 2 * r3]
%endrep
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 8
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 24
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 8
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 24
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 64
%endif
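
; PROCESS_LUMA_VERT_S_48x4_AVX512 assembles a 48x4 block in three passes:
; PROCESS_LUMA_VERT_S_32x2_AVX512 covers rows 0-1 of columns 0-31, the code
; below repeats that scheme for rows 2-3, and the 16-column tail (source
; offset mmsize, i.e. column 32) packs two rows per ZMM via vinserti32x8.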

%macro PROCESS_LUMA_VERT_S_48x4_AVX512 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    movu                 m1,                  [r0 + 2 * r1]
    movu                 m3,                  [r0 + r7]
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 m4,                  [r0 + 4 * r1]
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 m5,                  [r6 + r1]
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    lea                  r4,                  [r6 + 4 * r1]

    movu                 m4,                  [r6 + 2 * r1]
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 m11,                 [r6 + r7]
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 m12,                 [r4]
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 m13,                 [r4 + r1]
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18
    paddd                m8,                  m14
    paddd                m4,                  m12
    movu                 m12,                 [r4 + 2 * r1]
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18
    paddd                m10,                 m14
    paddd                m11,                 m13

    paddd                m0,                  m8
    paddd                m1,                  m4
    paddd                m2,                  m10
    paddd                m3,                  m11
%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m20,                   m0
    movu                 [r2 + 2 * r3],       ym0
    vextracti32x8        [r2 + r5],           m0,                    1
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    movu                 [r2 + 2 * r3],       m0
    movu                 [r2 + r5],           m2
%endif
    movu                 ym1,                 [r0 + mmsize]
    movu                 ym3,                 [r0 + r1 + mmsize]
    vinserti32x8         m1,                  [r0 + 2 * r1 + mmsize], 1
    vinserti32x8         m3,                  [r0 + r7 + mmsize],     1
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 ym4,                 [r0 + 2 * r1 + mmsize]
    vinserti32x8         m4,                  [r6 + mmsize],          1
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 ym5,                 [r0 + r7 + mmsize]
    vinserti32x8         m5,                  [r6 + r1 + mmsize],     1
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 ym4,                 [r6 + mmsize]
    vinserti32x8         m4,                  [r6 + 2 * r1 + mmsize], 1
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 ym11,                [r6 + r1 + mmsize]
    vinserti32x8         m11,                 [r6 + r7 + mmsize],     1
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 ym12,                [r6 + 2 * r1 + mmsize]
    vinserti32x8         m12,                 [r6 + 4 * r1 + mmsize], 1
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 ym13,                [r6 + r7 + mmsize]
    vinserti32x8         m13,                 [r4 + r1 + mmsize],     1
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18
    paddd                m8,                  m14
    paddd                m4,                  m12
    movu                 ym12,                [r6 + 4 * r1 + mmsize]
    vinserti32x8         m12,                 [r4 + 2 * r1 + mmsize], 1
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18
    paddd                m10,                 m14
    paddd                m11,                 m13

    paddd                m0,                  m8
    paddd                m1,                  m4
    paddd                m2,                  m10
    paddd                m3,                  m11
%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m20,                   m0
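    ; the qword permute leaves the four 16-byte lanes holding rows 0, 2, 1
    ; and 3, which the store offsets below account for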
    movu                 [r2 + mmsize/2],                xm0
    vextracti32x4        [r2 + r3 + mmsize/2], m0,                    2
    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,          1
    vextracti32x4        [r2 + r5 + mmsize/2],           m0,          3
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3

    movu                 [r2 + mmsize],                ym0
    movu                 [r2 + r3 + mmsize],           ym2
    vextracti32x8        [r2 + 2 * r3 + mmsize],       m0,                1
    vextracti32x8        [r2 + r5 + mmsize],           m2,                1
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_S_LUMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_48x64, 5, 8, 21
    add                   r1d,                r1d
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [pw_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4       m19,                [pd_526336]
    mova                  m20,                [interp8_vsp_store_avx512]
%else
    add                   r3d,                r3d
%endif

    lea                   r5,                 [3 * r3]
%rep 15
    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_48x64_AVX512 ss
    FILTER_VER_S_LUMA_48x64_AVX512 sp
%endif
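
; PROCESS_LUMA_VERT_S_64x2_AVX512 is two 32-column passes over the same two
; output rows: PROCESS_LUMA_VERT_S_32x2_AVX512 handles columns 0-31 and the
; code below repeats it for columns 32-63 (source offset mmsize).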

%macro PROCESS_LUMA_VERT_S_64x2_AVX512 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    movu                 m1,                  [r0 + mmsize]                  ;0 row
    movu                 m3,                  [r0 + r1 + mmsize]             ;1 row
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 m4,                  [r0 + 2 * r1 + mmsize]         ;2 row
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 m5,                  [r0 + r7 + mmsize]             ;3 row
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 m4,                  [r0 + 4 * r1 + mmsize]         ;4 row
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 m11,                 [r6 + r1 + mmsize]             ;5 row
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 m12,                 [r6 + 2 * r1 + mmsize]         ;6 row
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 m13,                 [r6 + r7 + mmsize]             ;7 row
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18
    paddd                m8,                  m14
    paddd                m4,                  m12
    movu                 m12,                 [r6 + 4 * r1 + mmsize]         ; 8 row
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18
    paddd                m10,                 m14
    paddd                m11,                 m13

    paddd                m0,                  m8
    paddd                m1,                  m4
    paddd                m2,                  m10
    paddd                m3,                  m11
%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m20,                   m0
    movu                 [r2 + mmsize/2],     ym0
    vextracti32x8        [r2 + r3 + mmsize/2], m0,                    1
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6
    packssdw             m0,                  m1
    packssdw             m2,                  m3
    movu                 [r2 + mmsize],       m0
    movu                 [r2 + r3 + mmsize],  m2
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ss(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_S_LUMA_64xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_64x%2, 5, 8, 21
    add                   r1d,                r1d
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [pw_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4       m19,                [pd_526336]
    mova                  m20,                [interp8_vsp_store_avx512]
%else
    add                   r3d,                r3d
%endif

%rep %2/2 - 1
    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
    lea                   r0,                 [r0 + 2 * r1]
    lea                   r2,                 [r2 + 2 * r3]
%endrep
    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 48
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 48
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 64
%endif
;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vss code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vpp and luma_vps code start
;-------------------------------------------------------------------------------------------------------------
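; These variants read 8-bit pixels, so the filter taps come from the byte
; table tab_LumaCoeffVer_32_avx512 and are applied with pmaddubsw. In scalar
; terms (an illustrative model only):
;     pp: dst[x] = clip_uint8((sum + 32) >> 6)   ; pmulhrsw with pw_512
;     ps: dst[x] = (int16_t)(sum - 8192)         ; psubw with pw_2000
; The ps variants double the destination stride because they store 16-bit
; intermediates.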
%macro PROCESS_LUMA_VERT_16x8_AVX512 1
    lea                   r5,                 [r0 + 4 * r1]
    lea                   r4,                 [r5 + 4 * r1]
    movu                  xm1,                [r0]
    vinserti32x4          m1,                 [r0 + 2 * r1],       1
    vinserti32x4          m1,                 [r5],                2
    vinserti32x4          m1,                 [r5 + 2 * r1],       3
    movu                  xm3,                [r0 + r1]
    vinserti32x4          m3,                 [r0 + r6],           1
    vinserti32x4          m3,                 [r5 + r1],           2
    vinserti32x4          m3,                 [r5 + r6],           3
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  xm4,                [r0 + 2 * r1]
    vinserti32x4          m4,                 [r0 + 4 * r1],       1
    vinserti32x4          m4,                 [r5 + 2 * r1],       2
    vinserti32x4          m4,                 [r5 + 4 * r1],       3
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    movu                  xm5,                [r0 + r6]
    vinserti32x4          m5,                 [r5 + r1],           1
    vinserti32x4          m5,                 [r5 + r6],           2
    vinserti32x4          m5,                 [r4 + r1],           3
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9

    paddw                 m0,                 m6
    paddw                 m1,                 m4

    movu                  xm4,                [r0 + 4 * r1]
    vinserti32x4          m4,                 [r5 + 2 * r1],       1
    vinserti32x4          m4,                 [r5 + 4 * r1],       2
    vinserti32x4          m4,                 [r4 + 2 * r1],       3
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9

    paddw                 m2,                 m6
    paddw                 m3,                 m5

    movu                  xm15,               [r5 + r1]
    vinserti32x4          m15,                [r5 + r6],           1
    vinserti32x4          m15,                [r4 + r1],           2
    vinserti32x4          m15,                [r4 + r6],           3
    punpcklbw             m12,                m4,                 m15
    pmaddubsw             m12,                m10
    punpckhbw             m13,                m4,                 m15
    pmaddubsw             m13,                m10

    lea                   r8,                 [r4 + 4 * r1]
    movu                  xm4,                [r5 + 2 * r1]
    vinserti32x4          m4,                 [r5 + 4 * r1],       1
    vinserti32x4          m4,                 [r4 + 2 * r1],       2
    vinserti32x4          m4,                 [r4 + 4 * r1],       3
    punpcklbw             m14,                m15,                 m4
    pmaddubsw             m14,                m10
    punpckhbw             m15,                m4
    pmaddubsw             m15,                m10

    movu                  xm5,                [r5 + r6]
    vinserti32x4          m5,                 [r4 + r1],           1
    vinserti32x4          m5,                 [r4 + r6],           2
    vinserti32x4          m5,                 [r8 + r1],           3
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m11
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m11

    paddw                 m12,                m6
    paddw                 m13,                m4

    movu                  xm4,                [r5 + 4 * r1]
    vinserti32x4          m4,                 [r4 + 2 * r1],       1
    vinserti32x4          m4,                 [r4 + 4 * r1],       2
    vinserti32x4          m4,                 [r8 + 2 * r1],       3
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m11
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m11

    paddw                 m14,                m6
    paddw                 m15,                m5

    paddw                 m0,                 m12
    paddw                 m1,                 m13
    paddw                 m2,                 m14
    paddw                 m3,                 m15
%ifidn %1,pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7

    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               xm0
    movu                  [r2 + r3],          xm2
    vextracti32x4         [r2 + 2 * r3],      m0,                  1
    vextracti32x4         [r2 + r7],          m2,                  1
    lea                   r2,                 [r2 + 4 * r3]
    vextracti32x4         [r2],               m0,                  2
    vextracti32x4         [r2 + r3],          m2,                  2
    vextracti32x4         [r2 + 2 * r3],      m0,                  3
    vextracti32x4         [r2 + r7],          m2,                  3
%else
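    ; ps path: subtract the pw_2000 offset, then regroup the row-interleaved
    ; words into whole rows with the interp4_vps_store1/2_avx512 vpermi2q
    ; index tables before the stores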
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m12,                 m16
    mova                  m13,                 m17
    vpermi2q              m12,                 m0,                m1
    vpermi2q              m13,                 m0,                m1
    movu                  [r2],               ym12
    vextracti32x8         [r2 + 2 * r3],      m12,                 1

    psubw                 m2,                 m7
    psubw                 m3,                 m7
    mova                  m14,                 m16
    mova                  m15,                 m17
    vpermi2q              m14,                 m2,                m3
    vpermi2q              m15,                 m2,                m3
    movu                  [r2 + r3],          ym14
    vextracti32x8         [r2 + r7],          m14,                 1
    lea                   r2,                 [r2 + 4 * r3]

    movu                  [r2],               ym13
    movu                  [r2 + r3],          ym15
    vextracti32x8         [r2 + 2 * r3],      m13,                 1
    vextracti32x8         [r2 + r7],          m15,                 1
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VERT_LUMA_16xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_16x%2, 5, 9, 18
    mov                   r4d,                r4m
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + 1 * mmsize]
    mova                  m10,                [r5 + r4 + 2 * mmsize]
    mova                  m11,                [r5 + r4 + 3 * mmsize]
%else
    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
%endif
%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
    mova                  m16,                [interp4_vps_store1_avx512]
    mova                  m17,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]
    sub                   r0,                 r6

%rep %2/8 - 1
    PROCESS_LUMA_VERT_16x8_AVX512 %1
    lea                   r0,                 [r4]                ; r4 = r0 + 8 * r1, set in PROCESS_LUMA_VERT_16x8_AVX512
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_16x8_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VERT_LUMA_16xN_AVX512 pp, 8
    FILTER_VERT_LUMA_16xN_AVX512 pp, 16
    FILTER_VERT_LUMA_16xN_AVX512 pp, 32
    FILTER_VERT_LUMA_16xN_AVX512 pp, 64

    FILTER_VERT_LUMA_16xN_AVX512 ps, 8
    FILTER_VERT_LUMA_16xN_AVX512 ps, 16
    FILTER_VERT_LUMA_16xN_AVX512 ps, 32
    FILTER_VERT_LUMA_16xN_AVX512 ps, 64
%endif
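
; PROCESS_LUMA_VERT_32x4_AVX512 pairs two 32-pixel rows per ZMM via
; vinserti32x8, so one invocation filters four output rows; its ps stores
; use the same vpermi2q reordering as the 16xN path.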
%macro PROCESS_LUMA_VERT_32x4_AVX512 1
    lea                   r5,                 [r0 + 4 * r1]
    movu                  ym1,                [r0]
    vinserti32x8          m1,                 [r0 + 2 * r1],       1
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m3,                 [r0 + r6],           1
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    movu                  ym5,                [r0 + r6]
    vinserti32x8          m5,                 [r5 + r1],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9

    paddw                 m0,                 m6
    paddw                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1]
    vinserti32x8          m4,                 [r5 + 2 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9

    paddw                 m2,                 m6
    paddw                 m3,                 m5

    lea                   r4,                 [r5 + 4 * r1]
    movu                  ym15,               [r5 + r1]
    vinserti32x8          m15,                [r5 + r6],           1
    punpcklbw             m12,                m4,                 m15
    pmaddubsw             m12,                m10
    punpckhbw             m13,                m4,                 m15
    pmaddubsw             m13,                m10

    movu                  ym4,                [r5 + 2 * r1]
    vinserti32x8          m4,                 [r5 + 4 * r1],       1
    punpcklbw             m14,                m15,                 m4
    pmaddubsw             m14,                m10
    punpckhbw             m15,                m4
    pmaddubsw             m15,                m10

    movu                  ym5,                [r5 + r6]
    vinserti32x8          m5,                 [r4 + r1],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m11
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m11

    paddw                 m12,                m6
    paddw                 m13,                m4

    movu                  ym4,                [r5 + 4 * r1]
    vinserti32x8          m4,                 [r4 + 2 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m11
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m11

    paddw                 m14,                m6
    paddw                 m15,                m5

    paddw                 m0,                 m12
    paddw                 m1,                 m13
    paddw                 m2,                 m14
    paddw                 m3,                 m15
%ifidn %1,pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7

    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r7],          m2,                  1
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m12,                 m16
    mova                  m13,                 m17
    vpermi2q              m12,                 m0,                m1
    vpermi2q              m13,                 m0,                m1
    movu                  [r2],               m12
    movu                  [r2 + 2 * r3],      m13

    psubw                 m2,                 m7
    psubw                 m3,                 m7
    mova                  m14,                 m16
    mova                  m15,                 m17
    vpermi2q              m14,                 m2,                m3
    vpermi2q              m15,                 m2,                m3
    movu                  [r2 + r3],          m14
    movu                  [r2 + r7],          m15
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VERT_LUMA_32xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_32x%2, 5, 8, 18
    mov                   r4d,                r4m
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + 1 * mmsize]
    mova                  m10,                [r5 + r4 + 2 * mmsize]
    mova                  m11,                [r5 + r4 + 3 * mmsize]
%else
    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
%endif
%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
    mova                  m16,                [interp4_vps_store1_avx512]
    mova                  m17,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]
    sub                   r0,                 r6

%rep %2/4 - 1
    PROCESS_LUMA_VERT_32x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_32x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VERT_LUMA_32xN_AVX512 pp, 8
    FILTER_VERT_LUMA_32xN_AVX512 pp, 16
    FILTER_VERT_LUMA_32xN_AVX512 pp, 24
    FILTER_VERT_LUMA_32xN_AVX512 pp, 32
    FILTER_VERT_LUMA_32xN_AVX512 pp, 64

    FILTER_VERT_LUMA_32xN_AVX512 ps, 8
    FILTER_VERT_LUMA_32xN_AVX512 ps, 16
    FILTER_VERT_LUMA_32xN_AVX512 ps, 24
    FILTER_VERT_LUMA_32xN_AVX512 ps, 32
    FILTER_VERT_LUMA_32xN_AVX512 ps, 64
%endif
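
; PROCESS_LUMA_VERT_48x8_AVX512 filters the first 32 columns in two 4-row
; passes (PROCESS_LUMA_VERT_32x4_AVX512 for rows 0-3, the code below for
; rows 4-7) and then the 16-column tail at byte offset mmsize/2 (column 32),
; gathering four rows per ZMM via vinserti32x4.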
%macro PROCESS_LUMA_VERT_48x8_AVX512 1
    PROCESS_LUMA_VERT_32x4_AVX512 %1
    lea                   r8,                 [r4 + 4 * r1]
    lea                   r9,                 [r2 + 4 * r3]
    movu                  ym1,                [r5]
    vinserti32x8          m1,                 [r5 + 2 * r1],       1
    movu                  ym3,                [r5 + r1]
    vinserti32x8          m3,                 [r5 + r6],           1
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  ym4,                [r5 + 2 * r1]
    vinserti32x8          m4,                 [r5 + 4 * r1],       1
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    movu                  ym5,                [r5 + r6]
    vinserti32x8          m5,                 [r4 + r1],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9

    paddw                 m0,                 m6
    paddw                 m1,                 m4

    movu                  ym4,                [r5 + 4 * r1]
    vinserti32x8          m4,                 [r4 + 2 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9

    paddw                 m2,                 m6
    paddw                 m3,                 m5

    movu                  ym15,               [r4 + r1]
    vinserti32x8          m15,                [r4 + r6],           1
    punpcklbw             m12,                m4,                 m15
    pmaddubsw             m12,                m10
    punpckhbw             m13,                m4,                 m15
    pmaddubsw             m13,                m10

    movu                  ym4,                [r4 + 2 * r1]
    vinserti32x8          m4,                 [r4 + 4 * r1],       1
    punpcklbw             m14,                m15,                 m4
    pmaddubsw             m14,                m10
    punpckhbw             m15,                m4
    pmaddubsw             m15,                m10

    movu                  ym5,                [r4 + r6]
    vinserti32x8          m5,                 [r8 + r1],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m11
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m11

    paddw                 m12,                m6
    paddw                 m13,                m4

    movu                  ym4,                [r4 + 4 * r1]
    vinserti32x8          m4,                 [r8 + 2 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m11
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m11

    paddw                 m14,                m6
    paddw                 m15,                m5

    paddw                 m0,                 m12
    paddw                 m1,                 m13
    paddw                 m2,                 m14
    paddw                 m3,                 m15
%ifidn %1,pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7
    packuswb              m0,                 m1
    packuswb              m2,                 m3

    movu                  [r9],               ym0
    movu                  [r9 + r3],          ym2
    vextracti32x8         [r9 + 2 * r3],      m0,                  1
    vextracti32x8         [r9 + r7],          m2,                  1
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m12,                 m16
    mova                  m13,                 m17
    vpermi2q              m12,                 m0,                m1
    vpermi2q              m13,                 m0,                m1
    movu                  [r9],               m12
    movu                  [r9 + 2 * r3],      m13

    psubw                 m2,                 m7
    psubw                 m3,                 m7
    mova                  m14,                 m16
    mova                  m15,                 m17
    vpermi2q              m14,                 m2,                m3
    vpermi2q              m15,                 m2,                m3
    movu                  [r9 + r3],          m14
    movu                  [r9 + r7],          m15
%endif
    movu                  xm1,                [r0 + mmsize/2]
    vinserti32x4          m1,                 [r0 + 2 * r1 + mmsize/2],       1
    vinserti32x4          m1,                 [r5 + mmsize/2],                2
    vinserti32x4          m1,                 [r5 + 2 * r1 + mmsize/2],       3
    movu                  xm3,                [r0 + r1 + mmsize/2]
    vinserti32x4          m3,                 [r0 + r6 + mmsize/2],           1
    vinserti32x4          m3,                 [r5 + r1 + mmsize/2],           2
    vinserti32x4          m3,                 [r5 + r6 + mmsize/2],           3
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
    vinserti32x4          m4,                 [r0 + 4 * r1 + mmsize/2],       1
    vinserti32x4          m4,                 [r5 + 2 * r1 + mmsize/2],       2
    vinserti32x4          m4,                 [r5 + 4 * r1 + mmsize/2],       3
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    movu                  xm5,                [r0 + r6 + mmsize/2]
    vinserti32x4          m5,                 [r5 + r1 + mmsize/2],           1
    vinserti32x4          m5,                 [r5 + r6 + mmsize/2],           2
    vinserti32x4          m5,                 [r4 + r1 + mmsize/2],           3
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9

    paddw                 m0,                 m6
    paddw                 m1,                 m4

    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
    vinserti32x4          m4,                 [r5 + 2 * r1 + mmsize/2],       1
    vinserti32x4          m4,                 [r5 + 4 * r1 + mmsize/2],       2
    vinserti32x4          m4,                 [r4 + 2 * r1 + mmsize/2],       3
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9

    paddw                 m2,                 m6
    paddw                 m3,                 m5

    movu                  xm15,               [r5 + r1 + mmsize/2]
    vinserti32x4          m15,                [r5 + r6 + mmsize/2],           1
    vinserti32x4          m15,                [r4 + r1 + mmsize/2],           2
    vinserti32x4          m15,                [r4 + r6 + mmsize/2],           3
    punpcklbw             m12,                m4,                 m15
    pmaddubsw             m12,                m10
    punpckhbw             m13,                m4,                 m15
    pmaddubsw             m13,                m10

    movu                  xm4,                [r5 + 2 * r1 + mmsize/2]
    vinserti32x4          m4,                 [r5 + 4 * r1 + mmsize/2],       1
    vinserti32x4          m4,                 [r4 + 2 * r1 + mmsize/2],       2
    vinserti32x4          m4,                 [r4 + 4 * r1 + mmsize/2],       3
    punpcklbw             m14,                m15,                 m4
    pmaddubsw             m14,                m10
    punpckhbw             m15,                m4
    pmaddubsw             m15,                m10

    movu                  xm5,                [r5 + r6 + mmsize/2]
    vinserti32x4          m5,                 [r4 + r1 + mmsize/2],           1
    vinserti32x4          m5,                 [r4 + r6 + mmsize/2],           2
    vinserti32x4          m5,                 [r8 + r1 + mmsize/2],           3
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m11
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m11

    paddw                 m12,                m6
    paddw                 m13,                m4

    movu                  xm4,                [r5 + 4 * r1 + mmsize/2]
    vinserti32x4          m4,                 [r4 + 2 * r1 + mmsize/2],       1
    vinserti32x4          m4,                 [r4 + 4 * r1 + mmsize/2],       2
    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       3
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m11
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m11

    paddw                 m14,                m6
    paddw                 m15,                m5

    paddw                 m0,                 m12
    paddw                 m1,                 m13
    paddw                 m2,                 m14
    paddw                 m3,                 m15
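    ; Output stage for the 16-column half: pp stores at byte offset mmsize/2,
    ; ps at byte offset mmsize (32 columns of int16_t output = 64 bytes)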
%ifidn %1, pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7

    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2 + mmsize/2],               xm0
    movu                  [r2 + r3 + mmsize/2],          xm2
    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
    lea                   r2,                 [r2 + 4 * r3]
    vextracti32x4         [r2 + mmsize/2],               m0,                  2
    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m12,                m16
    mova                  m13,                m17
    vpermi2q              m12,                m0,                 m1
    vpermi2q              m13,                m0,                 m1
    movu                  [r2 + mmsize],               ym12
    vextracti32x8         [r2 + 2 * r3 + mmsize],      m12,                 1

    psubw                 m2,                 m7
    psubw                 m3,                 m7
    mova                  m14,                m16
    mova                  m15,                m17
    vpermi2q              m14,                m2,                 m3
    vpermi2q              m15,                m2,                 m3
    movu                  [r2 + r3 + mmsize],          ym14
    vextracti32x8         [r2 + r7 + mmsize],          m14,                 1
    lea                   r2,                          [r2 + 4 * r3]

    movu                  [r2 + mmsize],               ym13
    movu                  [r2 + r3 + mmsize],          ym15
    vextracti32x8         [r2 + 2 * r3 + mmsize],      m13,                 1
    vextracti32x8         [r2 + r7 + mmsize],          m15,                 1
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
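; Per-pixel math implemented by the macros below, as a hedged C sketch
; (illustrative only; the exact C primitive lives in source/common/ipfilter.cpp):
;
;     int sum = 0;                                 // c[] is the 8-tap set, sums to 64
;     for (int t = 0; t < 8; t++)
;         sum += c[t] * src[(t - 3) * srcStride];
;     if (pp) *dst = clip_uint8((sum + 32) >> 6);  // pmulhrsw with pw_512
;     else    *dst = (int16_t)(sum - 8192);        // psubw with pw_2000 (IF_INTERNAL_OFFS)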
%macro FILTER_VERT_LUMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_48x64, 5, 10, 18
    mov                   r4d,                r4m
    shl                   r4d,                8
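    ; coeffIdx * 256: one entry of tab_LumaCoeffVer_32_avx512 is four
    ; mmsize (64-byte) registers of broadcast coefficient pairs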

%ifdef PIC
    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + 1 * mmsize]
    mova                  m10,                [r5 + r4 + 2 * mmsize]
    mova                  m11,                [r5 + r4 + 3 * mmsize]
%else
    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
%endif
%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
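    ; m16/m17: qword-permute tables; vpermi2q uses them below to put the
    ; lane-interleaved 16-bit results back into linear row order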
    mova                  m16,                [interp4_vps_store1_avx512]
    mova                  m17,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]
    sub                   r0,                 r6
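    ; r0 now points three rows above the block: the eight taps cover rows -3 .. +4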

%rep 7
    PROCESS_LUMA_VERT_48x8_AVX512 %1
    lea                   r0,                 [r4]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_48x8_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VERT_LUMA_48x64_AVX512 pp
    FILTER_VERT_LUMA_48x64_AVX512 ps
%endif
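; Filters two full 64-pixel rows per invocation: each 64-byte source row
; fills one zmm, so rows pair directly via punpck{l,h}bw and the coefficient
; registers m8-m11 accumulate the eight taps two at a time.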
%macro PROCESS_LUMA_VERT_64x2_AVX512 1
    lea                   r5,                 [r0 + 4 * r1]
    movu                  m1,                 [r0]
    movu                  m3,                 [r0 + r1]
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  m4,                 [r0 + 2 * r1]
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    movu                  m5,                 [r0 + r6]
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9

    paddw                 m0,                 m6
    paddw                 m1,                 m4

    movu                  m4,                 [r0 + 4 * r1]
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9

    paddw                 m2,                 m6
    paddw                 m3,                 m5

    movu                  m15,                [r5 + r1]
    punpcklbw             m12,                m4,                  m15
    pmaddubsw             m12,                m10
    punpckhbw             m13,                m4,                  m15
    pmaddubsw             m13,                m10

    movu                  m4,                 [r5 + 2 * r1]
    punpcklbw             m14,                m15,                 m4
    pmaddubsw             m14,                m10
    punpckhbw             m15,                m4
    pmaddubsw             m15,                m10

    movu                  m5,                 [r5 + r6]
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m11
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m11

    paddw                 m12,                m6
    paddw                 m13,                m4

    movu                  m4,                 [r5 + 4 * r1]
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m11
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m11

    paddw                 m14,                m6
    paddw                 m15,                m5

    paddw                 m0,                 m12
    paddw                 m1,                 m13
    paddw                 m2,                 m14
    paddw                 m3,                 m15
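    ; Same output stage as the 48x8 macro, but whole 64-byte rows store directly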
%ifidn %1, pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7

    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               m0
    movu                  [r2 + r3],          m2
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m12,                m16
    mova                  m13,                m17
    vpermi2q              m12,                m0,                 m1
    vpermi2q              m13,                m0,                 m1
    movu                  [r2],               m12
    movu                  [r2 + mmsize],      m13

    psubw                 m2,                 m7
    psubw                 m3,                 m7
    mova                  m14,                m16
    mova                  m15,                m17
    vpermi2q              m14,                m2,                 m3
    vpermi2q              m15,                m2,                 m3
    movu                  [r2 + r3],          m14
    movu                  [r2 + r3 + mmsize], m15
%endif
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_8tap_vert_ps(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
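; 64xN driver: coefficient setup matches the 48x64 case; the loop below runs
; the 64x2 macro N/2 times, advancing src and dst by two rows per iteration.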
%macro FILTER_VERT_LUMA_64xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_64x%2, 5, 8, 18
    mov                   r4d,                r4m
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + 1 * mmsize]
    mova                  m10,                [r5 + r4 + 2 * mmsize]
    mova                  m11,                [r5 + r4 + 3 * mmsize]
%else
    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
%endif
%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
    mova                  m16,                [interp4_vps_store1_avx512]
    mova                  m17,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    sub                   r0,                 r6
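    ; as above: start three source rows up so the taps cover rows -3 .. +4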
    lea                   r7,                 [3 * r3]

%rep %2/2 - 1
    PROCESS_LUMA_VERT_64x2_AVX512 %1
    lea                   r0,                 [r0 + 2 * r1]
    lea                   r2,                 [r2 + 2 * r3]
%endrep
    PROCESS_LUMA_VERT_64x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
FILTER_VERT_LUMA_64xN_AVX512 pp, 16
FILTER_VERT_LUMA_64xN_AVX512 pp, 32
FILTER_VERT_LUMA_64xN_AVX512 pp, 48
FILTER_VERT_LUMA_64xN_AVX512 pp, 64

FILTER_VERT_LUMA_64xN_AVX512 ps, 16
FILTER_VERT_LUMA_64xN_AVX512 ps, 32
FILTER_VERT_LUMA_64xN_AVX512 ps, 48
FILTER_VERT_LUMA_64xN_AVX512 ps, 64
%endif
;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vpp and luma_vps code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;ipfilter_luma_avx512 code end
;-------------------------------------------------------------------------------------------------------------