view source/common/x86/intrapred8.asm @ 9572:e2bd981f1ea8

asm-avx2: cleanup redundant instruction
author Praveen Tiwari <praveen@multicorewareinc.com>
date Wed, 25 Feb 2015 16:02:10 +0530
;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

pb_0_8        times 8 db  0,  8
pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
pb_swap8:     times 2 db  7,  6,  5,  4,  3,  2,  1,  0
c_trans_4x4           db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
tab_Si:               db  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
pb_fact0:             db  0,  2,  4,  6,  8, 10, 12, 14,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_12_0:        db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,  7,  0
c_mode32_13_0:        db  3,  6, 10, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_13_shuf:     db  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  3,  2,  1,  0
c_mode32_14_shuf:     db 15, 14, 13,  0,  2,  3,  4,  5,  6,  7, 10, 11, 12, 13, 14, 15
c_mode32_14_0:        db 15, 12, 10,  7,  5,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_15_0:        db 15, 13, 11,  9,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0
c_mode32_16_0:        db 15, 14, 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0
c_mode32_17_0:        db 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0
c_mode32_18_0:        db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
tab_S1:               db 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0
pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
tab_S2:         db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0

c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20:         db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src3_11_4_12:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_14_8:          db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src5_13_5_13:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_2_28:          db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
c_ang8_src6_14_7_15:  db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
c_ang8_22_16:         db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_21_10:         db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_src2_10_3_11:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_20:         db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src4_12_4_12:  times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_9_30:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_src5_13_6_14:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
c_ang8_19_8:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8

; planar weight tables: pw_planarN_0 = (blkSize - 1 - x) per column, pw_planarN_1 = (blkSize - 1)
; broadcast; pw_planar32_L/H hold the (blkSize - 1 - x) weights for columns 0..15 of the 32-wide
; block (columns 16..31 reuse pw_planar16_0 and pw_planar8_0)
pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
pw_planar8_0:         dw 7,  6,  5,  4,  3,  2,  1,  0
pw_planar8_1:         dw 7,  7,  7,  7,  7,  7,  7,  7
pw_planar16_0:        dw 15, 14, 13, 12, 11, 10, 9,  8
pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16

trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7

const ang_table
%assign x 0
%rep 32
    times 8 db (32-x), x
%assign x x+1
%endrep
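; ang_table: one 16-byte row per fraction x in 0..31, holding eight (32 - x, x) byte
; pairs.  These are the pmaddubsw coefficients of the angular kernels: applied to an
; interleaved reference pair [p1 p0], each lane yields (32 - x) * p0 + x * p1.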

SECTION .text

cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_1024
cextern pb_unpackbd1
cextern multiL
cextern multiH
cextern multiH2
cextern multiH3
cextern multi_2Row

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
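; NOTE: reference behaviour of the DC kernels below (derived from the code; it matches
; the usual HEVC DC rule).  srcPix layout: [topLeft, above[0..2N-1], left[0..2N-1]],
; N = blkSize:
;   dcVal     = (sum(above[0..N-1]) + sum(left[0..N-1]) + N) >> (log2(N) + 1)
;   if bFilter (4x4, 8x8 and 16x16 only):
;     dst[0][0] = (above[0] + 2*dcVal + left[0] + 2) >> 2
;     dst[0][x] = (above[x] + 3*dcVal + 2) >> 2      for x > 0
;     dst[y][0] = (left[y]  + 3*dcVal + 2) >> 2      for y > 0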
INIT_XMM sse4
cglobal intra_pred_dc4, 5,5,3
    inc         r2
    pxor        m0, m0
    movd        m1, [r2]
    movd        m2, [r2 + 8]
    punpckldq   m1, m2
    psadbw      m1, m0              ; m1 = sum

    test        r4d, r4d
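    ; NOTE: the ZF set here is consumed by the 'jz .end' below; none of the intervening
    ; instructions (mov/movd/pmulhrsw/pshufb/lea/stores) writes eflags.  The 8x8 and
    ; 16x16 kernels use the same trick.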

    mov         r4d, 4096
    movd        m2, r4d
    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
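    ; pmulhrsw(a, b) = (a * b + 0x4000) >> 15, so b = 4096 turns the raw 8-pixel sum
    ; into the rounded average (sum + 4) >> 3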
    movd        r4d, m1             ; r4d = dc_val
    pshufb      m1, m0              ; m1 = byte [dc_val ...]

    ; store DC 4x4
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    ; do DC filter
    jz         .end
    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
    add         r4d, r3d            ; r4d = DC * 3 + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0           ; m1 = pixDCx3

    ; filter top
    pmovzxbw    m2, [r2]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2            ; overwrites the top-left pixel; it is fixed up just below

    ; filter top-left
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left
    add         r0, r1
    pmovzxbw    m2, [r2 + 9]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    pextrb      [r0], m2, 0
    pextrb      [r0 + r1], m2, 1
    pextrb      [r0 + r1 * 2], m2, 2

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc8, 5, 7, 3
    lea             r3, [r2 + 17]
    inc             r2
    pxor            m0,            m0
    movh            m1,            [r2]
    movh            m2,            [r3]
    punpcklqdq      m1,            m2
    psadbw          m1,            m0
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    movd            r5d,           m1
    add             r5d,           8
    shr             r5d,           4     ; dcVal = (sum + 8) >> 4
    movd            m1,            r5d
    pshufb          m1,            m0    ; m1 = byte [dc_val ...]

    test            r4d,           r4d

    ; store DC 8x8
    mov             r6,            r0
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movh            [r0],          m1
    movh            [r0 + r1],     m1

    ; Do DC Filter
    jz              .end
    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
    add             r5d,           r4d            ; r5d = DC * 3 + 2
    movd            m1,            r5d
    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
    pshufd          m1,            m1, 0

    ; filter top
    pmovzxbw        m2,            [r2]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    movh            [r6],          m2

    ; filter top-left
    movzx           r5d, byte      [r3]
    add             r4d,           r5d
    movzx           r3d, byte      [r2]
    add             r3d,           r4d
    shr             r3d,           2
    mov             [r6],          r3b

    ; filter left
    add             r6,            r1
    pmovzxbw        m2,            [r2 + 17]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    pextrb          [r6],          m2, 0
    pextrb          [r6 + r1],     m2, 1
    pextrb          [r6 + 2 * r1], m2, 2
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 3
    pextrb          [r6 + r1 * 2], m2, 4
    pextrb          [r6 + r1 * 4], m2, 6
    lea             r1,            [r1 * 3]
    pextrb          [r6 + r1],     m2, 5

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
    lea             r3, [r2 + 33]
    inc             r2
    pxor            m0,            m0
    movu            m1,            [r2]
    movu            m2,            [r3]
    psadbw          m1,            m0
    psadbw          m2,            m0
    paddw           m1,            m2
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    movd            r5d,           m1
    add             r5d,           16
    shr             r5d,           5     ; dcVal = (sum + 16) >> 5
    movd            m1,            r5d
    pshufb          m1,            m0    ; m1 = byte [dc_val ...]

    test            r4d,           r4d

    ; store DC 16x16
    mov             r6,            r0
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    lea             r0,            [r0 + r1 * 2]
    movu            [r0],          m1
    movu            [r0 + r1],     m1

    ; Do DC Filter
    jz              .end
    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
    add             r5d,           r4d            ; r5d = DC * 3 + 2
    movd            m1,            r5d
    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
    pshufd          m1,            m1, 0

    ; filter top
    pmovzxbw        m2,            [r2]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    movh            [r6],          m2
    pmovzxbw        m3,            [r2 + 8]
    paddw           m3,            m1
    psraw           m3,            2
    packuswb        m3,            m3
    movh            [r6 + 8],      m3

    ; filter top-left
    movzx           r5d, byte      [r3]
    add             r4d,           r5d
    movzx           r3d, byte      [r2]
    add             r3d,           r4d
    shr             r3d,           2
    mov             [r6],          r3b

    ; filter left
    add             r6,            r1
    pmovzxbw        m2,            [r2 + 33]
    paddw           m2,            m1
    psraw           m2,            2
    packuswb        m2,            m2
    pextrb          [r6],          m2, 0
    pextrb          [r6 + r1],     m2, 1
    pextrb          [r6 + r1 * 2], m2, 2
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 3
    pextrb          [r6 + r1 * 2], m2, 4
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 5
    pextrb          [r6 + r1 * 2], m2, 6
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m2, 7

    pmovzxbw        m3,            [r2 + 41]
    paddw           m3,            m1
    psraw           m3,            2
    packuswb        m3,            m3
    pextrb          [r6 + r1 * 2], m3, 0
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m3, 1
    pextrb          [r6 + r1 * 2], m3, 2
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m3, 3
    pextrb          [r6 + r1 * 2], m3, 4
    lea             r6,            [r6 + r1 * 2]
    pextrb          [r6 + r1],     m3, 5
    pextrb          [r6 + r1 * 2], m3, 6

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 3, 5, 5
    lea             r3, [r2 + 65]
    inc             r2
    pxor            m0,            m0
    movu            m1,            [r2]
    movu            m2,            [r2 + 16]
    movu            m3,            [r3]
    movu            m4,            [r3 + 16]
    psadbw          m1,            m0
    psadbw          m2,            m0
    psadbw          m3,            m0
    psadbw          m4,            m0
    paddw           m1,            m2
    paddw           m3,            m4
    paddw           m1,            m3
    pshufd          m2,            m1, 2
    paddw           m1,            m2

    movd            r4d,           m1
    add             r4d,           32
    shr             r4d,           6     ; dcVal = (sum + 32) >> 6
    movd            m1,            r4d
    pshufb          m1,            m0    ; m1 = byte [dc_val ...]

%rep 2
    ; store DC 16x16
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
    movu            [r0],          m1
    movu            [r0 + r1],     m1
    movu            [r0 + 16],     m1
    movu            [r0 + r1 + 16],m1
    lea             r0,            [r0 + 2 * r1]
%endrep

    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
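; NOTE: reference formula of the planar kernels below (derived from the code and the
; table comments above), N = blkSize:
;   dst[y][x] = ((N-1-x)*left[y] + (x+1)*topRight + (N-1-y)*above[x]
;                + (y+1)*bottomLeft + N) >> (log2(N) + 1)
; with topRight = above[N] and bottomLeft = left[N]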
INIT_XMM sse4
cglobal intra_pred_planar4, 3,3,7
    pmovzxbw        m1, [r2 + 1]
    pmovzxbw        m2, [r2 + 9]
    pshufhw         m3, m1, 0               ; topRight
    pshufd          m3, m3, 0xAA
    pshufhw         m4, m2, 0               ; bottomLeft
    pshufd          m4, m4, 0xAA

    pmullw          m3, [multi_2Row]        ; (x + 1) * topRight
    pmullw          m0, m1, [pw_planar4_1]  ; (blkSize - 1 - y) * above[x]
    mova            m6, [pw_planar4_0]
    paddw           m3, [pw_4]
    paddw           m3, m4
    paddw           m3, m0
    psubw           m4, m1
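    ; row recurrence: m3 holds (x+1)*topRight + (3-y)*above[x] + (y+1)*bottomLeft + 4
    ; for the current row y; adding m4 = bottomLeft - above[x] steps it to row y+1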

    pshuflw         m5, m2, 0
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0], m5

    pshuflw         m5, m2, 01010101b
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0 + r1], m5
    lea             r0, [r0 + 2 * r1]

    pshuflw         m5, m2, 10101010b
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0], m5

    pshuflw         m5, m2, 11111111b
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 3
    packuswb        m5, m5
    movd            [r0 + r1], m5
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar8, 3,3,7
    pmovzxbw        m1, [r2 + 1]
    pmovzxbw        m2, [r2 + 17]

    movd            m3, [r2 + 9]            ; topRight   = above[8];
    movd            m4, [r2 + 25]           ; bottomLeft = left[8];

    pxor            m0, m0
    pshufb          m3, m0
    pshufb          m4, m0
    punpcklbw       m3, m0                  ; v_topRight
    punpcklbw       m4, m0                  ; v_bottomLeft

    pmullw          m3, [multiL]            ; (x + 1) * topRight
    pmullw          m0, m1, [pw_planar8_1]  ; (blkSize - 1 - y) * above[x]
    mova            m6, [pw_planar8_0]
    paddw           m3, [pw_8]
    paddw           m3, m4
    paddw           m3, m0
    psubw           m4, m1
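    ; same per-row accumulator as planar4: after each row, m3 += (bottomLeft - above[x])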

%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
    pshuflw         m5, m2, 0x55 * %1
    pshufd          m5, m5, 0
%else
    pshufhw         m5, m2, 0x55 * (%1 - 4)
    pshufd          m5, m5, 0xAA
%endif
    pmullw          m5, m6
    paddw           m5, m3
    paddw           m3, m4
    psraw           m5, 4
    packuswb        m5, m5
    movh            [r0], m5
    lea             r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR8 0
    INTRA_PRED_PLANAR8 1
    INTRA_PRED_PLANAR8 2
    INTRA_PRED_PLANAR8 3
    INTRA_PRED_PLANAR8 4
    INTRA_PRED_PLANAR8 5
    INTRA_PRED_PLANAR8 6
    INTRA_PRED_PLANAR8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar16, 3,3,8
    pmovzxbw        m2, [r2 + 1]
    pmovzxbw        m7, [r2 + 9]

    movd            m3, [r2 + 17]               ; topRight   = above[16]
    movd            m6, [r2 + 49]               ; bottomLeft = left[16]

    pxor            m0, m0
    pshufb          m3, m0
    pshufb          m6, m0
    punpcklbw       m3, m0                      ; v_topRight
    punpcklbw       m6, m0                      ; v_bottomLeft

    pmullw          m4, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m3, [multiL]                ; (x + 1) * topRight
    pmullw          m1, m2, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x]
    pmullw          m5, m7, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x]
    paddw           m4, [pw_16]
    paddw           m3, [pw_16]
    paddw           m4, m6
    paddw           m3, m6
    paddw           m4, m5
    paddw           m3, m1
    psubw           m1, m6, m7
    psubw           m6, m2

    pmovzxbw        m2, [r2 + 33]
    pmovzxbw        m7, [r2 + 41]

%macro INTRA_PRED_PLANAR16 1
%if (%1 < 4)
    pshuflw         m5, m2, 0x55 * %1
    pshufd          m5, m5, 0
%else
%if (%1 < 8)
    pshufhw         m5, m2, 0x55 * (%1 - 4)
    pshufd          m5, m5, 0xAA
%else
%if (%1 < 12)
    pshuflw         m5, m7, 0x55 * (%1 - 8)
    pshufd          m5, m5, 0
%else
    pshufhw         m5, m7, 0x55 * (%1 - 12)
    pshufd          m5, m5, 0xAA
%endif
%endif
%endif
    pmullw          m0, m5, [pw_planar8_0]
    pmullw          m5, [pw_planar16_0]
    paddw           m0, m4
    paddw           m5, m3
    paddw           m3, m6
    paddw           m4, m1
    psraw           m5, 5
    psraw           m0, 5
    packuswb        m5, m0
    movu            [r0], m5
    lea             r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR16 0
    INTRA_PRED_PLANAR16 1
    INTRA_PRED_PLANAR16 2
    INTRA_PRED_PLANAR16 3
    INTRA_PRED_PLANAR16 4
    INTRA_PRED_PLANAR16 5
    INTRA_PRED_PLANAR16 6
    INTRA_PRED_PLANAR16 7
    INTRA_PRED_PLANAR16 8
    INTRA_PRED_PLANAR16 9
    INTRA_PRED_PLANAR16 10
    INTRA_PRED_PLANAR16 11
    INTRA_PRED_PLANAR16 12
    INTRA_PRED_PLANAR16 13
    INTRA_PRED_PLANAR16 14
    INTRA_PRED_PLANAR16 15
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,4,12
%else
cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
  %define           m8  [rsp + 0 * mmsize]
  %define           m9  [rsp + 1 * mmsize]
  %define           m10 [rsp + 2 * mmsize]
  %define           m11 [rsp + 3 * mmsize]
%endif
    movd            m3, [r2 + 33]               ; topRight   = above[32]

    pxor            m7, m7
    pshufb          m3, m7
    punpcklbw       m3, m7                      ; v_topRight

    pmullw          m0, m3, [multiL]            ; (x + 1) * topRight
    pmullw          m1, m3, [multiH]            ; (x + 1) * topRight
    pmullw          m2, m3, [multiH2]           ; (x + 1) * topRight
    pmullw          m3, [multiH3]               ; (x + 1) * topRight

    movd            m6, [r2 + 97]               ; bottomLeft = left[32]
    pshufb          m6, m7
    punpcklbw       m6, m7                      ; v_bottomLeft

    paddw           m0, m6
    paddw           m1, m6
    paddw           m2, m6
    paddw           m3, m6
    paddw           m0, [pw_32]
    paddw           m1, [pw_32]
    paddw           m2, [pw_32]
    paddw           m3, [pw_32]

    pmovzxbw        m4, [r2 + 1]
    pmullw          m5, m4, [pw_planar32_1]
    paddw           m0, m5
    psubw           m5, m6, m4
    mova            m8, m5

    pmovzxbw        m4, [r2 + 9]
    pmullw          m5, m4, [pw_planar32_1]
    paddw           m1, m5
    psubw           m5, m6, m4
    mova            m9, m5

    pmovzxbw        m4, [r2 + 17]
    pmullw          m5, m4, [pw_planar32_1]
    paddw           m2, m5
    psubw           m5, m6, m4
    mova            m10, m5

    pmovzxbw        m4, [r2 + 25]
    pmullw          m5, m4, [pw_planar32_1]
    paddw           m3, m5
    psubw           m5, m6, m4
    mova            m11, m5
    add             r2, 65                      ; (2 * blkSize + 1)
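    ; r2 now points at left[0]; each INTRA_PRED_PLANAR32 below consumes one left pixel
    ; (inc r2) and writes one 32-pixel row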

%macro INTRA_PRED_PLANAR32 0
    movd            m4, [r2]
    pshufb          m4, m7
    punpcklbw       m4, m7

    pmullw          m5, m4, [pw_planar32_L]
    pmullw          m6, m4, [pw_planar32_H]
    paddw           m5, m0
    paddw           m6, m1
    paddw           m0, m8
    paddw           m1, m9
    psraw           m5, 6
    psraw           m6, 6
    packuswb        m5, m6
    movu            [r0], m5

    pmullw          m5, m4, [pw_planar16_0]
    pmullw          m4, [pw_planar8_0]
    paddw           m5, m2
    paddw           m4, m3
    paddw           m2, m10
    paddw           m3, m11
    psraw           m5, 6
    psraw           m4, 6
    packuswb        m5, m4
    movu            [r0 + 16], m5

    lea             r0, [r0 + r1]
    inc             r2
%endmacro

    mov             r3, 4
.loop:
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    dec             r3
    jnz             .loop
    RET

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
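; NOTE: layout of the angular kernels below (derived from the code): each entry for mode
; M in 2..17 also serves its mirrored mode 36 - M.  For the fractional modes the
; bracketed numbers next to the coefficient loads are the per-row fractions f;
; pmaddubsw applies the (32-f, f) pairs from ang_table to interleaved reference pairs,
; then pmulhrsw with pw_1024 does the (v + 16) >> 5 rounding.  Modes 4..9 and 11..17
; jump into mode 3's .do_filter4x4 via mangle(); ZF left by the mode compare selects
; the direct store (mode >= 18) instead of the 4x4 transpose.  Modes 10, 18 and 26
; are handled separately.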
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,5,3
    lea         r4, [r2 + 2]
    add         r2, 10
    cmp         r3m, byte 34
    cmove       r2, r4

    movh        m0, [r2]
    movd        [r0], m0
    palignr     m1, m0, 1
    movd        [r0 + r1], m1
    palignr     m2, m0, 2
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    psrldq      m0, 3
    movd        [r0 + r1], m0
    RET

INIT_XMM sse4
cglobal intra_pred_ang4_3, 3,5,5
    mov         r4, 1
    cmp         r3m, byte 33
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m2, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    palignr     m3, m0, 6       ; [x x x x x x x x 8 7 7 6 6 5 5 4]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 + 6 * 16]   ; [26]
    movhps      m3, [r3]            ; [20]
    movh        m4, [r3 - 6 * 16]   ; [14]
    movhps      m4, [r3 - 12 * 16]  ; [ 8]
    jmp        .do_filter4x4

    ; NOTE: shared path; inputs are m0 = [row1 row0], m2 = [row3 row2], m3/m4 = coefficient pairs, ZF set = no transpose
ALIGN 16
.do_filter4x4:
    mova        m1, [pw_1024]

    pmaddubsw   m0, m3
    pmulhrsw    m0, m1
    pmaddubsw   m2, m4
    pmulhrsw    m2, m1
    packuswb    m0, m2

    ; NOTE: mode 33 does not need the reorder (transpose).  Relying on ZF from the caller's
    ; cmp is fragile, but no instruction executed since that cmp modifies eflags
    jz         .store

    ; transpose 4x4
    pshufb      m0, [c_trans_4x4]

.store:
    ; TODO: use pextrd here once the ssse3 intrinsic version is removed
    movd        [r0], m0
    pextrd      [r0 + r1], m0, 1
    pextrd      [r0 + r1 * 2], m0, 2
    lea         r1, [r1 * 3]
    pextrd      [r0 + r1], m0, 3
    RET

cglobal intra_pred_ang4_4, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 32
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m3, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m1, m3

    lea         r3, [ang_table + 18 * 16]
    movh        m3, [r3 +  3 * 16]  ; [21]
    movhps      m3, [r3 -  8 * 16]  ; [10]
    movh        m4, [r3 + 13 * 16]  ; [31]
    movhps      m4, [r3 +  2 * 16]  ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_5, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 31
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr     m3, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m1, m3

    lea         r3, [ang_table + 10 * 16]
    movh        m3, [r3 +  7 * 16]  ; [17]
    movhps      m3, [r3 -  8 * 16]  ; [ 2]
    movh        m4, [r3 +  9 * 16]  ; [19]
    movhps      m4, [r3 -  6 * 16]  ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_6, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 30
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m2, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq  m0, m0
    punpcklqdq  m2, m2

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 -  6 * 16]  ; [13]
    movhps      m3, [r3 +  7 * 16]  ; [26]
    movh        m4, [r3 - 12 * 16]  ; [ 7]
    movhps      m4, [r3 +  1 * 16]  ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_7, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 29
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m3, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq  m2, m0, m3
    punpcklqdq  m0, m0

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 - 11 * 16]  ; [ 9]
    movhps      m3, [r3 -  2 * 16]  ; [18]
    movh        m4, [r3 +  7 * 16]  ; [27]
    movhps      m4, [r3 - 16 * 16]  ; [ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_8, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 28
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 13 * 16]
    movh        m3, [r3 -  8 * 16]  ; [ 5]
    movhps      m3, [r3 -  3 * 16]  ; [10]
    movh        m4, [r3 +  2 * 16]  ; [15]
    movhps      m4, [r3 +  7 * 16]  ; [20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_9, 3,5,5
    xor         r4, r4
    inc         r4
    cmp         r3m, byte 27
    mov         r3, 9
    cmove       r3, r4

    movh        m0, [r2 + r3]    ; [8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 4 * 16]
    movh        m3, [r3 -  2 * 16]  ; [ 2]
    movhps      m3, [r3 -  0 * 16]  ; [ 4]
    movh        m4, [r3 +  2 * 16]  ; [ 6]
    movhps      m4, [r3 +  4 * 16]  ; [ 8]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_10, 3,3,4
    movd        m0, [r2 + 9]            ; left[3 2 1 0] (movd loads 4 bytes)
    pshufb      m0, [pb_unpackbd1]
    pshufd      m1, m0, 1
    movhlps     m2, m0
    pshufd      m3, m0, 3
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m2
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m3
    cmp         r4m, byte 0
    jz          .quit

    ; filter
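    ; mode 10 is pure horizontal; with bFilter the first row also gets
    ; dst[0][x] = clip(left[0] + ((above[x] - topLeft) >> 1)), built by the ops below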
    pmovzxbw    m0, m0                  ; [-1 -1 -1 -1]
    movh        m1, [r2]                ; [4 3 2 1 0]
    pshufb      m2, m1, [pb_0_8]        ; [0 0 0 0]
    pshufb      m1, [pb_unpackbw1]      ; [4 3 2 1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0
.quit:
    movd        [r0], m0
    RET

INIT_XMM sse4
cglobal intra_pred_ang4_26, 3,4,3
    movd        m0, [r2 + 1]            ; above[3 2 1 0] (movd loads 4 bytes)

    ; store
    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0

    ; filter
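    ; mode 26 is pure vertical; with bFilter the first column also gets
    ; dst[y][0] = clip(above[0] + ((left[y] - topLeft) >> 1))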
    cmp         r4m, byte 0
    jz         .quit

    pshufb      m0, [pb_0_8]            ; [ 1  1  1  1]
    movh        m1, [r2 + 8]                ; [-4 -3 -2 -1 0]
    pinsrb      m1, [r2], 0
    pshufb      m2, m1, [pb_0_8]        ; [0 0 0 0]
    pshufb      m1, [pb_unpackbw1]      ; [-4 -3 -2 -1]
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r3], m0, 3
.quit:
    RET

cglobal intra_pred_ang4_11, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 25
    mov         r3, 8
    cmove       r3, r4

    movh        m0, [r2 + r3]        ; [x x x 4 3 2 1 0]
    pinsrb      m0, [r2], 0
    palignr     m1, m0, 1       ; [x x x x 4 3 2 1]
    punpcklbw   m0, m1          ; [x x x x x x x x 4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 24 * 16]

    movh        m3, [r3 +  6 * 16]  ; [24]
    movhps      m3, [r3 +  4 * 16]  ; [26]
    movh        m4, [r3 +  2 * 16]  ; [28]
    movhps      m4, [r3 +  0 * 16]  ; [30]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_12, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 24
    mov         r3, 8
    cmove       r3, r4

    movh        m0, [r2 + r3]        ; [x x x 4 3 2 1 0]
    pinsrb      m0, [r2], 0
    palignr     m1, m0, 1       ; [x x x x 4 3 2 1]
    punpcklbw   m0, m1          ; [x x x x x x x x 4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m0
    mova        m2, m0

    lea         r3, [ang_table + 20 * 16]
    movh        m3, [r3 +  7 * 16]  ; [27]
    movhps      m3, [r3 +  2 * 16]  ; [22]
    movh        m4, [r3 -  3 * 16]  ; [17]
    movhps      m4, [r3 -  8 * 16]  ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_13, 4,5,5
    xor         r4, r4
    cmp         r3m, byte 23
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m1, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m1, [r2], 1
    palignr     m0, m1, 1       ; [x x x 4 3 2 1 0]
    palignr     m2, m1, 2       ; [x x x x 4 3 2 1]
    pinsrb      m1, [r2 + r3 + 4], 0
    punpcklbw   m1, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m2          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m2, m0, m1
    punpcklqdq  m0, m0

    lea         r3, [ang_table + 21 * 16]
    movh        m3, [r3 +  2 * 16]  ; [23]
    movhps      m3, [r3 -  7 * 16]  ; [14]
    movh        m4, [r3 - 16 * 16]  ; [ 5]
    movhps      m4, [r3 +  7 * 16]  ; [28]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_14, 4,5,5
    xor         r4, r4
    cmp         r3m, byte 22
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m2, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m2, [r2], 1
    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
    pinsrb      m2, [r2 + r3 + 2], 0
    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m0
    punpcklqdq  m2, m2

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 +  0 * 16]  ; [19]
    movhps      m3, [r3 - 13 * 16]  ; [ 6]
    movh        m4, [r3 +  6 * 16]  ; [25]
    movhps      m4, [r3 -  7 * 16]  ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_15, 4,5,5
    xor         r4, r4
    cmp         r3m, byte 21
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m2, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m2, [r2], 1
    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
    pinsrb      m2, [r2 + r3 + 2], 0
    pslldq      m3, m2, 1       ; [x 4 3 2 1 0 x y]
    pinsrb      m3, [r2 + r3 + 4], 0
    punpcklbw   m4, m3, m2      ; [2 1 1 0 0 x x y]
    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m2
    punpcklqdq  m2, m4

    lea         r3, [ang_table + 23 * 16]
    movh        m3, [r3 -  8 * 16]  ; [15]
    movhps      m3, [r3 +  7 * 16]  ; [30]
    movh        m4, [r3 - 10 * 16]  ; [13]
    movhps      m4, [r3 +  5 * 16]  ; [28]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_16, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 20
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m2, [r2 + r4 - 1]    ; [x x 4 3 2 1 0 x]
    pinsrb      m2, [r2], 1
    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
    pinsrb      m2, [r2 + r3 + 2], 0
    pslldq      m3, m2, 1       ; [x 4 3 2 1 0 x y]
    pinsrb      m3, [r2 + r3 + 3], 0
    punpcklbw   m4, m3, m2      ; [2 1 1 0 0 x x y]
    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m2
    punpcklqdq  m2, m4

    lea         r3, [ang_table + 19 * 16]
    movh        m3, [r3 -  8 * 16]  ; [11]
    movhps      m3, [r3 +  3 * 16]  ; [22]
    movh        m4, [r3 - 18 * 16]  ; [ 1]
    movhps      m4, [r3 -  7 * 16]  ; [12]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_17, 3,5,5
    xor         r4, r4
    cmp         r3m, byte 19
    mov         r3, 8
    jz          .next
    xchg        r3, r4
.next:
    movh        m3, [r2 + r4 - 1]    ; [- - 4 3 2 1 0 x]
    pinsrb      m3, [r2], 1
    palignr     m0, m3, 1       ; [- - - 4 3 2 1 0]
    palignr     m1, m3, 2       ; [- - - - 4 3 2 1]
    mova        m4, m0
    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
    pinsrb      m3, [r2 + r3 + 1], 0
    punpcklbw   m1, m3, m4      ; [3 2 2 1 1 0 0 x]
    punpcklqdq  m0, m1

    pslldq      m2, m3, 1       ; [- 4 3 2 1 0 x y]
    pinsrb      m2, [r2 + r3 + 2], 0
    pslldq      m1, m2, 1       ; [4 3 2 1 0 x y z]
    pinsrb      m1, [r2 + r3 + 4], 0
    punpcklbw   m1, m2          ; [1 0 0 x x y y z]
    punpcklbw   m2, m3          ; [2 1 1 0 0 x x y]
    punpcklqdq  m2, m1

    lea         r3, [ang_table + 14 * 16]
    movh        m3, [r3 -  8 * 16]  ; [ 6]
    movhps      m3, [r3 -  2 * 16]  ; [12]
    movh        m4, [r3 +  4 * 16]  ; [18]
    movhps      m4, [r3 + 10 * 16]  ; [24]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_18, 3,5,1
    mov         r4d, [r2 + 8]
    mov         r3b, byte [r2]
    mov         [r2 + 8], r3b
    mov         r3d, [r2 + 8]
    bswap       r3d
    movd        m0, r3d

    pinsrd      m0, [r2 + 1], 1     ; [- 3 2 1 0 -1 -2 -3]
    lea         r3, [r1 * 3]
    movd        [r0 + r3], m0
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0
    psrldq      m0, 1
    movd        [r0 + r1], m0
    psrldq      m0, 1
    movd        [r0], m0
    mov         [r2 + 8], r4w
    RET

;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
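; NOTE: the 8x8 angular kernels follow the 4x4 scheme: pmaddubsw with (32-f, f) pairs
; from ang_table, then pmulhrsw with pw_1024 for the (v + 16) >> 5 rounding.  Each
; pmaddubsw produces the eight results of one row; packuswb packs two rows per register
; before the store/transpose.  Modes 4..9 jump into mode 3's .transpose8x8 tail via
; mangle(); as in the 4x4 case, ZF from the mode compare decides whether the 8x8
; transpose is skipped.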
INIT_XMM ssse3
cglobal intra_pred_ang8_2, 3,5,2
    lea         r4,             [r2 + 2]
    add         r2,             18
    cmp         r3m,            byte 34
    cmove       r2,             r4
    movu        m0,             [r2]
    lea         r4,             [r1 * 3]

    movh        [r0],           m0
    palignr     m1,             m0, 1
    movh        [r0 + r1],      m1
    palignr     m1,             m0, 2
    movh        [r0 + r1 * 2],  m1
    palignr     m1,             m0, 3
    movh        [r0 + r4],      m1
    palignr     m1,             m0, 4
    lea         r0,             [r0 + r1 * 4]
    movh        [r0],           m1
    palignr     m1,             m0, 5
    movh        [r0 + r1],      m1
    palignr     m1,             m0, 6
    movh        [r0 + r1 * 2],  m1
    palignr     m1,             m0, 7
    movh        [r0 + r4],      m1
    RET

INIT_XMM sse4
cglobal intra_pred_ang8_3, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 33
    cmove       r2,        r4
    lea         r3,        [ang_table + 22 * 16]
    lea         r4,        [ang_table +  8 * 16]
    mova        m3,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m4,        m0, [r3 + 4 * 16]          ; [26]
    pmulhrsw    m4,        m3
    pmaddubsw   m1,        [r3 - 2 * 16]              ; [20]
    pmulhrsw    m1,        m3
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m5,        [r3 - 8 * 16]              ; [14]
    pmulhrsw    m5,        m3

    palignr     m6,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    pmaddubsw   m6,        [r4]                       ; [ 8]
    pmulhrsw    m6,        m3
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 8                  ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

    pmaddubsw   m6,        m1, [r4 - 6 * 16]          ; [ 2]
    pmulhrsw    m6,        m3

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [28]
    pmulhrsw    m1,        m3
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10                 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]

    pmaddubsw   m1,        [r3]                       ; [22]
    pmulhrsw    m1,        m3

    palignr     m2,        m0, 12                     ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]

    pmaddubsw   m2,        [r3 - 6 * 16]              ; [16]
    pmulhrsw    m2,        m3
    packuswb    m1,        m2
    jmp         .transpose8x8

ALIGN 16
.transpose8x8:
    jz         .store

    ; transpose 8x8
    punpckhbw   m0,        m4, m5
    punpcklbw   m4,        m5
    punpckhbw   m2,        m4, m0
    punpcklbw   m4,        m0

    punpckhbw   m0,        m6, m1
    punpcklbw   m6,        m1
    punpckhbw   m1,        m6, m0
    punpcklbw   m6,        m0

    punpckhdq   m5,        m4, m6
    punpckldq   m4,        m6
    punpckldq   m6,        m2, m1
    punpckhdq   m2,        m1
    mova        m1,        m2

.store:
    lea         r4,              [r1 * 3]
    movh        [r0],            m4
    movhps      [r0 + r1],       m4
    movh        [r0 + r1 * 2],   m5
    movhps      [r0 + r4],       m5
    add         r0,              r4
    movh        [r0 + r1],       m6
    movhps      [r0 + r1 * 2],   m6
    movh        [r0 + r4],       m1
    movhps      [r0 + r1 * 4],   m1
    RET

cglobal intra_pred_ang8_4, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 32
    cmove       r2,        r4
    lea         r3,        [ang_table + 24 * 16]
    lea         r4,        [ang_table + 10 * 16]
    mova        m3,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3 - 3 * 16]          ; [21]
    pmulhrsw    m4,        m3
    pmaddubsw   m1,        [r4]                       ; [10]
    pmulhrsw    m1,        m3
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 7 * 16]              ; [31]
    pmulhrsw    m5,        m3

    palignr     m6,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m6,        [r3 - 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m3
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    pmaddubsw   m6,        m1, [r4 - 1 * 16]          ; [ 9]
    pmulhrsw    m6,        m3

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [30]
    pmulhrsw    m1,        m3
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 8                  ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

    pmaddubsw   m1,        [r3 - 5 * 16]              ; [19]
    pmulhrsw    m1,        m3

    palignr     m2,        m0, 10                     ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]

    pmaddubsw   m2,        [r4 - 2 * 16]              ; [8]
    pmulhrsw    m2,        m3
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_5, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 31
    cmove       r2,        r4
    lea         r3,        [ang_table + 17 * 16]
    lea         r4,        [ang_table +  2 * 16]
    mova        m3,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3]                   ; [17]
    pmulhrsw    m4,        m3
    pmaddubsw   m1,        [r4]                       ; [2]
    pmulhrsw    m1,        m3
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 2 * 16]              ; [19]
    pmulhrsw    m5,        m3

    palignr     m6,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    mova        m1,        m6

    pmaddubsw   m1,        [r4 + 2 * 16]              ; [4]
    pmulhrsw    m1,        m3
    packuswb    m5,        m1

    pmaddubsw   m6,        [r3 + 4 * 16]              ; [21]
    pmulhrsw    m6,        m3

    palignr     m1,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    mova        m7,        m1
    pmaddubsw   m7,        [r4 + 4 * 16]              ; [6]
    pmulhrsw    m7,        m3
    packuswb    m6,        m7

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [23]
    pmulhrsw    m1,        m3

    palignr     m2,        m0, 8                      ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

    pmaddubsw   m2,        [r4 + 6 * 16]              ; [8]
    pmulhrsw    m2,        m3
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_6, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 30
    cmove       r2,        r4
    lea         r3,        [ang_table + 20 * 16]
    lea         r4,        [ang_table +  8 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    mova        m1,        m0

    pmaddubsw   m4,        m0, [r3 - 7 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m5,        m6, [r4 - 1 * 16]          ; [7]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        [r3]                       ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m6,        m1, [r4 - 7 * 16]          ; [1]
    pmulhrsw    m6,        m7

    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 6 * 16]              ; [14]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [27]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 6                      ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

    pmaddubsw   m2,        [r4]                       ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_7, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 29
    cmove       r2,        r4
    lea         r3,        [ang_table + 24 * 16]
    lea         r4,        [ang_table +  6 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m0, [r4 + 3 * 16]          ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 + 3 * 16]          ; [27]
    pmulhrsw    m5,        m7

    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m6,        m1, [r4 - 2 * 16]          ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r4 + 7 * 16]          ; [13]
    pmulhrsw    m6,        m7

    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 2 * 16]              ; [22]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [31]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 4                      ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

    pmaddubsw   m2,        [r4 + 2 * 16]              ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_8, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 28
    cmove       r2,        r4
    lea         r3,        [ang_table + 23 * 16]
    lea         r4,        [ang_table +  8 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m2,        m0, 2                      ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pmaddubsw   m4,        m0, [r4 - 3 * 16]          ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r4 + 2 * 16]          ; [10]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 - 8 * 16]          ; [15]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        m0, [r3 - 3 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 + 2 * 16]          ; [25]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        [r3 + 7 * 16]              ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m2, [r4 - 5 * 16]          ; [3]
    pmulhrsw    m1,        m7

    pmaddubsw   m2,        [r4]                       ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_9, 3,5,8
    lea         r4,        [r2 + 1]
    add         r2,        17
    cmp         r3m,       byte 27
    cmove       r2,        r4
    lea         r3,        [ang_table + 10 * 16]
    mova        m7,        [pw_1024]

    movu        m0,        [r2]                       ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m0, [r3 - 8 * 16]          ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]          ; [4]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 - 4 * 16]          ; [6]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        m0, [r3 - 2 * 16]          ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3]                   ; [10]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        m0, [r3 + 2 * 16]          ; [12]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        m0, [r3 + 4 * 16]          ; [14]
    pmulhrsw    m1,        m7

    pmaddubsw   m0,        [r3 + 6 * 16]              ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

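; mode 10 (pure horizontal): every output row is one left-neighbour sample ([r2 + 17 + y])
; broadcast across the row via pb_unpackbq.  When bFilter (r4m) is set, row 0 is also
; smoothed with the above row:  dst[0][x] = clip(left[0] + ((above[x] - topLeft) >> 1)),
; with packuswb supplying the 0..255 clip.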
cglobal intra_pred_ang8_10, 3,6,5
    movh        m0,        [r2 + 17]
    mova        m4,        [pb_unpackbq]
    palignr     m1,        m0, 2
    pshufb      m1,        m4
    palignr     m2,        m0, 4
    pshufb      m2,        m4
    palignr     m3,        m0, 6
    pshufb      m3,        m4
    pshufb      m0,        m4

    lea         r5,             [r1 * 3]
    movhps      [r0 + r1],      m0
    movh        [r0 + r1 * 2],  m1
    movhps      [r0 + r5],      m1
    lea         r3,             [r0 + r1 * 4]
    movh        [r3],           m2
    movhps      [r3 + r1],      m2
    movh        [r3 + r1 * 2],  m3
    movhps      [r3 + r5],      m3

; filter: adjust dst row 0 when bFilter (r4m) is set
    cmp         r4m, byte 0
    jz         .quit

    pmovzxbw    m0,        m0
    movu        m1,        [r2]
    palignr     m2,        m1, 1
    pshufb      m1,        m4
    pmovzxbw    m1,        m1
    pmovzxbw    m2,        m2
    psubw       m2,        m1
    psraw       m2,        1
    paddw       m0,        m2
    packuswb    m0,        m0

.quit:
    movh        [r0],      m0
    RET

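; mode 26 (pure vertical): every output row is a copy of the above row ([r2 + 1]).  When
; bFilter (r4m) is set, column 0 is smoothed with the left column:
;     dst[y][0] = clip(above[0] + ((left[y] - topLeft) >> 1))
; and written back one byte per row with pextrb (packuswb supplies the 0..255 clip).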
cglobal intra_pred_ang8_26, 3,6,3
    movu        m2,             [r2]
    palignr     m0,             m2, 1
    lea         r5,             [r1 * 3]
    movh        [r0],           m0
    movh        [r0 + r1],      m0
    movh        [r0 + r1 * 2],  m0
    movh        [r0 + r5],      m0
    lea         r3,             [r0 + r1 * 4]
    movh        [r3],           m0
    movh        [r3 + r1],      m0
    movh        [r3 + r1 * 2],  m0
    movh        [r3 + r5],      m0

; filter: adjust dst column 0 when bFilter (r4m) is set
    cmp         r4m, byte 0
    jz         .quit

    pshufb      m2,        [pb_unpackbq]
    movhlps     m1,        m2
    pmovzxbw    m2,        m2
    movu        m0,        [r2 + 17]
    pmovzxbw    m1,        m1
    pmovzxbw    m0,        m0
    psubw       m0,        m2
    psraw       m0,        1
    paddw       m1,        m0
    packuswb    m1,        m1
    pextrb      [r0],          m1, 0
    pextrb      [r0 + r1],     m1, 1
    pextrb      [r0 + r1 * 2], m1, 2
    pextrb      [r0 + r5],     m1, 3
    pextrb      [r3],          m1, 4
    pextrb      [r3 + r1],     m1, 5
    pextrb      [r3 + r1 * 2], m1, 6
    pextrb      [r3 + r5],     m1, 7
.quit:
    RET

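; modes 11-17 (and their mirrors 25-19) use negative prediction angles, so the reference
; has to be extended below index 0 with samples projected from the other edge.  Mode 11
; only needs the top-left corner (the single pinsrb); modes 12-17 splice in progressively
; more projected samples before running the usual weighted interpolation.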
cglobal intra_pred_ang8_11, 3,5,8
    xor         r4,        r4
    cmp         r3m,       byte 25
    mov         r3,        16
    cmove       r3,        r4

    movu        m0,        [r2 + r3]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m0,        [r2], 0
    palignr     m1,        m0, 1                      ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]

    punpcklbw   m0,        m1                         ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r3,        [ang_table + 23 * 16]
    mova        m7,        [pw_1024]

    pmaddubsw   m4,        m0, [r3 + 7 * 16]          ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 + 5 * 16]          ; [28]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        m0, [r3 + 3 * 16]          ; [26]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        m0, [r3 + 1 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 - 1 * 16]          ; [22]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        m0, [r3 - 3 * 16]          ; [20]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        m0, [r3 - 5 * 16]          ; [18]
    pmulhrsw    m1,        m7

    pmaddubsw   m0,        [r3 - 7 * 16]              ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_12, 3,5,8
    xor         r4,        r4
    cmp         r3m,       byte 24
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    pslldq      m0,        m1, 1                      ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    pinsrb      m0,        [r2 + r3 + 6], 0

    lea         r4,        [ang_table + 22 * 16]
    mova        m7,        [pw_1024]

    punpckhbw   m2,        m0, m1                     ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
    punpcklbw   m0,        m1                         ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m2,        m0, 2                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m2, [r4 + 5 * 16]          ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4]                   ; [22]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m1,        m0, [r4 + 7 * 16]          ; [29]
    pmulhrsw    m1,        m7

    pmaddubsw   m0,        [r4 + 2 * 16]              ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    pmaddubsw   m5,        m2, [r4 - 5 * 16]          ; [17]
    pmulhrsw    m5,        m7

    lea         r4,        [ang_table + 7 * 16]
    pmaddubsw   m6,        m2, [r4 + 5 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m2, [r4]                   ; [7]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        [r4 - 5 * 16]              ; [2]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_13, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 23
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 +  r4]                 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    pslldq      m1,        1                          ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    pinsrb      m1,        [r2 + r3 + 4], 0
    pslldq      m0,        m1, 1                      ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
    pinsrb      m0,        [r2 + r3 + 7], 0
    punpckhbw   m5,        m0, m1                     ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
    punpcklbw   m0,        m1                         ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m1,        m5, m0, 2                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m5,        m0, 4                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 24 * 16]
    mova        m7,        [pw_1024]

    pmaddubsw   m4,        m5, [r4 - 1 * 16]          ; [23]
    pmulhrsw    m4,        m7

    pmaddubsw   m6,        m1, [r4 + 4 * 16]          ; [28]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        [r4]                       ; [24]
    pmulhrsw    m0,        m7

    lea         r4,        [ang_table + 13 * 16]
    pmaddubsw   m3,        m5, [r4 + 1 * 16]          ; [14]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3

    pmaddubsw   m5,        [r4 - 8 * 16]              ; [5]
    pmulhrsw    m5,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r4 + 6 * 16]          ; [19]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        m1, [r4 - 3 * 16]          ; [10]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 - 12 * 16]             ; [1]
    pmulhrsw    m1,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_14, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 22
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4 - 2]              ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
    pinsrb      m1,        [r2], 2
    pinsrb      m1,        [r2 + r3 + 2], 1
    pinsrb      m1,        [r2 + r3 + 5], 0
    pslldq      m0,        m1, 1                      ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
    pinsrb      m0,        [r2 + r3 + 7], 0
    punpckhbw   m2,        m0, m1                     ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
    punpcklbw   m0,        m1                         ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m1,        m2, m0, 2                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m6,        m2, m0, 4                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m2,        m0, 6                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 24 * 16]
    mova        m3,        [pw_1024]

    pmaddubsw   m4,        m2, [r4 - 5 * 16]          ; [19]
    pmulhrsw    m4,        m3

    pmaddubsw   m0,        [r4]                       ; [24]
    pmulhrsw    m0,        m3

    pmaddubsw   m5,        m6, [r4 + 1 * 16]          ; [25]
    pmulhrsw    m5,        m3

    lea         r4,        [ang_table + 12 * 16]
    pmaddubsw   m6,        [r4]                       ; [12]
    pmulhrsw    m6,        m3
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r4 + 19 * 16]         ; [31]
    pmulhrsw    m6,        m3

    pmaddubsw   m2,        [r4 - 6 * 16]              ; [6]
    pmulhrsw    m2,        m3
    packuswb    m4,        m2

    pmaddubsw   m2,        m1, [r4 + 6 * 16]          ; [18]
    pmulhrsw    m2,        m3
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 - 7 * 16]              ; [5]
    pmulhrsw    m1,        m3
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

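; modes 15-17 need several projected samples, so most of them are gathered in a single
; pshufb through the c_mode16_15/16/17 tables instead of a chain of pinsrb.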
cglobal intra_pred_ang8_15, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 21
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    movu        m2,        [r2 + r3]
    pshufb      m2,        [c_mode16_15]
    palignr     m1,        m2, 13                     ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
    pslldq      m0,        m1, 1                      ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
    pinsrb      m0,        [r2 + r3 + 8], 0
    punpckhbw   m4,        m0, m1                     ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    punpcklbw   m0,        m1                         ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    palignr     m1,        m4, m0, 2                  ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m6,        m4, m0, 4                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m5,        m4, m0, 6                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4,        m0, 8                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 23 * 16]
    mova        m3,        [pw_1024]

    pmaddubsw   m4,        [r4 - 8 * 16]              ; [15]
    pmulhrsw    m4,        m3

    pmaddubsw   m2,        m5, [r4 + 7 * 16]          ; [30]
    pmulhrsw    m2,        m3
    packuswb    m4,        m2

    pmaddubsw   m5,        [r4 - 10 * 16]             ; [13]
    pmulhrsw    m5,        m3

    pmaddubsw   m2,        m6, [r4 + 5 * 16]          ; [28]
    pmulhrsw    m2,        m3
    packuswb    m5,        m2

    pmaddubsw   m2,        m1, [r4 + 3 * 16]          ; [26]
    pmulhrsw    m2,        m3

    pmaddubsw   m0,        [r4 + 1 * 16]              ; [24]
    pmulhrsw    m0,        m3

    lea         r4,        [ang_table + 11 * 16]
    pmaddubsw   m6,        [r4]                       ; [11]
    pmulhrsw    m6,        m3
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 - 2 * 16]              ; [9]
    pmulhrsw    m1,        m3
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_16, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 20
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m1,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m1,        [r2], 0
    movu        m2,        [r2 + r3]
    pshufb      m2,        [c_mode16_16]
    palignr     m1,        m2, 12                     ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
    pslldq      m0,        m1, 1                      ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
    pinsrb      m0,        [r2 + r3 + 8], 0
    punpckhbw   m4,        m0, m1                     ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    punpcklbw   m0,        m1                         ; [3 2 2 1 1 0 0 a a b b c c d d e]
    palignr     m1,        m4, m0, 2                  ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    palignr     m6,        m4, m0, 4                  ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m2,        m4, m0, 6                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m5,        m4, m0, 8                  ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4,        m0, 10                     ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 22 * 16]
    mova        m7,        [pw_1024]

    pmaddubsw   m3,        m5, [r4]                   ; [22]
    pmulhrsw    m3,        m7

    pmaddubsw   m0,        [r4 + 2 * 16]              ; [24]
    pmulhrsw    m0,        m7

    lea         r4,        [ang_table + 9 * 16]

    pmaddubsw   m4,        [r4 + 2 * 16]              ; [11]
    pmulhrsw    m4,        m7
    packuswb    m4,        m3

    pmaddubsw   m2,        [r4 + 3 * 16]              ; [12]
    pmulhrsw    m2,        m7

    pmaddubsw   m5,        [r4 - 8 * 16]              ; [1]
    pmulhrsw    m5,        m7
    packuswb    m5,        m2

    mova        m2,        m6
    pmaddubsw   m6,        [r4 + 14 * 16]             ; [23]
    pmulhrsw    m6,        m7

    pmaddubsw   m2,        [r4 -  7 * 16]             ; [2]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    pmaddubsw   m1,        [r4 + 4 * 16]              ; [13]
    pmulhrsw    m1,        m7
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_17, 4,5,8
    xor         r4,        r4
    cmp         r3m,       byte 19
    mov         r3,        16
    jz          .next
    xchg        r3,        r4
.next:

    movu        m2,        [r2 + r4]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m2,        [r2], 0
    movu        m1,        [r2 + r3]
    pshufb      m1,        [c_mode16_17]
    palignr     m2,        m1, 11                     ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
    pslldq      m0,        m2, 1                      ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
    pinsrb      m0,        [r2 + r3 + 7], 0
    punpckhbw   m1,        m0, m2                     ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    punpcklbw   m0,        m2                         ; [2 1 1 0 0 a a b b c c d d e e f]

    palignr     m5,        m1, m0, 8                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m2,        m1, m0, 10                 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4,        m1, m0, 12                 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    lea         r4,        [ang_table + 17 * 16]
    mova        m3,        [pw_1024]

    pmaddubsw   m2,        [r4 - 5 * 16]              ; [12]
    pmulhrsw    m2,        m3

    pmaddubsw   m4,        [r4 - 11 * 16]             ; [6]
    pmulhrsw    m4,        m3
    packuswb    m4,        m2

    pmaddubsw   m5,        [r4 + 1 * 16]              ; [18]
    pmulhrsw    m5,        m3

    palignr     m2,        m1, m0, 6                  ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    pmaddubsw   m2,        [r4 + 7 * 16]              ; [24]
    pmulhrsw    m2,        m3
    packuswb    m5,        m2

    palignr     m6,        m1, m0, 4                  ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    mova        m2,        m6
    pmaddubsw   m6,        [r4 + 13 * 16]             ; [30]
    pmulhrsw    m6,        m3

    pmaddubsw   m2,        [r4 - 13 * 16]             ; [4]
    pmulhrsw    m2,        m3
    packuswb    m6,        m2

    palignr     m1,        m0, 2                      ; [3 2 2 1 1 0 0 a a b b c c d d e]
    pmaddubsw   m1,        [r4 - 7 * 16]              ; [10]
    pmulhrsw    m1,        m3

    pmaddubsw   m0,        [r4 - 1 * 16]              ; [16]
    pmulhrsw    m0,        m3
    packuswb    m1,        m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

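; mode 18 (45-degree diagonal): no interpolation needed.  A 16-byte buffer
; [left[6] .. left[0], topLeft, above[0] .. above[7]] is assembled with pb_swap8 + movhps,
; and each output row is an 8-byte window of it, advanced one byte per row with psrldq.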
cglobal intra_pred_ang8_18, 4,4,1
    movu        m0, [r2 + 16]
    pinsrb      m0, [r2], 0
    pshufb      m0, [pb_swap8]
    movhps      m0, [r2 + 1]
    lea         r2, [r0 + r1 * 4]
    lea         r3, [r1 * 3]
    movh        [r2 + r3], m0
    psrldq      m0, 1
    movh        [r2 + r1 * 2], m0
    psrldq      m0, 1
    movh        [r2 + r1], m0
    psrldq      m0, 1
    movh        [r2], m0
    psrldq      m0, 1
    movh        [r0 + r3], m0
    psrldq      m0, 1
    movh        [r0 + r1 * 2], m0
    psrldq      m0, 1
    movh        [r0 + r1], m0
    psrldq      m0, 1
    movh        [r0], m0
    RET

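; TRANSPOSE_STORE_8x8 writes one 8x8 sub-block from four registers (%3-%6) that each hold
; two packed rows.  With %2 = 1 the block is transposed before storing (the horizontal-
; family modes compute it column-wise); this path clobbers m0, picks the 8-column half
; with %1 and writes through r0/r6 using r1 and r5 = 3 * stride.  With %2 = 0 the rows are
; stored directly and r0 is advanced by eight rows.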
%macro TRANSPOSE_STORE_8x8 6
  %if %2 == 1
    ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
    punpckhbw   m0,        %3, %4
    punpcklbw   %3,        %4
    punpckhbw   %4,        %3, m0
    punpcklbw   %3,        m0

    punpckhbw   m0,        %5, %6
    punpcklbw   %5,        %6
    punpckhbw   %6,        %5, m0
    punpcklbw   %5,        m0

    punpckhdq   m0,        %3, %5
    punpckldq   %3,        %5
    punpckldq   %5,        %4, %6
    punpckhdq   %4,        %6

    movh        [r0         + %1 * 8], %3
    movhps      [r0 +  r1   + %1 * 8], %3
    movh        [r0 +  r1*2 + %1 * 8], m0
    movhps      [r0 +  r5   + %1 * 8], m0
    movh        [r6         + %1 * 8], %5
    movhps      [r6 +  r1   + %1 * 8], %5
    movh        [r6 +  r1*2 + %1 * 8], %4
    movhps      [r6 +  r5   + %1 * 8], %4
  %else
    ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
    movh        [r0         ], %3
    movhps      [r0 + r1    ], %3
    movh        [r0 + r1 * 2], %4
    movhps      [r0 + r5    ], %4
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], %5
    movhps      [r0 + r1    ], %5
    movh        [r0 + r1 * 2], %6
    movhps      [r0 + r5    ], %6
    lea         r0, [r0 + r1 * 4]
  %endif
%endmacro

;------------------------------------------------------------------------------------------
; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
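; The 16x16 kernels come in mirrored pairs (3/33, 4/32, 5/31, 6/30, 7/29, 8/28, 9/27) that
; share the same fractional weights: the horizontal-family member adds 32 to r2 to reach
; the left reference and stores through the transposing path of TRANSPOSE_STORE_8x8, while
; the vertical-family member reads the above reference and stores rows directly.  Each
; interpolating kernel runs its body twice (r4d = 2), emitting two 8x8 sub-blocks per pass,
; i.e. four per 16x16 block.  Modes 2/34 are plain shifted copies of the reference and need
; no arithmetic.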
INIT_XMM ssse3
cglobal intra_pred_ang16_2, 3,5,3
    lea             r4, [r2 + 2]
    add             r2, 34
    cmp             r3m, byte 34
    cmove           r2, r4
    movu            m0, [r2]
    movu            m1, [r2 + 16]
    movu            [r0], m0
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 2
    movu            [r0], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 4
    movu            [r0], m2
    palignr         m2, m1, m0, 5
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 6
    movu            [r0], m2
    palignr         m2, m1, m0, 7
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 8
    movu            [r0], m2
    palignr         m2, m1, m0, 9
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 10
    movu            [r0], m2
    palignr         m2, m1, m0, 11
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 12
    movu            [r0], m2
    palignr         m2, m1, m0, 13
    movu            [r0 + r1], m2
    lea             r0, [r0 + r1 * 2]
    palignr         m2, m1, m0, 14
    movu            [r0], m2
    palignr         m2, m1, m0, 15
    movu            [r0 + r1], m2
    RET

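; In the interpolating 16x16 kernels the reference is expanded once into adjacent byte
; pairs (punpcklbw/punpckhbw); each palignr by 2 then slides the pair window by one whole
; sample, applying the integer part of the prediction angle while the ang_table row
; supplies the fractional part.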
INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4

    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 8

    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 12

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movhps      m1,        [r2 + 14]                  ; [00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

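; mode 33 is the above-reference mirror of mode 3: identical weights, but r2 stays on the
; above array, TRANSPOSE_STORE_8x8 is called with %2 = 0, and the final (fraction 0) row
; is stored directly instead of being packed for the transpose.  The remaining mirrored
; pairs below follow the same pattern.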
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4

    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 8

    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 12

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2

    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 10

    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movh        m2,        [r2 + 14]                  ; [00]

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 10

    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
    pmulhrsw    m4,        m7

    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4
    mova        m6,        m5

    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 +      16]             ; [17]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8
    palignr     m2,        m0, 10

    pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1

    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7

    palignr     m6,        m2, m0, 4

    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m0, 6

    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
    pmulhrsw    m6,        m7

    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    palignr     m1,        m2, m0, 8

    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7

    palignr     m2,        m0, 10

    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
    pmulhrsw    m4,        m7

    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1

    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2

    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m5,        m2, m0, 4
    mova        m6,        m5

    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
    pmulhrsw    m5,        m7

    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m6,        m2, m0, 6

    pmaddubsw   m6,        [r3 +      16]             ; [17]
    pmulhrsw    m6,        m7

    palignr     m1,        m2, m0, 8
    palignr     m2,        m0, 10

    pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7

    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1,        [r2 + 2]                   ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    palignr     m5,        m2, m3, 2

    pmaddubsw   m4,        m3, [r3 +      16]         ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 4

    pmaddubsw   m5,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 6

    pmaddubsw   m6,        [r3 + 5 * 16]              ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 8

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m4,        m2, m3, 8
    palignr     m5,        m2, m3, 10

    pmaddubsw   m4,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 12

    pmaddubsw   m5,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 14

    pmaddubsw   m6,        [r3 + 13 * 16]             ; [29]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1,        [r2 + 2]                   ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    palignr     m5,        m2, m3, 2

    pmaddubsw   m4,        m3, [r3 +      16]         ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 4

    pmaddubsw   m5,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 6

    pmaddubsw   m6,        [r3 + 5 * 16]              ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 8

    pmaddubsw   m1,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m4,        m2, m3, 8
    palignr     m5,        m2, m3, 10

    pmaddubsw   m4,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 12

    pmaddubsw   m5,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        m6, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1

    palignr     m1,        m2, m3, 14

    pmaddubsw   m6,        [r3 + 13 * 16]             ; [29]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 2

    pmaddubsw   m5,        m6, [r3 - 9 * 16]          ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m3, 4

    pmaddubsw   m6,        m1, [r3 - 15 * 16]         ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 6

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m4,        m2, m3, 6
    palignr     m6,        m2, m3, 8

    pmaddubsw   m4,        [r3 +  5 * 16]             ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m6, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m6, [r3 - 16]              ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m0,        m2, m3, 10

    pmaddubsw   m6,        m0, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 12

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m6,        m2, m3, 2

    pmaddubsw   m5,        m6, [r3 - 9 * 16]          ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m1,        m2, m3, 4

    pmaddubsw   m6,        m1, [r3 - 15 * 16]         ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 6

    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m4,        m2, m3, 6
    palignr     m6,        m2, m3, 8

    pmaddubsw   m4,        [r3 +  5 * 16]             ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m6, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m6, [r3 - 16]              ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m0,        m2, m3, 10

    pmaddubsw   m6,        m0, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 12

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]       ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    palignr     m1,        m2, m3, 2

    pmaddubsw   m5,        m3, [r3 + 11 * 16]         ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 4

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    palignr     m1,        m2, m3, 4

    pmaddubsw   m4,        m1, [r3 + 16]              ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m0,        m2, m3, 6

    pmaddubsw   m5,        m0, [r3 - 13 * 16]         ; [03]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 8

    pmaddubsw   m1,        m2, [r3 - 9 * 16]          ; [07]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m3, m1                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3,        m1                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    palignr     m1,        m2, m3, 2

    pmaddubsw   m5,        m3, [r3 + 11 * 16]         ; [27]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m0,        m2, m3, 4

    pmaddubsw   m1,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        [r3 - 8 * 16]              ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    palignr     m1,        m2, m3, 4

    pmaddubsw   m4,        m1, [r3 + 16]              ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    palignr     m0,        m2, m3, 6

    pmaddubsw   m5,        m0, [r3 - 13 * 16]         ; [03]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m2,        m3, 8

    pmaddubsw   m1,        m2, [r3 - 9 * 16]          ; [07]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m1,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3,        m1, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0,        m1, m3                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1,        m3                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m1, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m2,        m1, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m2,        m7
    packuswb    m4,        m2

    pmaddubsw   m5,        m1, [r3 - 1 * 16]          ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 + 9 * 16]          ; [25]
    pmulhrsw    m6,        m7
    pmaddubsw   m2,        m1, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    palignr     m2,        m0, m1, 2
    palignr     m3,        m0, m1, 4

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m2, [r3 + 7 * 16]          ; [23]
    pmulhrsw    m5,        m7
    pmaddubsw   m2,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2

    pmaddubsw   m6,        m3, [r3 - 15 * 16]         ; [01]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 5 * 16]          ; [11]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r3]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m1,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3,        m1, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0,        m1, m3                     ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1,        m3                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m1, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m2,        m1, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m2,        m7
    packuswb    m4,        m2

    pmaddubsw   m5,        m1, [r3 - 1 * 16]          ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m1, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m1, [r3 + 9 * 16]          ; [25]
    pmulhrsw    m6,        m7
    pmaddubsw   m2,        m1, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2

    palignr     m2,        m0, m1, 2
    palignr     m3,        m0, m1, 4

    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m2, [r3 + 7 * 16]          ; [23]
    pmulhrsw    m5,        m7
    pmaddubsw   m2,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2

    pmaddubsw   m6,        m3, [r3 - 15 * 16]         ; [01]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 5 * 16]          ; [11]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r3]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8
    add         r2,        32
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

.loop:
    movu        m2,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3,        m2, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m2,        m3                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m2, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m2, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m2, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m2, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m2, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m2, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m2, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m2, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    punpcklqdq  m1,        m3                         ; [00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

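; 16x16 angular mode 27: same [2]..[30] ladder as mode 9, fed from the first
; half of the reference buffer and emitted in two 8-column passes; the last
; eight rows of each pass are stored directly with movh/movhps, the final row
; being the shifted reference itself.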
INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2 + 1]                   ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m2,        m3, 1                      ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m3,        m2                         ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4,        m3, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

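; 16x16 mode 10 (horizontal): rows 1..15 are pshufb broadcasts of consecutive
; bytes from [r2 + 1 + 32]; when the fifth argument (the bFilter flag) is
; non-zero, row 0 is additionally smoothed by adding half the difference
; between the other reference edge ([r2 + 1]) and the corner byte at [r2].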
INIT_XMM sse4
cglobal intra_pred_ang16_10, 5,6,8
    lea         r5,        [r1 * 3]
    pxor        m7,        m7

    movu        m0,        [r2 + 1 + 32]
    palignr     m1,        m0, 1
    pshufb      m1,        m7
    palignr     m2,        m0, 2
    pshufb      m2,        m7
    palignr     m3,        m0, 3
    pshufb      m3,        m7
    palignr     m4,        m0, 4
    pshufb      m4,        m7
    palignr     m5,        m0, 5
    pshufb      m5,        m7
    palignr     m6,        m0, 6
    pshufb      m6,        m7

    movu        [r0 + r1],      m1
    movu        [r0 + r1 * 2],  m2
    movu        [r0 + r5],      m3
    lea         r3,             [r0 + r1 * 4]
    movu        [r3],           m4
    movu        [r3 + r1],      m5
    movu        [r3 + r1 * 2],  m6

    palignr     m1,        m0, 7
    pshufb      m1,        m7
    movhlps     m2,        m0
    pshufb      m2,        m7
    palignr     m3,        m0, 9
    pshufb      m3,        m7
    palignr     m4,        m0, 10
    pshufb      m4,        m7
    palignr     m5,        m0, 11
    pshufb      m5,        m7
    palignr     m6,        m0, 12
    pshufb      m6,        m7

    movu        [r3 + r5],      m1
    lea         r3,             [r3 + r1 * 4]
    movu        [r3],           m2
    movu        [r3 + r1],      m3
    movu        [r3 + r1 * 2],  m4
    movu        [r3 + r5],      m5
    lea         r3,             [r3 + r1 * 4]
    movu        [r3],           m6

    palignr     m1,        m0, 13
    pshufb      m1,        m7
    palignr     m2,        m0, 14
    pshufb      m2,        m7
    palignr     m3,        m0, 15
    pshufb      m3,        m7
    pshufb      m0,        m7

    movu        [r3 + r1],      m1
    movu        [r3 + r1 * 2],  m2
    movu        [r3 + r5],      m3

; filter
    cmp         r4w, byte 0
    jz         .quit
    pmovzxbw    m0,        m0
    mova        m1,        m0
    movu        m2,        [r2]
    movu        m3,        [r2 + 1]

    pshufb      m2,        m7
    pmovzxbw    m2,        m2
    movhlps     m4,        m3
    pmovzxbw    m3,        m3
    pmovzxbw    m4,        m4
    psubw       m3,        m2
    psubw       m4,        m2
    psraw       m3,        1
    psraw       m4,        1
    paddw       m0,        m3
    paddw       m1,        m4
    packuswb    m0,        m1
.quit:
    movu        [r0],      m0
    RET

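; 16x16 mode 26 (vertical): all sixteen rows are a straight copy of the same
; sixteen reference bytes at [r2 + 1]; on x86-64 the bFilter flag is read from
; the fifth argument register, otherwise it is parked on the stack.  When the
; flag is set, the first column is corrected byte-by-byte with pextrb using
; the [r2 + 32] edge.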
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_ang16_26, 3,8,5
    mov     r7, r4mp
    %define bfilter r7w
%else
cglobal intra_pred_ang16_26, 5,7,5,0-4
    %define bfilter dword[rsp]
    mov     bfilter, r4
%endif
    movu        m0,             [r2 + 1]

    lea         r4,             [r1 * 3]
    lea         r3,             [r0 + r1 * 4]
    lea         r5,             [r3 + r1 * 4]
    lea         r6,             [r5 + r1 * 4]

    movu        [r0],           m0
    movu        [r0 + r1],      m0
    movu        [r0 + r1 * 2],  m0
    movu        [r0 + r4],      m0
    movu        [r3],           m0
    movu        [r3 + r1],      m0
    movu        [r3 + r1 * 2],  m0
    movu        [r3 + r4],      m0
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0

    movu        [r6],           m0
    movu        [r6 + r1],      m0
    movu        [r6 + r1 * 2],  m0
    movu        [r6 + r4],      m0

; filter
    cmp         bfilter, byte 0
    jz         .quit

    pxor        m4,        m4
    pshufb      m0,        m4
    pmovzxbw    m0,        m0
    mova        m1,        m0
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    movu        m3,        [r2 + 1 + 32]

    pshufb      m2,        m4
    pmovzxbw    m2,        m2
    movhlps     m4,        m3
    pmovzxbw    m3,        m3
    pmovzxbw    m4,        m4
    psubw       m3,        m2
    psubw       m4,        m2
    psraw       m3,        1
    psraw       m4,        1
    paddw       m0,        m3
    paddw       m1,        m4
    packuswb    m0,        m1

    pextrb      [r0],           m0, 0
    pextrb      [r0 + r1],      m0, 1
    pextrb      [r0 + r1 * 2],  m0, 2
    pextrb      [r0 + r4],      m0, 3
    pextrb      [r3],           m0, 4
    pextrb      [r3 + r1],      m0, 5
    pextrb      [r3 + r1 * 2],  m0, 6
    pextrb      [r3 + r4],      m0, 7
    pextrb      [r5],           m0, 8
    pextrb      [r5 + r1],      m0, 9
    pextrb      [r5 + r1 * 2],  m0, 10
    pextrb      [r5 + r4],      m0, 11
    pextrb      [r6],           m0, 12
    pextrb      [r6 + r1],      m0, 13
    pextrb      [r6 + r1 * 2],  m0, 14
    pextrb      [r6 + r4],      m0, 15
.quit:
    RET

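; 16x16 mode 11: fractions run down from [30] to [02] in steps of 2, with the
; untouched reference appended through punpcklqdq as the [00] row; fully
; unrolled, the lower half of the block restarts from [r2 + 40].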
INIT_XMM sse4
cglobal intra_pred_ang16_11, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]              ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    mova        m2,        m3
    palignr     m1,        m3, 1                  ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3,        m1                     ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    punpcklqdq  m1,        m2                         ;[00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m3,        [r2 + 40]              ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    mova        m2,        m3
    palignr     m1,        m3, 1                  ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3,        m1                     ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    punpcklqdq  m1,        m2                         ;[00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

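; 16x16 mode 25: the same descending [30]..[02] ladder as mode 11, read from
; the start of the reference buffer and emitted in two 8-column passes; as in
; mode 27, the second half of each pass is stored directly with movh/movhps.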
INIT_XMM sse4
cglobal intra_pred_ang16_25, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       2
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

.loop:
    movu        m3,        [r2]                   ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    mova        m2,        m3
    palignr     m1,        m3, 1                  ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m3,        m1                     ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m3, [r3 + 14 * 16]         ; [30]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m3, [r3 + 12 * 16]         ; [28]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        m3, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 + 8 * 16]          ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 + 6 * 16]          ; [22]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r3 + 4 * 16]          ; [20]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r3 - 8 * 16]          ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r3 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    lea         r0,        [r6 + 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

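; 16x16 mode 12: the fraction decreases by 5 per row ([27], [22], [17], ...,
; wrapping modulo 32), so an occasional byte projected from the opposite edge
; is needed; those bytes are gathered once with the c_mode16_12 shuffle of
; [r2] and shifted in through pslldq/palignr.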
INIT_XMM sse4
cglobal intra_pred_ang16_12, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m0,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_12]

    palignr     m0,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m0, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m0, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m0, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r4 - 14 * 16]             ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 15

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

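; 16x16 mode 24: mirror of mode 12 with the two reference halves swapped
; (c_mode16_12 is applied to [r2 + 32]); the second unrolled half writes the
; right eight columns via lea r0, [r6 + 8].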
INIT_XMM sse4
cglobal intra_pred_ang16_24, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m0,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pshufb      m2,        [c_mode16_12]

    palignr     m0,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m0, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m0, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1

    pmaddubsw   m5,        m0, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m0, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r4 - 14 * 16]             ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 15

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 1 * 16]          ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    palignr     m3,        m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

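; 16x16 mode 13: the fraction drops by 9 per row ([23], [14], [05], [28], ...),
; so a projected byte from the c_mode16_13 shuffle has to be rotated in every
; few rows with pslldq + palignr.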
INIT_XMM sse4
cglobal intra_pred_ang16_13, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_13]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m5, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        [r4 - 11 * 16]             ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

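; 16x16 mode 23: counterpart of mode 13 with the reference halves exchanged;
; same [23], [14], [05], ... ladder, with the second unrolled half shifted
; eight columns to the right.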
INIT_XMM sse4
cglobal intra_pred_ang16_23, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pshufb      m2,        [c_mode16_13]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m0,        m5, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m0,        m7
    packuswb    m4,        m0

    pmaddubsw   m5,        [r4 - 11 * 16]             ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

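; 16x16 mode 14: the fraction steps by -13 ([19], [06], [25], [12], [31], ...),
; so roughly every other row pulls a fresh projected byte from the c_mode16_14
; table before the next pmaddubsw pair.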
INIT_XMM sse4
cglobal intra_pred_ang16_14, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_14]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

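; 16x16 mode 22: mirror of mode 14 (same -13 ladder) with c_mode16_14 applied
; to [r2 + 32] and the two output halves written side by side rather than
; stacked.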
INIT_XMM sse4
cglobal intra_pred_ang16_22, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m5,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pshufb      m2,        [c_mode16_14]

    palignr     m5,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        m5, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 x x x x x x x x]

    pmaddubsw   m4,        m3, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

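; 16x16 mode 15: the fraction steps by -17 ([15], [30], [13], [28], ...); a
; new projected byte from the c_mode16_15 shuffle is consumed every two rows,
; so m2 is shifted with pslldq between almost every pmaddubsw pair.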
INIT_XMM sse4
cglobal intra_pred_ang16_15, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_15]

    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 16]                  ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 -  5 * 16]         ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7  * 16]         ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 +  4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

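; 16x16 mode 21: mirror of mode 15; note the extra pinsrb that patches the
; corner byte [r2] into m2 before the c_mode16_15 shuffle, since here the
; projection is built from the [r2 + 32] half.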
INIT_XMM sse4
cglobal intra_pred_ang16_21, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    pshufb      m2,        [c_mode16_15]

    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 16]                  ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]

    pmaddubsw   m4,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 -  5 * 16]         ; [11]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pmaddubsw   m1,        m3, [r4 - 7  * 16]         ; [09]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 9  * 16]         ; [07]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 +  4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pmaddubsw   m6,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

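; 16x16 angular intra prediction, mode 16 (SSE4).
; r0 = dst, r1 = dst stride, r2 = reference samples: the main reference is
; read from [r2 + 32] and additional samples are gathered from [r2] with the
; c_mode16_16 shuffle. r4 = ang_table + 16 * 16, so [r4 + k * 16] is the
; filter row for fraction 16 + k; the bracketed [NN] comments name that
; fraction. Each finished 8x8 block is transposed on store.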
INIT_XMM sse4
cglobal intra_pred_ang16_16, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_16]              ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 5  * 16]             ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1                           ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 9  * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]

    pmaddubsw   m4,        m3, [r4 - 5  * 16]         ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 +  9 * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

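; 16x16 angular intra prediction, mode 20 (SSE4).
; Mirror of mode 16: the same sequence of filter fractions, but the main
; reference comes from [r2], the c_mode16_16-shuffled samples from [r2 + 32],
; and the 8x8 blocks are stored without the transpose.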
INIT_XMM sse4
cglobal intra_pred_ang16_20, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    pshufb      m2,        [c_mode16_16]              ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 5  * 16]             ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1                           ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 9  * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]

    pmaddubsw   m4,        m3, [r4 - 5  * 16]         ; [11]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 15 * 16]         ; [01]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 7  * 16]         ; [23]
    pmulhrsw    m6,        m7

    pmaddubsw   m0,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 3  * 16]         ; [13]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4,        m3, [r4 - 13 * 16]         ; [03]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 +  9 * 16]         ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pmaddubsw   m1,        m3, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m3,        [r4]                       ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

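; 16x16 angular intra prediction, mode 17 (SSE4).
; Same layout as mode 16, but the extra samples are gathered with the
; c_mode16_17 shuffle and the mode-17 fraction sequence is used (see the [NN]
; comments); output blocks are transposed on store.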
INIT_XMM sse4
cglobal intra_pred_ang16_17, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    movu        m3,        [r2 + 32]                  ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pinsrb      m3,        [r2], 0
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2]
    pshufb      m2,        [c_mode16_17]              ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 -  4 * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
    pinsrb      m2,       [r2 + 5], 0                 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pslldq      m2,       1                           ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1                           ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]

    movu        m1,        [r2 + 1 + 32]              ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]

    pmaddubsw   m4,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  2 * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
    RET

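; 16x16 angular intra prediction, mode 19 (SSE4).
; Mirror of mode 17: identical fraction sequence, with the main reference
; taken from [r2] and the c_mode16_17-shuffled samples from [r2 + 32]; the
; 8x8 blocks are stored without the transpose.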
INIT_XMM sse4
cglobal intra_pred_ang16_19, 4,7,8
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]

    movu        m3,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw   m4,        m3, m3                     ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw   m3,        m3                         ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu        m2,        [r2 + 32]
    pinsrb      m2,        [r2], 0
    pshufb      m2,        [c_mode16_17]              ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
    palignr     m4,        m3, 1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4,        [r4 - 10 * 16]             ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 15

    pmaddubsw   m5,        m3, [r4 -  4 * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
    pinsrb      m2,       [r2 + 5 + 32], 0            ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1                           ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1                           ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pslldq      m2,       1                           ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1                           ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1                           ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1                           ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 - 2  * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1                           ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1                           ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea         r0,        [r6 + 8]

    movu        m1,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq      m3,        m1, 1                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw   m3,        m1                         ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr     m2,        m2, 6                      ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    movlhps     m2,        m1                         ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]

    pmaddubsw   m4,        m3, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m4,        m7

    palignr     m3,        m2, 14

    pmaddubsw   m5,        m3, [r4 - 4  * 16]         ; [12]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 2  * 16]         ; [18]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 8  * 16]         ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m3, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 - 6  * 16]         ; [10]
    pmulhrsw    m1,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m0,        m3, [r4]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m4,        m3, [r4 + 6  * 16]         ; [22]
    pmulhrsw    m4,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m5,        m3, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5

    pmaddubsw   m5,        m3, [r4 - 14 * 16]         ; [02]
    pmulhrsw    m5,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  8 * 16]         ; [08]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m6,        m3, [r4 -  2 * 16]         ; [14]
    pmulhrsw    m6,        m7

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 4  * 16]         ; [20]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1

    pslldq      m2,       1
    palignr     m3,       m2, 14

    pmaddubsw   m1,        m3, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        [r4 - 16 * 16]             ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

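; 16x16 angular intra prediction, mode 18 (SSE4).
; The pure diagonal: no interpolation is needed. Row 0 is the reference row at
; [r2]; every following row prepends one more sample from the byte-reversed
; array at [r2 + 32] via palignr, so the whole block is produced with shifts
; and stores only.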
INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3
    movu        m0,         [r2]
    movu        m1,         [r2 + 32]
    mova        m2,         [c_mode16_18]
    pshufb      m1,         m2

    lea         r2,         [r1 * 2]
    lea         r3,         [r1 * 3]
    lea         r4,         [r1 * 4]
    movu        [r0],       m0
    palignr     m2,         m0, m1, 15
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 14
    movu        [r0 + r2],  m2
    palignr     m2,         m0, m1, 13
    movu        [r0 + r3],  m2
    lea         r0,         [r0 + r4]
    palignr     m2,         m0, m1, 12
    movu        [r0],       m2
    palignr     m2,         m0, m1, 11
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 10
    movu        [r0 + r2],  m2
    palignr     m2,         m0, m1, 9
    movu        [r0 + r3],  m2
    lea         r0,         [r0 + r4]
    palignr     m2,         m0, m1, 8
    movu        [r0],       m2
    palignr     m2,         m0, m1, 7
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 6
    movu        [r0 + r2],  m2
    palignr     m2,         m0, m1, 5
    movu        [r0 + r3],  m2
    lea         r0,         [r0 + r4]
    palignr     m2,         m0, m1, 4
    movu        [r0],       m2
    palignr     m2,         m0, m1, 3
    movu        [r0 + r1],  m2
    palignr     m2,         m0, m1, 2
    movu        [r0 + r2],  m2
    palignr     m0,         m1, 1
    movu        [r0 + r3],  m0
    RET

; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
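; %1 is the 8-pixel column group (used as the %1 * 8 offset in the transposed
; stores), %2 the transpose flag, and c0-c7 the ang_table row offset applied
; to each line; a coefficient of 0 skips the pshufb/pmaddubsw filtering for
; that line and the bytes are passed through unchanged.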
%macro PROC32_8x8 10  ; col4, transpose[0/1], c0, c1, c2, c3, c4, c5, c6, c7
  %if %3 == 0
  %else
    pshufb      m0, [r3]
    pmaddubsw   m0, [r4 + %3 * 16]
    pmulhrsw    m0, [pw_1024]
  %endif
  %if %4 == 0
    pmovzxbw    m1, m1
  %else
    pshufb      m1, [r3]
    pmaddubsw   m1, [r4 + %4 * 16]
    pmulhrsw    m1, [pw_1024]
  %endif
  %if %3 == 0
    packuswb    m1, m1
    movlhps     m0, m1
  %else
    packuswb    m0, m1
  %endif
    mova        m1, [pw_1024]
  %if %5 == 0
  %else
    pshufb      m2, [r3]
    pmaddubsw   m2, [r4 + %5 * 16]
    pmulhrsw    m2, m1
  %endif
  %if %6 == 0
    pmovzxbw    m3, m3
  %else
    pshufb      m3, [r3]
    pmaddubsw   m3, [r4 + %6 * 16]
    pmulhrsw    m3, m1
  %endif
  %if %5 == 0
    packuswb    m3, m3
    movlhps     m2, m3
  %else
    packuswb    m2, m3
  %endif
  %if %7 == 0
  %else
    pshufb      m4, [r3]
    pmaddubsw   m4, [r4 + %7 * 16]
    pmulhrsw    m4, m1
  %endif
  %if %8 == 0
    pmovzxbw    m5, m5
  %else
    pshufb      m5, [r3]
    pmaddubsw   m5, [r4 + %8 * 16]
    pmulhrsw    m5, m1
  %endif
  %if %7 == 0
    packuswb    m5, m5
    movlhps     m4, m5
  %else
    packuswb    m4, m5
  %endif
  %if %9 == 0
  %else
    pshufb      m6, [r3]
    pmaddubsw   m6, [r4 + %9 * 16]
    pmulhrsw    m6, m1
  %endif
  %if %10 == 0
    pmovzxbw    m7, m7
  %else
    pshufb      m7, [r3]
    pmaddubsw   m7, [r4 + %10 * 16]
    pmulhrsw    m7, m1
  %endif
  %if %9 == 0
    packuswb    m7, m7
    movlhps     m6, m7
  %else
    packuswb    m6, m7
  %endif

  %if %2 == 1
    ; transpose
    punpckhbw   m1,        m0, m2
    punpcklbw   m0,        m2
    punpckhbw   m3,        m0, m1
    punpcklbw   m0,        m1

    punpckhbw   m1,        m4, m6
    punpcklbw   m4,        m6
    punpckhbw   m6,        m4, m1
    punpcklbw   m4,        m1

    punpckhdq   m2,        m0, m4
    punpckldq   m0,        m4
    punpckldq   m4,        m3, m6
    punpckhdq   m3,        m6

    movh        [r0 +       + %1 * 8], m0
    movhps      [r0 +  r1   + %1 * 8], m0
    movh        [r0 +  r1*2 + %1 * 8], m2
    movhps      [r0 +  r5   + %1 * 8], m2
    movh        [r6         + %1 * 8], m4
    movhps      [r6 +  r1   + %1 * 8], m4
    movh        [r6 +  r1*2 + %1 * 8], m3
    movhps      [r6 +  r5   + %1 * 8], m3
  %else
    movh        [r0         ], m0
    movhps      [r0 + r1    ], m0
    movh        [r0 + r1 * 2], m2
    movhps      [r0 + r5    ], m2
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m6
    movhps      [r0 + r5    ], m6
  %endif
%endmacro

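; One pass of angular modes 3 and 33 over a 32x32 block: four 8x8 blocks are
; filtered and handed to TRANSPOSE_STORE_8x8, with %1 as the transpose flag
; that separates the horizontal mode (3) from its vertical mirror (33). The
; bracketed [NN] comments give the ang_table fraction used for each line.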
%macro MODE_3_33 1
    movu        m0,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9  8 7 6 5 4 3 2 1]
    palignr     m1,        m0, 1                      ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 4
    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 6
    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 8
    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 12
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 6
    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 14]                  ; [00]

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 14]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 4
    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 6
    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 8
    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 12
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 21]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 6
    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 10
    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 27]                  ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

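; Modes 4 and 32: same four-blocks-per-pass framework as MODE_3_33, using the
; reference offsets and fraction sequence of the mode-4/32 angle; %1 is again
; the transpose flag.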
%macro MODE_4_32 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1
    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 6
    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 10
    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 4
    mova        m6,        m5
    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m6,        m2, m0, 6
    pmaddubsw   m6,        [r3 +      16]             ; [17]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 8
    pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 10
    pmaddubsw   m2,        [r3]                       ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 12]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m5,        [r3 - 16]                  ; [15]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    mova        m1,        m6
    pmaddubsw   m1,        [r3 - 12 * 16]             ; [4]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    pmaddubsw   m6,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 6
    pmaddubsw   m1,        [r3 - 2 * 16]              ; [14]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 8
    mova        m2,        m1
    pmaddubsw   m1,        [r3 - 13 * 16]             ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 17]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m4,        m0, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    palignr     m5,        m2, m0, 2
    pmaddubsw   m1,        m5, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    pmaddubsw   m6,        [r3 - 4 * 16]              ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m6,        m2, m0, 6
    mova        m1,        m6
    pmaddubsw   m6,        [r3 - 15 * 16]             ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m1,        m2, m0, 8
    pmaddubsw   m1,        [r3 - 5 * 16]              ; [11]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 22]                  ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

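; Modes 5 and 31: four filtered 8x8 blocks per pass with the mode-5/31
; fraction sequence; %1 selects transposed (mode 5) or direct (mode 31) stores.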
%macro MODE_5_31 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1
    pmaddubsw   m4,        m0, [r3 +      16]          ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 14 * 16]              ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 3 * 16]               ; [19]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    mova        m1,        m6
    pmaddubsw   m6,        [r3 - 12 * 16]              ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 + 5 * 16]               ; [21]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 6
    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 10 * 16]              ; [6]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 7 * 16]               ; [23]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 8
    pmaddubsw   m2,        [r3 - 8 * 16]               ; [8]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 5]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m1,        m2, m0, 2
    mova        m5,        m1
    pmaddubsw   m4,        m0, [r3 + 9 * 16]           ; [25]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 - 6 * 16]               ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 11 * 16]              ; [27]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 4
    mova        m1,        m6
    pmaddubsw   m6,        [r3 - 4 * 16]               ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 + 13 * 16]          ; [29]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 6
    mova        m3,        m1
    pmaddubsw   m3,        [r3 - 2 * 16]               ; [14]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 15 * 16]              ; [31]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 8
    pmaddubsw   m2,        [r3]                        ; [16]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 10]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 15 * 16]          ; [1]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 2 * 16]               ; [18]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 2
    mova        m1,        m5
    pmaddubsw   m5,        [r3 - 13 * 16]              ; [3]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        [r3 + 4 * 16]               ; [20]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 11 * 16]          ; [5]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 6 * 16]               ; [22]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m2,        m0, 6
    pmaddubsw   m1,        m2, [r3 - 9 * 16]           ; [7]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]               ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 14]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 7 * 16]           ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]              ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m5,        m2, m0, 2
    mova        m1,        m5
    pmaddubsw   m5,        [r3 - 5 * 16]               ; [11]
    pmulhrsw    m5,        m7
    pmaddubsw   m1,        [r3 + 12 * 16]              ; [28]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 3 * 16]           ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 14 * 16]              ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m2,        m0, 6
    pmaddubsw   m1,        m2, [r3 - 16]               ; [15]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 18]                   ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

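; Modes 6 and 30: identical structure, mode-6/30 fractions; %1 is the
; transpose flag forwarded to TRANSPOSE_STORE_8x8.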
%macro MODE_6_30 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m1,        m0
    pmaddubsw   m4,        m0, [r3 - 3 * 16]          ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m1,        [r3 + 10 * 16]             ; [26]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    palignr     m6,        m2, m0, 2
    pmaddubsw   m5,        m6, [r3 - 9 * 16]          ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 4 * 16]              ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 15 * 16]         ; [1]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m1, [r3 - 2 * 16]          ; [14]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
    pmulhrsw    m1,        m7
    palignr     m2,        m0, 6
    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 +  5 * 16]         ; [21]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 5]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m6,        m0
    pmaddubsw   m1,        m6, [r3 - 14 * 16]         ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        m6, [r3 - 16]              ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m3,        m2, m0, 2
    pmaddubsw   m6,        m3, [r3 - 7 * 16]          ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        [r3 + 6 * 16]              ; [22]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    palignr     m2,        m0, 4
    pmaddubsw   m1,        m2, [r3 - 13 * 16]         ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r3]                   ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 +  13 * 16]        ; [29]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 7]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m5,        m2, m0, 2
    pmaddubsw   m1,        m5, [r3 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        [r3 + 7 * 16]              ; [23]
    pmulhrsw    m5,        m7
    palignr     m1,        m2, m0, 4
    pmaddubsw   m6,        m1, [r3 - 12 * 16]         ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 + 16]              ; [17]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    palignr     m2,        m0, 6
    pmaddubsw   m1,        m2, [r3 - 5 * 16]          ; [11]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]              ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 11]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m5,        m0
    pmaddubsw   m4,        m0, [r3 - 11 * 16]         ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m5, [r3 + 2 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
    pmulhrsw    m5,        m7
    palignr     m6,        m2, m0, 2
    pmaddubsw   m1,        m6, [r3 - 4 * 16]          ; [12]
    pmulhrsw    m1,        m7
    packuswb    m5,        m1
    pmaddubsw   m6,        [r3 + 9 * 16]              ; [25]
    pmulhrsw    m6,        m7
    palignr     m1,        m2, m0, 4
    pmaddubsw   m2,        m1, [r3 - 10 * 16]         ; [6]
    pmulhrsw    m2,        m7
    packuswb    m6,        m2
    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 14]                  ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

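; Modes 7 and 29: identical structure, mode-7/29 fractions; %1 chooses between
; the transposed and direct store paths.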
%macro MODE_7_29 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    mova        m5,        m0
    pmaddubsw   m4,        m0, [r3 - 7 * 16]         ; [9]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m5, [r3 + 2 * 16]         ; [18]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        [r3 + 11 * 16]            ; [27]
    pmulhrsw    m5,        m7
    palignr     m1,        m2, m0, 2
    palignr     m2,        m0, 4
    pmaddubsw   m6,        m1, [r3 - 12 * 16]        ; [4]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m1, [r3 - 3 * 16]         ; [13]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m1, [r3 + 6 * 16]         ; [22]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        [r3 + 15 * 16]            ; [31]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]         ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 16]             ; [17]
    pmulhrsw    m4,        m7
    pmaddubsw   m2,        [r3 + 10 * 16]            ; [26]
    pmulhrsw    m2,        m7
    packuswb    m4,        m2
    movu        m0,        [r2 + 4]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m5,        m0, [r3 - 13 * 16]        ; [03]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 5 * 16]         ; [21]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]            ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 - 9 * 16]         ; [07]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r3]                  ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 9 * 16]         ; [25]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 6]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m1,        m0, [r3 - 14 * 16]        ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        m0, [r3 - 5 * 16]         ; [11]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 + 4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 13 * 16]        ; [29]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 - 10 * 16]        ; [6]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r3 - 16]             ; [15]
    pmulhrsw    m1,        m7
    pmaddubsw   m2,        [r3 + 8 * 16]             ; [24]
    pmulhrsw    m2,        m7
    packuswb    m1,        m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m0,        [r2 + 8]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m4,        m0, [r3 - 15 * 16]        ; [1]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]         ; [10]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r3 + 3 * 16]         ; [19]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 + 12 * 16]        ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m2,        m0, 2
    pmaddubsw   m6,        m2, [r3 - 11 * 16]        ; [5]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r3 - 2 * 16]         ; [14]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 + 7 * 16]         ; [23]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 10]                 ; [0]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

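; MODE_8_28: same structure for modes 8/28 (intra angle 5); the reference window advances
; more slowly, so fresh interleaves are only loaded at r2 + 3, + 4 and + 5.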
%macro MODE_8_28 1
    movu        m0,        [r2 + 1]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m4,        m0, [r3 - 11 * 16]     ; [5]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r3 - 6 * 16]      ; [10]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r3 - 1 * 16]      ; [15]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 + 4 * 16]      ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 9 * 16]      ; [25]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        [r3 + 14 * 16]         ; [30]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 - 13 * 16]     ; [3]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3 - 8 * 16]      ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 - 3 * 16]      ; [13]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 2 * 16]      ; [18]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 7 * 16]      ; [23]
    pmulhrsw    m5,        m7
    pmaddubsw   m2,        [r3 + 12 * 16]         ; [28]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2
    movu        m0,        [r2 + 3]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m6,        m0, [r3 - 15 * 16]     ; [01]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r3 - 10 * 16]     ; [06]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r3 - 5 * 16]      ; [11]
    pmulhrsw    m1,        m7
    mova        m2,        m0
    pmaddubsw   m0,        [r3]                   ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 5 * 16]      ; [21]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 10 * 16]     ; [26]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 15 * 16]     ; [31]
    pmulhrsw    m5,        m7
    movu        m0,        [r2 + 4]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m2,        m0, [r3 - 12 * 16]     ; [4]
    pmulhrsw    m2,        m7
    packuswb    m5,        m2
    pmaddubsw   m6,        m0, [r3 - 7 * 16]      ; [9]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r3 - 2 * 16]      ; [14]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r3 + 3 * 16]      ; [19]
    pmulhrsw    m1,        m7
    mova        m2,        m0
    pmaddubsw   m0,        [r3 + 8 * 16]          ; [24]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 13 * 16]     ; [29]
    pmulhrsw    m4,        m7
    movu        m0,        [r2 + 5]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    pmaddubsw   m1,        m0, [r3 - 14 * 16]     ; [2]
    pmulhrsw    m1,        m7
    packuswb    m4,        m1
    pmaddubsw   m5,        m0, [r3 - 9 * 16]      ; [7]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r3 - 4 * 16]      ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r3 + 16]          ; [17]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r3 + 6 * 16]      ; [22]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r3 + 11 * 16]     ; [27]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 6]               ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

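; MODE_9_27: modes 9/27 (intra angle 2). The fraction grows by 2 per row, so a single
; interleaved load covers 16 rows before the window moves one byte, and the [00] rows are
; plain reference copies fetched with movhps.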
%macro MODE_9_27 1
    movu        m2,        [r2 + 1]
    palignr     m1,        m2, 1
    punpckhbw   m0,        m2, m1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r3 - 14 * 16]   ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r3 - 12 * 16]   ; [4]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r3 - 10 * 16]   ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3]             ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4,        m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 + 10 * 16]   ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 + 12 * 16]   ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r3 + 14 * 16]   ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 2]             ; [00]

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    movu        m2,        [r2 + 2]
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r3 - 14 * 16]   ; [2]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r3 - 12 * 16]   ; [4]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r3 - 10 * 16]   ; [6]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 - 8 * 16]    ; [8]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 - 6 * 16]    ; [10]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r3 - 4 * 16]    ; [12]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r3 - 2 * 16]    ; [14]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r3]             ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    movu        m2,        [r2 + 2]
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r3 + 2 * 16]    ; [18]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r3 + 4 * 16]    ; [20]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r3 + 6 * 16]    ; [22]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r3 + 8 * 16]    ; [24]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r3 + 10 * 16]   ; [26]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r3 + 12 * 16]   ; [28]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r3 + 14 * 16]   ; [30]
    pmulhrsw    m1,        m7
    packuswb    m1,        m1
    movhps      m1,        [r2 + 3]             ; [00]

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

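; MODE_12_24: body strips of modes 12/24 (intra angle -5). With a negative angle the
; reference window steps backwards (r2, r2 - 2, r2 - 3, r2 - 4), and the final [00] row is
; produced through the pb_fact0 shuffle rather than a movhps copy.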
%macro MODE_12_24 1
    movu        m2,        [r2]
    palignr     m1,        m2, 1
    punpckhbw   m0,        m2, m1
    punpcklbw   m2,        m1
    palignr     m0,        m2, 2
    pmaddubsw   m4,        m0, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m0, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    movu        m0,        [r2 - 2]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m6,        m2, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4]                   ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 - 15 * 16]         ; [1]
    pmulhrsw    m5,        m7
    movu        m0,        [r2 - 3]
    palignr     m1,        m0, 1
    punpckhbw   m2,        m0, m1
    punpcklbw   m0,        m1
    palignr     m2,        m0, 2
    pmaddubsw   m6,        m2, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 13 * 16]         ; [3]
    pmulhrsw    m4,        m7
    movu        m2,        [r2 - 4]
    palignr     m1,        m2, 1
    punpckhbw   m0,        m2, m1
    punpcklbw   m2,        m1
    palignr     m0,        m2, 2
    pmaddubsw   m5,        m0, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m0, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m0, [r4 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m0, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m0, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m0, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7
    movu        m2,        [pb_fact0]
    pshufb      m0,        m2
    pmovzxbw    m0,        m0
    packuswb    m1,        m0
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

;------------------------------------------------------------------------------------------
; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
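; The weighted rows in the kernels above and below implement the HEVC angular interpolation
;     dst[x] = (ref[i] * (32 - f) + ref[i + 1] * f + 16) >> 5
; the ang_table rows are expected to hold the byte pairs (32 - f, f); pmaddubsw forms the
; weighted sum on the interleaved neighbours and pmulhrsw with pw_1024 supplies the
; (+16) >> 5 rounding, since ((x * 1024 + 16384) >> 15) == (x + 16) >> 5. Modes 2/34 use a
; whole-sample step (angle 32), which is why intra_pred_ang32_2 below is pure palignr/movu
; copying with no multiplies.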
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,5,4
    lea             r4, [r2]
    add             r2, 64
    cmp             r3m, byte 34
    cmove           r2, r4
    movu            m0, [r2 + 2]
    movu            m1, [r2 + 18]
    movu            m3, [r2 + 34]

    lea             r3, [r1 * 3]

    movu            [r0], m0
    movu            [r0 + 16], m1
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 1
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 2
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 3
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m1, m0, 4
    movu            [r0], m2
    palignr         m2, m3, m1, 4
    movu            [r0 + 16], m2
    palignr         m2, m1, m0, 5
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 5
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 6
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 6
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 7
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 7
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m1, m0, 8
    movu            [r0], m2
    palignr         m2, m3, m1, 8
    movu            [r0 + 16], m2
    palignr         m2, m1, m0, 9
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 9
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 10
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 10
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 11
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 11
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m1, m0, 12
    movu            [r0], m2
    palignr         m2, m3, m1, 12
    movu            [r0 + 16], m2
    palignr         m2, m1, m0, 13
    movu            [r0 + r1], m2
    palignr         m2, m3, m1, 13
    movu            [r0 + r1 + 16], m2
    palignr         m2, m1, m0, 14
    movu            [r0 + r1 * 2], m2
    palignr         m2, m3, m1, 14
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m1, m0, 15
    movu            [r0 + r3], m2
    palignr         m2, m3, m1, 15
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    movu            [r0], m1
    movu            m0, [r2 + 50]
    movu            [r0 + 16], m3
    palignr         m2, m3, m1, 1
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 1
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 2
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 2
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 3
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 3
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m3, m1, 4
    movu            [r0], m2
    palignr         m2, m0, m3, 4
    movu            [r0 + 16], m2
    palignr         m2, m3, m1, 5
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 5
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 6
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 6
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 7
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 7
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m3, m1, 8
    movu            [r0], m2
    palignr         m2, m0, m3, 8
    movu            [r0 + 16], m2
    palignr         m2, m3, m1, 9
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 9
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 10
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 10
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 11
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 11
    movu            [r0 + r3 + 16], m2

    lea             r0, [r0 + r1 * 4]

    palignr         m2, m3, m1, 12
    movu            [r0], m2
    palignr         m2, m0, m3, 12
    movu            [r0 + 16], m2
    palignr         m2, m3, m1, 13
    movu            [r0 + r1], m2
    palignr         m2, m0, m3, 13
    movu            [r0 + r1 + 16], m2
    palignr         m2, m3, m1, 14
    movu            [r0 + r1 * 2], m2
    palignr         m2, m0, m3, 14
    movu            [r0 + r1 * 2 + 16], m2
    palignr         m2, m3, m1, 15
    movu            [r0 + r3], m2
    palignr         m2, m0, m3, 15
    movu            [r0 + r3 + 16], m2
    RET

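; Drivers for modes 3-9 share one shape: r3 points at ang_table biased by 16 rows so that
; signed offsets select the fraction, m7 holds the pw_1024 rounding constant, and four
; MODE_x_y iterations each emit an 8-column strip while r2 walks 8 bytes along the left
; reference (the +64 offset appears to select the left neighbours in the combined buffer).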
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_3_33 1
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                    ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]               ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_4_32 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_5_31 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]                  ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]             ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_6_30 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]               ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]          ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_7_29 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]       ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_8_28 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,7,8
    add         r2,        64
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]         ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]    ; r6 -> 4 * stride
    mova        m7,        [pw_1024]
.loop:
    MODE_9_27 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

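; Mode 10 is pure horizontal: each output row is one left-reference byte replicated across
; the 32-wide row (pshufb against the zeroed m7) and stored to both 16-byte halves. m8/m9
; keep the unmodified top-left/top samples so the bFilter path can add
; (top[x] - topLeft) >> 1 to the row written at .quit.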
INIT_XMM sse4
cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    pxor        m7, m7
    mov         r6, 2
    movu        m0, [r2]
    movu        m1, [r2 + 1]
    mova        m8, m0
    mova        m9, m1
    mov         r3d, r4d
    lea         r4, [r1 * 3]

.loop:
    movu        m0, [r2 + 1 + 64]
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    movu        [r0 + r1], m1
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r1 * 2 + 16], m2
    movu        [r0 + r4], m3
    movu        [r0 + r4 + 16], m3
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m4
    movu        [r5 + 16], m4
    movu        [r5 + r1], m5
    movu        [r5 + r1 + 16], m5
    movu        [r5 + r1 * 2], m6
    movu        [r5 + r1 * 2 + 16], m6

    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    movu        [r5 + r4], m1
    movu        [r5 + r4 + 16], m1
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m2
    movu        [r5 + 16], m2
    movu        [r5 + r1], m3
    movu        [r5 + r1 + 16], m3
    movu        [r5 + r1 * 2], m4
    movu        [r5 + r1 * 2 + 16], m4
    movu        [r5 + r4], m5
    movu        [r5 + r4 + 16], m5
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m6
    movu        [r5 + 16], m6

    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7

    movu        [r5 + r1], m1
    movu        [r5 + r1 + 16], m1
    movu        [r5 + r1 * 2], m2
    movu        [r5 + r1 * 2 + 16], m2
    movu        [r5 + r4], m3
    movu        [r5 + r4 + 16], m3

; filter
    cmp         r3d, byte 0
    jz         .quit
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m7
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1

.quit:
    movu        [r0], m0
    movu        [r0 + 16], m0
    dec         r6
    lea         r0, [r5 + r1 * 4]
    lea         r2, [r2 + 16]
    jnz         .loop
    RET

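; Mode 11 (intra angle -2): the reference is rebuilt in a 64-byte-aligned stack block so
; that one projected above sample (the broadcast of [r2 + 16]) sits just below the corner
; and left samples; the spare byte at rsp + 63 doubles as the 4-iteration row counter and
; PROC32_8x8 performs the interpolation with the per-row fractions given as arguments.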
INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 16]
    pxor        m1, m1
    pshufb      m0, m1                   ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    mova        [rsp], m0
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 16 + 64]
    movu        m2, [r2 + 32 + 64]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 1]            ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 1, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  1, 1, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  2, 1, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 14,12,10,8,6,4,2,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

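; MODE_12_24_ROW0: first 8-column strip of modes 12/24. The bottom rows need reference
; samples left of index 0, so those are projected from the above reference (c_mode32_12_0
; plus a pinsrb of [r3 + 26]) into the stack-backed 'above' slot, from which palignr/pslldq
; peel one extra sample each time the row fraction wraps.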
%macro MODE_12_24_ROW0 1
    movu        m0,        [r3 + 6]
    pshufb      m0,        [c_mode32_12_0]
    pinsrb      m0,        [r3 + 26], 12
    mova        above,     m0
    movu        m2,        [r2]
  %if %1 == 1
    pinsrb      m2,        [r3], 0
  %endif
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r4 + 11 * 16]         ; [27]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 + 6 * 16]          ; [22]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 + 16]              ; [17]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 4 * 16]          ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 9 * 16]          ; [7]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 - 14 * 16]         ; [2]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    movu        m1,        [r2]                       ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
  %if %1 == 1
    pinsrb      m1,        [r3], 0
  %endif
    palignr     m2,        m1, above, 15              ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    punpcklbw   m2,        m1                         ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    pmaddubsw   m1,        m2, [r4 + 13 * 16]         ; [29]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 + 8 * 16]          ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 + 3 * 16]          ; [19]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 - 2 * 16]          ; [14]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 7 * 16]          ; [09]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]         ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    palignr     m2,        above, 14                  ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    pmaddubsw   m6,        m2, [r4 + 15 * 16]         ; [31]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 10 * 16]         ; [26]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 + 5 * 16]          ; [21]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4]                   ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 5 * 16]          ; [11]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 - 10 * 16]         ; [06]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 - 15 * 16]         ; [1]
    pmulhrsw    m5,        m7
    pslldq      m1,        above, 1
    palignr     m2,        m1, 14
    pmaddubsw   m6,        m2, [r4 + 12 * 16]         ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 7 * 16]          ; [23]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 + 2 * 16]          ; [18]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 - 3 * 16]          ; [13]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 - 8 * 16]          ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 13 * 16]         ; [3]
    pmulhrsw    m4,        m7
    pslldq      m1,        above, 2
    palignr     m2,        m1, 14
    pmaddubsw   m5,        m2, [r4 + 14 * 16]         ; [30]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 + 9 * 16]          ; [25]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 4 * 16]          ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 16]              ; [15]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 - 6 * 16]          ; [10]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 - 11 * 16]         ; [05]
    pmulhrsw    m1,        m7
    movu        m0,        [pb_fact0]
    pshufb      m2,        m0
    pmovzxbw    m2,        m2
    packuswb    m1,        m2
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize)
  %define above    [rsp + 0 * mmsize]
    mov         r3,        r2
    add         r2,        64
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    MODE_12_24_ROW0 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        7
    mov         r3,        3
.loop:
    MODE_12_24 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

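; MODE_13_23_ROW0: first 8-column strip of modes 13/23 (intra angle -9). The projected
; above samples are gathered through c_mode32_13_0 / c_mode32_13_shuf into 'above', and the
; window slides one byte further into it (palignr after successive pslldq steps) whenever
; the row fraction wraps.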
%macro MODE_13_23_ROW0 1
    movu        m0,        [r3 + 1]
    movu        m1,        [r3 + 15]
    pshufb      m0,        [c_mode32_13_0]
    pshufb      m1,        [c_mode32_13_0]
    punpckldq   m0,        m1
    pshufb      m0,        [c_mode32_13_shuf]
    mova        above,     m0
    movu        m2,        [r2]
  %if (%1 == 1)
    pinsrb      m2,        [r3], 0
  %endif
    palignr     m1,        m2, 1
    punpcklbw   m2,        m1
    pmaddubsw   m4,        m2, [r4 + 7 * 16]         ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 - 2 * 16]         ; [14]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 - 11 * 16]        ; [5]
    pmulhrsw    m5,        m7
    movu        m1,        [r2]                      ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
  %if (%1 == 1)
    pinsrb      m1,        [r3], 0
  %endif
    palignr     m2,        m1, above, 15             ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    punpcklbw   m2,        m1                        ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    pmaddubsw   m6,        m2, [r4 + 12 * 16]        ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 3 * 16]         ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m0,        m2, [r4 - 6 * 16]         ; [10]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r4 - 15 * 16]        ; [1]
    pmulhrsw    m1,        m7
    palignr     m2,        above, 14
    pmaddubsw   m3,        m2, [r4 + 8 * 16]         ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 16]             ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 - 10 * 16]        ; [6]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pslldq      m0,        above, 1
    palignr     m2,        m0, 14
    pmaddubsw   m5,        m2, [r4 + 13 * 16]        ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 5 * 16]         ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 - 14 * 16]        ; [2]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pslldq      m0,        1
    palignr     m2,        m0, 14
    pmaddubsw   m1,        m2, [r4 + 9 * 16]         ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r4]                  ; [16]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 - 9 * 16]         ; [7]
    pmulhrsw    m4,        m7
    pslldq      m0,        above, 3
    palignr     m2,        m0, 14
    pmaddubsw   m3,        m2, [r4 + 14 * 16]        ; [30]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 + 5 * 16]         ; [21]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 4 * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 13 * 16]        ; [3]
    pmulhrsw    m6,        m7
    pslldq      m0,        1
    palignr     m2,        m0, 14
    pmaddubsw   m0,        m2, [r4 + 10 * 16]        ; [26]
    pmulhrsw    m0,        m7
    packuswb    m6,        m0
    pmaddubsw   m1,        m2, [r4 + 16]             ; [17]
    pmulhrsw    m1,        m7
    pmaddubsw   m0,        m2, [r4 - 8 * 16]         ; [8]
    pmulhrsw    m0,        m7
    packuswb    m1,        m0
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pslldq      m0,        above, 5
    palignr     m2,        m0, 14
    pmaddubsw   m4,        m2, [r4 + 15 * 16]        ; [31]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 + 6 * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 3 * 16]         ; [13]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]        ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pslldq      m0,        1
    palignr     m2,        m0, 14
    pmaddubsw   m6,        m2, [r4 + 11 * 16]        ; [27]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 2 * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 - 7 * 16]         ; [09]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m2, [r4 - 16 * 16]        ; [00]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

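; MODE_13_23: remaining strips of modes 13/23; %2 appears to mark the strip whose
; [r2 - 7] load still reaches the corner sample, which is then patched in with pinsrb
; from [r3].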
%macro MODE_13_23 2
    movu        m2,        [r2]                      ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr     m1,        m2, 1                     ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw   m0,        m2, m1                    ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr     m0,        m2, 2                     ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw   m4,        m0, [r4 + 7 * 16]         ; [23]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m0, [r4 - 2 * 16]         ; [14]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m0, [r4 - 11 * 16]        ; [05]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 12 * 16]        ; [28]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 + 3 * 16]         ; [19]
    pmulhrsw    m6,        m7
    pmaddubsw   m3,        m2, [r4 - 6 * 16]         ; [10]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m2, [r4 - 15 * 16]        ; [1]
    pmulhrsw    m1,        m7
    movu        m2,        [r2 - 2]                  ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
    palignr     m3,        m2, 1                     ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    punpckhbw   m0,        m2, m3
    punpcklbw   m2,        m3
    palignr     m0,        m2, 2
    pmaddubsw   m3,        m0, [r4 + 8 * 16]         ; [24]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    mova        m3,        m0
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m3, [r4 - 16]             ; [15]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m3, [r4 - 10 * 16]        ; [6]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 + 13 * 16]        ; [29]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 + 4 * 16]         ; [20]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 5 * 16]         ; [11]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 - 14 * 16]        ; [2]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    movu        m2,        [r2 - 4]                  ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr     m1,        m2, 1                     ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw   m0,        m2, m1                    ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr     m0,        m2, 2                     ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw   m1,        m0, [r4 + 9 * 16]         ; [25]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m0, [r4]                  ; [16]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    mova        m3,        m0
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m3, [r4 - 9 * 16]         ; [7]
    pmulhrsw    m4,        m7
    pmaddubsw   m3,        m2, [r4 + 14 * 16]        ; [30]
    pmulhrsw    m3,        m7
    packuswb    m4,        m3
    pmaddubsw   m5,        m2, [r4 + 5 * 16]         ; [21]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 4 * 16]         ; [12]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    pmaddubsw   m6,        m2, [r4 - 13 * 16]        ; [3]
    pmulhrsw    m6,        m7
    movu        m2,        [r2 - 6]                  ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr     m1,        m2, 1                     ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw   m0,        m2, m1                    ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr     m0,        m2, 2                     ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw   m3,        m0, [r4 + 10 * 16]        ; [26]
    pmulhrsw    m3,        m7
    packuswb    m6,        m3
    pmaddubsw   m1,        m0, [r4 + 16]             ; [17]
    pmulhrsw    m1,        m7
    pmaddubsw   m3,        m0, [r4 - 8 * 16]         ; [8]
    pmulhrsw    m3,        m7
    packuswb    m1,        m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw   m4,        m2, [r4 + 15 * 16]        ; [31]
    pmulhrsw    m4,        m7
    pmaddubsw   m5,        m2, [r4 + 6 * 16]         ; [22]
    pmulhrsw    m5,        m7
    packuswb    m4,        m5
    pmaddubsw   m5,        m2, [r4 - 3 * 16]         ; [13]
    pmulhrsw    m5,        m7
    pmaddubsw   m6,        m2, [r4 - 12 * 16]        ; [04]
    pmulhrsw    m6,        m7
    packuswb    m5,        m6
    movu        m2,        [r2 - 7]                  ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
  %if ((%1 & %2) == 1)
    pinsrb      m2,        [r3], 0
  %endif
    palignr     m1,        m2, 1                     ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpcklbw   m2,        m1                        ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    pmaddubsw   m6,        m2, [r4 + 11 * 16]        ; [27]
    pmulhrsw    m6,        m7
    pmaddubsw   m1,        m2, [r4 + 2 * 16]         ; [18]
    pmulhrsw    m1,        m7
    packuswb    m6,        m1
    pmaddubsw   m1,        m2, [r4 - 7 * 16]         ; [09]
    pmulhrsw    m1,        m7
    movu        m0,        [pb_fact0]
    pshufb      m2,        m0
    pmovzxbw    m2,        m2
    packuswb    m1,        m2
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    mov         r3,        r2
    add         r2,        64
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]                  ; r5 -> 3 * stride
    lea         r6,        [r0 + r1 * 4]             ; r6 -> 4 * stride
    mova        m7,        [pw_1024]

    MODE_13_23_ROW0 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        7

    MODE_13_23 1, 1
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    mov         r3,        2
.loop:
    MODE_13_23 1, 0
    lea         r0,        [r6 + r1 * 4]
    lea         r6,        [r6 + r1 * 8]
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

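; Modes 14-17 follow the same plan as mode 11, but the larger negative angles need more
; above samples projected in front of the left reference (c_mode32_14_0 here, _15/_16/_17
; below); PROC32_8x8 is then fed pre-shifted copies of the window prepared with palignr.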
INIT_XMM sse4
cglobal intra_pred_ang32_14, 3,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 15]
    pshufb      m0, [c_mode32_14_0]      ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]      ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                   ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr     m0, m1, 10               ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 13]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 1, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  1, 1, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    PROC32_8x8  2, 1, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 27,14,1,20,7,26,13,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_15, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 15]
    pshufb      m0, [c_mode32_15_0]      ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]      ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 17]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 1, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  1, 1, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  2, 1, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 1, 23,6,21,4,19,2,17,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_16, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 15]
    pshufb      m0, [c_mode32_16_0]      ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]      ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 21]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]        ; r6 -> 4 * stride
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 1, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  1, 1, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  2, 1, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 1, 19,30,9,20,31,10,21,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_17, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp     ], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1 + 64]
    movu        m1, [r2 + 1 + 16 + 64]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 25]          ; r2 -> [0]
    lea         r3, [c_shuf8_0]         ; r3 -> shuffle8
    lea         r4, [ang_table]         ; r4 -> ang_table
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]       ; r6 -> 4 * stride
    mova        m5, [pw_1024]           ; m5 -> 1024
    mova        m6, [c_deinterval8]     ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 1, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  1, 1, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  2, 1, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  3, 1, 22,28,2,8,14,20,26,0

    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]

    RET

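; Mode 18 is the pure diagonal (angle -32): each row shifts the reference by one whole
; sample, so the block is built entirely from palignr copies that splice the reversed left
; samples (c_mode32_18_0) in front of the top row, with no interpolation.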
INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    movu        m0, [r2]               ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 16]          ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
    movu        m2, [r2 + 1 + 64]      ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 17 + 64]     ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]

    lea         r2, [r1 * 2]
    lea         r3, [r1 * 3]
    lea         r4, [r1 * 4]

    movu        [r0], m0
    movu        [r0 + 16], m1

    pshufb      m2, [c_mode32_18_0]    ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    pshufb      m3, [c_mode32_18_0]    ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

    palignr     m4, m0, m2, 15
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 15
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 14
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 14
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 13
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 13
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 12
    movu        [r0], m4
    palignr     m4, m1, m0, 12
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 11
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 11
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 10
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 10
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 9
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 9
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 8
    movu        [r0], m4
    palignr     m4, m1, m0, 8
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 7
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 7
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 6
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 6
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 5
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 5
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 4
    movu        [r0], m4
    palignr     m4, m1, m0, 4
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 3
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 3
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 2
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 2
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 1
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 1
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    movu        [r0], m2
    movu        [r0 + 16], m0
    palignr     m4, m2, m3, 15
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 15
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 14
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 14
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 13
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 13
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 12
    movu        [r0], m4
    palignr     m4, m0, m2, 12
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 11
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 11
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 10
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 10
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 9
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 9
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 8
    movu        [r0], m4
    palignr     m4, m0, m2, 8
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 7
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 7
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 6
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 6
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 5
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 5
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 4
    movu        [r0], m4
    palignr     m4, m0, m2, 4
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 3
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 3
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 2
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 2
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 1
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 1
    movu        [r0 + r3 + 16], m4
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
    ; NOTE: align the stack to 64 bytes so all of the local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6
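    ; the caller's rsp is kept at [rsp + 64] and restored before RET; the 64
    ; bytes below it are a cache-line aligned scratch area for the projected
    ; reference samples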

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 16 + 64]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp     ], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         [rsp + 63], byte 4
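    ; [rsp + 63] is the pass counter: .loop below runs four times to cover the
    ; whole 32x32 block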

    ; filter
    lea         r2, [rsp + 25]          ; r2 -> [0]
    lea         r3, [c_shuf8_0]         ; r3 -> shuffle8
    lea         r4, [ang_table]         ; r4 -> ang_table
    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
    lea         r6, [r0]                ; r6 -> r0
    mova        m5, [pw_1024]           ; m5 -> 1024
    mova        m6, [c_deinterval8]     ; m6 -> c_deinterval8
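    ; each pass of .loop builds one 8-pixel-wide strip as four 8x8 sub-blocks
    ; (rows 0-7, 8-15, 16-23, 24-31), then r2 advances 8 reference samples and
    ; r6/r0 step to the next strip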

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 22,28,2,8,14,20,26,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
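    ; modes 20, 21 and 22 below follow the same scheme as mode 19; mainly the
    ; reference-projection shuffle and the per-row interpolation weights differ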
    ; NOTE: align the stack to 64 bytes so all of the local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_16_0]      ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]      ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 21]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 19,30,9,20,31,10,21,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_21, 4,7,8
    ; NOTE: align the stack to 64 bytes so all of the local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_15_0]      ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]      ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 17]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 0, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 23,6,21,4,19,2,17,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
    ; NOTE: align the stack to 64 bytes so all of the local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_14_0]      ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]      ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                   ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr     m0, m1, 10               ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 13]           ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 0, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 27,14,1,20,7,26,13,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    lea         r3,        [r2 + 64]
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]
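    ; MODE_13_23_ROW0 produces the first of the four 8-sample strips; the loop
    ; below produces the remaining three with MODE_13_23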

    MODE_13_23_ROW0 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        7
    mov         r3,        3
.loop:
    MODE_13_23 0, 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    lea         r3,        [r2 + 64]
    lea         r4,        [ang_table + 16 * 16]
    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
    mov         r6,        r0
    mova        m7,        [pw_1024]
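    ; same layout as mode 23, using the shared MODE_12_24 helpers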

    MODE_12_24_ROW0 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        7
    mov         r3,        3
.loop:
    MODE_12_24 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r3
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
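    ; mode 25 is nearly vertical: the integer reference offset changes only once
    ; across the 32 rows, so rows 0-15 interpolate from [r2] and rows 16-31 from
    ; [r2 - 1]; only the per-row weights (30, 28, ..., 0) change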
    ; NOTE: align the stack to 64 bytes so all of the local data stays in the same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 16 + 64]
    pxor        m1, m1
    pshufb      m0, m1                   ; broadcast the low byte to all 16 lanes
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 1]            ; r2 -> [0]
    lea         r3, [c_shuf8_0]          ; r3 -> shuffle8
    lea         r4, [ang_table]          ; r4 -> ang_table
    lea         r5, [r1 * 3]             ; r5 -> 3 * stride
    lea         r6, [r0]                 ; r6 -> r0
    mova        m5, [pw_1024]            ; m5 -> 1024
    mova        m6, [c_deinterval8]      ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
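    ; mode 26 is pure vertical: every output row is a copy of the samples above
    ; the block; the optional edge filter (gated by the 5th argument) adjusts the
    ; first column of each 16-wide half using the left reference samples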
    mov         r6,             2
    movu        m0,             [r2 + 64]
    pinsrb      m0,             [r2], 0
    movu        m1,             [r2 + 1 + 64]
    mova        m8,             m0
    mova        m9,             m1
    mov         r3d,            r4d
    lea         r4,             [r1 * 3]

.loop:
    movu        m0,             [r2 + 1]

    movu        [r0],           m0
    movu        [r0 + r1],      m0
    movu        [r0 + r1 * 2],  m0
    movu        [r0 + r4],      m0
    lea         r5,             [r0 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0
    lea         r5,             [r5 + r1 * 4]
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0

; filter
    cmp         r3d, byte 0
    jz         .quit

    pxor        m4,        m4
    pshufb      m0,        m4
    pmovzxbw    m0,        m0
    mova        m1,        m0
    movu        m2,        m8
    movu        m3,        m9

    pshufb      m2,        m4
    pmovzxbw    m2,        m2
    movhlps     m4,        m3
    pmovzxbw    m3,        m3
    pmovzxbw    m4,        m4
    psubw       m3,        m2
    psubw       m4,        m2
    psraw       m3,        1
    psraw       m4,        1
    paddw       m0,        m3
    paddw       m1,        m4
    packuswb    m0,        m1

    pextrb      [r0],           m0, 0
    pextrb      [r0 + r1],      m0, 1
    pextrb      [r0 + r1 * 2],  m0, 2
    pextrb      [r0 + r4],      m0, 3
    lea         r5,             [r0 + r1 * 4]
    pextrb      [r5],           m0, 4
    pextrb      [r5 + r1],      m0, 5
    pextrb      [r5 + r1 * 2],  m0, 6
    pextrb      [r5 + r4],      m0, 7
    lea         r5,             [r5 + r1 * 4]
    pextrb      [r5],           m0, 8
    pextrb      [r5 + r1],      m0, 9
    pextrb      [r5 + r1 * 2],  m0, 10
    pextrb      [r5 + r4],      m0, 11
    lea         r5,             [r5 + r1 * 4]
    pextrb      [r5],           m0, 12
    pextrb      [r5 + r1],      m0, 13
    pextrb      [r5 + r1 * 2],  m0, 14
    pextrb      [r5 + r4],      m0, 15

.quit:
    lea         r2, [r2 + 16]
    add         r0, 16
    dec         r6d
    jnz         .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
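    ; modes 27 through 33 have positive prediction angles, so no reference
    ; projection is needed: this and the following wrappers simply loop the
    ; shared MODE_9_27 .. MODE_3_33 macros over four 8-wide strips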
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_9_27 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_8_28 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_7_29 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_6_30 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    lea         r3,        [ang_table + 16 * 16]
    mov         r4d,       4
    lea         r5,        [r1 * 3]
    mov         r6,        r0
    mova        m7,        [pw_1024]
.loop:
    MODE_5_31 0
    add         r6,        8
    mov         r0,        r6
    add         r2,        8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    lea         r3,     [ang_table + 16 * 16]
    mov         r4d,    4
    lea         r5,     [r1 * 3]
    mov         r6,     r0
    mova        m7,     [pw_1024]
.loop:
    MODE_4_32 0
    add         r6,      8
    mov         r0,     r6
    add         r2,     8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    lea         r3,    [ang_table + 16 * 16]
    mov         r4d,   4
    lea         r5,    [r1 * 3]
    mov         r6,    r0
    mova        m7,    [pw_1024]
.loop:
    MODE_3_33 0
    add         r6,    8
    mov         r0,    r6
    add         r2,    8
    dec         r4
    jnz        .loop
    RET

;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal all_angs_pred_4x4, 4, 4, 8
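; dest holds the 33 angular predictions back to back: mode m (2..34) occupies
; the 16 bytes at dest + (m - 2) * 16, four bytes per row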

; mode 2

movh      m0,         [r1 + 10]
movd      [r0],       m0

palignr   m1,         m0,      1
movd      [r0 + 4],   m1

palignr   m1,         m0,      2
movd      [r0 + 8],   m1

palignr   m1,         m0,      3
movd      [r0 + 12],  m1

; mode 3

mova          m2,        [pw_1024]

pslldq        m1,        m0,         1
pinsrb        m1,        [r1 + 9],   0
punpcklbw     m1,        m0

lea           r3,        [ang_table]
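; each pmaddubsw below weights an interleaved (ref[i], ref[i+1]) pair vector
; with the (32 - frac, frac) entry at ang_table + frac * 16; pmulhrsw with
; pw_1024 is the rounding shift by 5, and packuswb/movd stores the 4-pixel row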

pmaddubsw     m6,        m1,        [r3 + 26 * 16]
pmulhrsw      m6,        m2
packuswb      m6,        m6
movd          [r0 + 16], m6

palignr       m0,        m1,        2

mova          m7,        [r3 + 20 * 16]

pmaddubsw     m3,        m0,        m7
pmulhrsw      m3,        m2
packuswb      m3,        m3
movd          [r0 + 20], m3

; mode 6 [row 3]
movd          [r0 + 76], m3

palignr       m3,        m1,       4

pmaddubsw     m4,        m3,        [r3 + 14 * 16]
pmulhrsw      m4,        m2
packuswb      m4,        m4
movd          [r0 + 24], m4

palignr       m4,        m1,        6

pmaddubsw     m4,        [r3 + 8 * 16]
pmulhrsw      m4,        m2
packuswb      m4,        m4
movd          [r0 + 28], m4

; mode 4

pmaddubsw     m5,        m1,        [r3 + 21 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 32], m5

pmaddubsw     m5,        m0,        [r3 + 10 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 36], m5

pmaddubsw     m5,        m0,        [r3 + 31 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 40], m5

pmaddubsw     m4,        m3,        m7
pmulhrsw      m4,        m2
packuswb      m4,        m4
movd          [r0 + 44], m4

; mode 5

pmaddubsw     m5,        m1,        [r3 + 17 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 48], m5

pmaddubsw     m5,        m0,        [r3 + 2 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 52], m5

pmaddubsw     m5,        m0,        [r3 + 19 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 56], m5

pmaddubsw     m4,        m3,        [r3 + 4 * 16]
pmulhrsw      m4,        m2
packuswb      m4,        m4
movd          [r0 + 60], m4

; mode 6

pmaddubsw     m5,        m1,        [r3 + 13 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 64], m5

movd          [r0 + 68], m6

pmaddubsw     m5,        m0,        [r3 + 7 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 72], m5

; mode 7

pmaddubsw     m5,        m1,        [r3 + 9 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 80], m5

pmaddubsw     m5,        m1,        [r3 + 18 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 84], m5

pmaddubsw     m5,        m1,        [r3 + 27 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 88], m5

pmaddubsw     m5,        m0,        [r3 + 4 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 92], m5

; mode 8

pmaddubsw     m5,        m1,        [r3 + 5 * 16]
pmulhrsw      m5,        m2
packuswb      m5,        m5
movd          [r0 + 96], m5

pmaddubsw     m5,         m1,       [r3 + 10 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 100], m5

pmaddubsw     m5,         m1,        [r3 + 15 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 104], m5

pmaddubsw     m5,         m1,        [r3 + 20 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 108], m5

; mode 9

pmaddubsw     m5,         m1,        [r3 + 2 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 112], m5

pmaddubsw     m5,         m1,        [r3 + 4 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 116], m5

pmaddubsw     m5,         m1,        [r3 + 6 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 120], m5

pmaddubsw     m5,         m1,        [r3 + 8 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 124], m5

; mode 10
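; mode 10 (horizontal) is stored transposed: every stored row holds the four
; left neighbours, and the filtered first prediction row (left[1] plus half of
; the top-minus-corner difference) is written into the first stored column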

movd         m3,         [r1 + 9]
pshufd       m4,         m3,        0
movu         [r0 + 128], m4

pxor         m5,         m5
movd         m7,         [r1 + 1]
pshufd       m4,         m7,        0
punpcklbw    m4,         m5

pinsrb       m7,         [r1],      0
pshufb       m6,         m7,        m5
punpcklbw    m6,         m5

psubw        m4,         m6
psraw        m4,         1

pshufb       m6,         m3,       m5
punpcklbw    m6,         m5

paddw        m4,         m6
packuswb     m4,         m5

pextrb       [r0 + 128],  m4,    0
pextrb       [r0 + 132],  m4,    1
pextrb       [r0 + 136],  m4,    2
pextrb       [r0 + 140],  m4,    3

; mode 11

pslldq        m1,        m1,         2
pinsrb        m1,        [r1],       0
pinsrb        m1,        [r1 + 9],   1

pmaddubsw     m3,         m1,        [r3 + 30 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 144], m3

pmaddubsw     m3,         m1,        [r3 + 28 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 148], m3

pmaddubsw     m3,         m1,        [r3 + 26 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 152], m3

pmaddubsw     m3,         m1,        [r3 + 24 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 156], m3

; mode 12

pmaddubsw     m3,         m1,        [r3 + 27 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 160], m3

pmaddubsw     m3,         m1,        [r3 + 22 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 164], m3

pmaddubsw     m3,         m1,        [r3 + 17 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 168], m3

pmaddubsw     m3,         m1,        [r3 + 12 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 172], m3

; mode 13

pmaddubsw     m3,         m1,        [r3 + 23 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 176], m3

pmaddubsw     m3,         m1,        [r3 + 14 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 180], m3

pmaddubsw     m3,         m1,        [r3 + 5 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 184], m3

pslldq        m5,         m1,        2
pinsrb        m5,         [r1 + 0],  1
pinsrb        m5,         [r1 + 4],  0

pmaddubsw     m4,         m5,        [r3 + 28 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 188], m4

; mode 14

pmaddubsw     m4,         m1,        [r3 + 19 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 192], m4

pmaddubsw     m7,         m1,        [r3 + 6 * 16]
pmulhrsw      m7,         m2
packuswb      m7,         m7
movd          [r0 + 196], m7

pinsrb        m5,         [r1 + 2],  0

pmaddubsw     m4,         m5,        [r3 + 25 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 200], m4

pmaddubsw     m4,         m5,        [r3 + 12 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 204], m4

; mode 15

pmaddubsw     m4,         m1,        [r3 + 15 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 208], m4

pmaddubsw     m4,         m5,        [r3 + 30 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 212], m4

pmaddubsw     m4,         m5,        [r3 + 13 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 216], m4

pslldq        m4,         m5,         2
pinsrb        m4,         [r1 + 2],   1
pinsrb        m4,         [r1 + 4],   0

pmaddubsw     m6,         m4,         [r3 + 28 * 16]
pmulhrsw      m6,         m2
packuswb      m6,         m6
movd          [r0 + 220], m6

; mode 16

pmaddubsw     m6,         m1,        [r3 + 11 * 16]
pmulhrsw      m6,         m2
packuswb      m6,         m6
movd          [r0 + 224], m6

pmaddubsw     m6,         m5,        [r3 + 22 * 16]
pmulhrsw      m6,         m2
packuswb      m6,         m6
movd          [r0 + 228], m6

pmaddubsw     m6,         m5,        [r3 + 1 * 16]
pmulhrsw      m6,         m2
packuswb      m6,         m6
movd          [r0 + 232], m6

pinsrb        m4,         [r1 + 3],  0

pmaddubsw     m4,         [r3 + 12 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 236], m4

; mode 17

movd          [r0 + 240],  m7

pslldq        m1,         2
pinsrb        m1,         [r1 + 1],  0
pinsrb        m1,         [r1 + 0],  1

pmaddubsw     m3,         m1,        [r3 + 12 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 244], m3

pslldq        m1,         2
pinsrb        m1,         [r1 + 1],  1
pinsrb        m1,         [r1 + 2],  0

pmaddubsw     m3,         m1,        [r3 + 18 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 248], m3

pslldq        m1,         2
pinsrb        m1,         [r1 + 2],  1
pinsrb        m1,         [r1 + 4],  0

pmaddubsw     m1,         [r3 + 24 * 16]
pmulhrsw      m1,         m2
packuswb      m1,         m1
movd          [r0 + 252], m1

; mode 18

movh          m1,         [r1]
movd          [r0 + 256], m1

pslldq        m3,         m1,         1
pinsrb        m3,         [r1 + 9],   0
movd          [r0 + 260], m3

pslldq        m4,         m3,         1
pinsrb        m4,         [r1 + 10],  0
movd          [r0 + 264], m4

pslldq        m4,         1
pinsrb        m4,         [r1 + 11],  0
movd          [r0 + 268], m4

; mode 19

palignr       m3,         m1,        1
punpcklbw     m1,         m3

pmaddubsw     m7,         m1,        [r3 + 6 * 16]
pmulhrsw      m7,         m2
packuswb      m7,         m7
movd          [r0 + 272], m7

pslldq        m3,         m1,         2
pinsrb        m3,         [r1],       1
pinsrb        m3,         [r1 + 9],   0

pmaddubsw     m4,         m3,         [r3 + 12 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 276], m4

pslldq        m4,         m3,         2
pinsrb        m4,         [r1 + 9],   1
pinsrb        m4,         [r1 + 10],  0

pmaddubsw     m5,         m4,         [r3 + 18 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 280], m5

pslldq        m4,         2
pinsrb        m4,         [r1 + 10],  1
pinsrb        m4,         [r1 + 12],  0

pmaddubsw     m4,         [r3 + 24 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 284], m4

; mode 20

pmaddubsw     m4,         m1,        [r3 + 11 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 288], m4

pinsrb        m3,         [r1 + 10],  0

pmaddubsw     m4,         m3,        [r3 + 22 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 292], m4

pmaddubsw     m4,         m3,        [r3 + 1 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 296], m4

pslldq        m6,         m3,        2
pinsrb        m6,         [r1 + 10], 1
pinsrb        m6,         [r1 + 11], 0

pmaddubsw     m5,         m6,        [r3 + 12 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 300], m5

; mode 21

pmaddubsw     m4,         m1,        [r3 + 15 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 304], m4

pmaddubsw     m4,         m3,        [r3 + 30 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 308], m4

pmaddubsw     m4,         m3,        [r3 + 13 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 312], m4

pinsrb        m6,         [r1 + 12],   0

pmaddubsw     m6,         [r3 + 28 * 16]
pmulhrsw      m6,         m2
packuswb      m6,         m6
movd          [r0 + 316], m6

; mode 22

pmaddubsw     m4,         m1,         [r3 + 19 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 320], m4

movd          [r0 + 324], m7

pmaddubsw     m4,         m3,        [r3 + 25 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 328], m4

pmaddubsw     m4,         m3,         [r3 + 12 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 332], m4

; mode 23

pmaddubsw     m4,         m1,         [r3 + 23 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 336], m4

pmaddubsw     m4,         m1,         [r3 + 14 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 340], m4

pmaddubsw     m4,         m1,         [r3 + 5 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 344], m4

pinsrb         m3,        [r1 + 12],   0

pmaddubsw     m3,         [r3 + 28 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 348], m3

; mode 24

pmaddubsw     m3,         m1,         [r3 + 27 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 352], m3

pmaddubsw     m3,         m1,         [r3 + 22 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 356], m3

pmaddubsw     m3,         m1,         [r3 + 17 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 360], m3

pmaddubsw     m3,         m1,         [r3 + 12 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 364], m3

; mode 25

pmaddubsw     m3,         m1,         [r3 + 30 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 368], m3

pmaddubsw     m3,         m1,         [r3 + 28 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 372], m3

pmaddubsw     m3,         m1,         [r3 + 26 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 376], m3

pmaddubsw     m1,         [r3 + 24 * 16]
pmulhrsw      m1,         m2
packuswb      m1,         m1
movd          [r0 + 380], m1

; mode 26
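; mode 26 (vertical): every row is the four samples above the block; the first
; column is then refined with half of the (left - corner) difference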

movh         m1,         [r1 + 1]
pshufd       m3,         m1,        0
movu         [r0 + 384], m3

pxor         m4,         m4
movd         m5,         [r1 + 9]
pshufd       m5,         m5,        0
punpcklbw    m5,         m4

pinsrb       m6,         [r1],      0
pshufb       m6,         m4
punpcklbw    m6,         m4

psubw        m5,         m6
psraw        m5,         1

pshufb       m6,         m1,        m4
punpcklbw    m6,         m4

paddw        m5,         m6
packuswb     m5,         m4

pextrb       [r0 + 384], m5,    0
pextrb       [r0 + 388], m5,    1
pextrb       [r0 + 392], m5,    2
pextrb       [r0 + 396], m5,    3

; mode 27

palignr       m3,         m1,     1
punpcklbw     m1,         m3

pmaddubsw     m3,         m1,     [r3 + 2 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 400], m3

pmaddubsw     m3,         m1,     [r3 + 4 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 404], m3

pmaddubsw     m3,         m1,     [r3 + 6 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 408], m3

pmaddubsw     m3,         m1,     [r3 + 8 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 412], m3

; mode 28

pmaddubsw     m3,         m1,     [r3 + 5 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 416], m3

pmaddubsw     m3,         m1,     [r3 + 10 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 420], m3

pmaddubsw     m3,         m1,     [r3 + 15 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 424], m3

pmaddubsw     m3,         m1,     [r3 + 20 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 428], m3

; mode 29

pmaddubsw     m3,         m1,     [r3 + 9 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 432], m3

pmaddubsw     m3,         m1,     [r3 + 18 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 436], m3

pmaddubsw     m3,         m1,     [r3 + 27 * 16]
pmulhrsw      m3,         m2
packuswb      m3,         m3
movd          [r0 + 440], m3

palignr       m3,         m1,     2

pmaddubsw     m4,         m3,     [r3 + 4 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 444], m4

; mode 30

pmaddubsw     m4,         m1,     [r3 + 13 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 448], m4

pmaddubsw     m7,         m1,     [r3 + 26 * 16]
pmulhrsw      m7,         m2
packuswb      m7,         m7
movd          [r0 + 452], m7

pmaddubsw     m5,         m3,     [r3 + 7 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 456], m5

pmaddubsw     m6,         m3,     [r3 + 20 * 16]
pmulhrsw      m6,         m2
packuswb      m6,         m6
movd          [r0 + 460], m6

; mode 31

pmaddubsw     m4,         m1,     [r3 + 17 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 464], m4

pmaddubsw     m5,         m3,     [r3 + 2 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 468], m5

pmaddubsw     m5,         m3,     [r3 + 19 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 472], m5

palignr       m4,         m3,     2

pmaddubsw     m5,         m4,     [r3 + 4 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 476], m5

; mode 32

pmaddubsw     m5,         m1,     [r3 + 21 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 480], m5

pmaddubsw     m5,         m3,     [r3 + 10 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 484], m5

pmaddubsw     m5,         m3,     [r3 + 31 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 488], m5

pmaddubsw     m5,         m4,     [r3 + 20 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 492], m5

; mode 33

movd          [r0 + 496], m7

movd          [r0 + 500], m6

pmaddubsw     m5,         m4,         [r3 + 14 * 16]
pmulhrsw      m5,         m2
packuswb      m5,         m5
movd          [r0 + 504], m5

psrldq        m4,         2

pmaddubsw     m4,         [r3 + 8 * 16]
pmulhrsw      m4,         m2
packuswb      m4,         m4
movd          [r0 + 508], m4

; mode 34

movh      m7,             [r1 + 2]
movd      [r0 + 512],     m7

psrldq    m7,      1
movd      [r0 + 516],     m7

psrldq    m7,      1
movd      [r0 + 520],     m7

psrldq    m7,      1
movd      [r0 + 524],     m7

RET

;------------------------------------------------------------------------------
; void all_angs_pred_8x8(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal all_angs_pred_8x8, 3,4,8
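    ; same output layout as the 4x4 version: mode m (2..34) occupies the 64
    ; bytes at dest + (m - 2) * 64, eight bytes per row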
    ; mode 2

    movu         m0,          [r2 + 18]
    palignr      m1,          m0,          1
    punpcklqdq   m2,          m0,          m1
    movu         [r0],        m2

    palignr      m1,          m0,          2
    palignr      m2,          m0,          3
    punpcklqdq   m1,          m2
    movu         [r0 + 16],   m1

    palignr      m1,          m0,          4
    palignr      m2,          m0,          5
    punpcklqdq   m1,          m2
    movu         [r0 + 32],   m1

    palignr      m1,          m0,          6
    palignr      m2,          m0,          7
    punpcklqdq   m1,          m2
    movu         [r0 + 48],   m1

    ; mode 3 [row 0, 1]
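    ; as in the 4x4 version, pmaddubsw weights interleaved (ref[i], ref[i+1])
    ; pairs with ang_table fractions and pmulhrsw/pw_1024 does the rounding
    ; shift by 5; here two 8-pixel rows are packed into each 16-byte store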

    mova          m7,         [pw_1024]
    lea           r3,         [ang_table]

    movu          m0,         [r1 + 17]

    palignr       m1,         m0,               1
    palignr       m2,         m0,               2

    punpcklbw     m3,         m0,               m1
    pmaddubsw     m4,         m3,               [r3 + 26 * 16]
    pmulhrsw      m4,         m7

    punpcklbw     m1,         m2
    pmaddubsw     m5,         m1,               [r3 + 20 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5

    movu          [r0 + 64],  m4

    ; mode 6 [row 1]

    movh          [r0 + 264], m4

    ; mode 6 [row 3]

    movhps        [r0 + 280], m4

    ; mode 4 [row 0, 1]

    pmaddubsw     m4,         m3,               [r3 + 21 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 10 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 128], m4

    ; mode 5 [row 0, 1]

    pmaddubsw     m4,         m3,               [r3 + 17 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 2 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 192], m4

    ; mode 6 [row 0]

    pmaddubsw     m4,         m3,               [r3 + 13 * 16]
    pmulhrsw      m4,         m7

    pxor          m5,         m5

    packuswb      m4,         m5
    movh          [r0 + 256], m4

    ; mode 7 [row 0, 1]

    pmaddubsw     m4,         m3,               [r3 + 9 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 18 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 320], m4

    ; mode 8 [row 0, 1]

    pmaddubsw     m4,         m3,               [r3 + 5 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 10 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 384], m4

    ; mode 8 [row 2, 3]

    pmaddubsw     m4,         m3,               [r3 + 15 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 20 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 400], m4

    ; mode 8 [row 4, 5]

    pmaddubsw     m4,         m3,               [r3 + 25 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 30 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 416], m4

    ; mode 8 [row 6, 7]

    pmaddubsw     m4,         m1,               [r3 + 3 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 8 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 432], m4

    ; mode 9 [row 0, 1]

    pmaddubsw     m4,         m3,               [r3 + 2 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 4 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 448], m4

    ; mode 9 [row 2, 3]

    pmaddubsw     m4,         m3,               [r3 + 6 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 8 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 464], m4

    ; mode 9 [row 4, 5]

    pmaddubsw     m4,         m3,               [r3 + 10 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 12 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 480], m4

    ; mode 9 [row 6, 7]

    pmaddubsw     m4,         m3,               [r3 + 14 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 16 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 496], m4

    ; mode 7 [row 2, 3]

    pmaddubsw     m4,         m3,               [r3 + 27 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 4 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 336], m4

    ; mode 7 [row 4, 5]

    pmaddubsw     m4,         m1,               [r3 + 13 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 22 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 352], m4

    ; mode 6 [row 2]

    pmaddubsw     m4,         m1,               [r3 + 7 * 16]
    pmulhrsw      m4,         m7

    pxor           m5,         m5

    packuswb      m4,         m5
    movh          [r0 + 272], m4

    ; mode 3 [row 2, 3]

    palignr       m1,         m0,               3
    palignr       m3,         m0,               4

    punpcklbw     m2,         m1
    pmaddubsw     m5,         m2,               [r3 + 14 * 16]
    pmulhrsw      m5,         m7

    punpcklbw     m1,         m3
    pmaddubsw     m6,         m1,               [r3 + 8 * 16]
    pmulhrsw      m6,         m7

    packuswb      m5,         m6
    movu          [r0 + 80],  m5

    ; mode 6 [row 7]

    movhps        [r0 + 312], m5

    ; mode 6 [row 5]

    movh          [r0 + 296], m5

    ; mode 4 [calculate and store row 4, 5]

    pmaddubsw     m4,         m1,               [r3 + 9 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 30 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 160], m4

    ; mode 5 [row 4, 5]

    pmaddubsw     m4,         m2,               [r3 + 21 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m1,               [r3 + 6 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 224], m4

    ; mode 6 [row 4, 5]

    pmaddubsw     m5,         m2,               [r3 + 1 * 16]
    pmulhrsw      m5,         m7

    pxor           m6,        m6

    packuswb      m5,         m6
    movh          [r0 + 288], m5

    ; mode 6 [row 6, 7]

    pmaddubsw     m5,         m2,               [r3 + 27 * 16]
    pmulhrsw      m5,         m7

    pxor          m6,         m6

    packuswb      m5,         m6
    movh          [r0 + 304], m5

    ; mode 5 [calculate row 6]

    pmaddubsw     m6,         m1,               [r3 + 23 * 16]
    pmulhrsw      m6,         m7

    ; mode 3 [row 4, 5]

    palignr       m1,         m0,               5

    punpcklbw     m3,         m1
    pmaddubsw     m4,         m3,               [r3 + 2 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m3,               [r3 + 28 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 96],  m4

    ; mode 4 [calculate row 7]

    pmaddubsw     m5,         m3,               [r3 + 19 * 16]
    pmulhrsw      m5,         m7

    ; mode 5 [calculate row 6]

    pmaddubsw     m4,         m3,               [r3 + 8 * 16]
    pmulhrsw      m4,         m7

    packuswb      m6,         m4
    movu          [r0 + 240], m6

    ; mode 3 [row 6, 7]

    palignr       m2,         m0,               6
    palignr       m3,         m0,               7

    punpcklbw     m1,         m2
    pmaddubsw     m4,         m1,               [r3 + 22 * 16]
    pmulhrsw      m4,         m7

    punpcklbw     m2,         m3
    pmaddubsw     m2,         [r3 + 16 * 16]
    pmulhrsw      m2,         m7

    packuswb      m4,         m2
    movu          [r0 + 112], m4

    ; mode 4 [calculate row 7]

    pmaddubsw     m2,         m1,               [r3 + 8 * 16]
    pmulhrsw      m2,         m7

    ; mode 4 [store row 6 and 7]

    packuswb      m5,         m2
    movu          [r0 + 176], m5

    ; mode 4 [row 2, 3]

    palignr       m1,         m0,               1
    palignr       m2,         m0,               2
    palignr       m3,         m0,               3

    punpcklbw     m1,         m2
    pmaddubsw     m4,         m1,               [r3 + 31 * 16]
    pmulhrsw      m4,         m7

    punpcklbw     m2,         m3
    pmaddubsw     m5,         m2,               [r3 + 20 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 144], m4

    ; mode 5 [row 2, 3]

    pmaddubsw     m4,         m1,               [r3 + 19 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m2,               [r3 + 4 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 208], m4

    ; mode 7 [row 6, 7]

    pmaddubsw     m4,         m1,               [r3 + 31 * 16]
    pmulhrsw      m4,         m7

    pmaddubsw     m5,         m2,               [r3 + 8 * 16]
    pmulhrsw      m5,         m7

    packuswb      m4,         m5
    movu          [r0 + 368], m4

    ; mode 10
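    ; mode 10 (horizontal), stored transposed as in the 4x4 version: every row
    ; is left[1..8], and the filtered first prediction row is patched into the
    ; first stored column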

    pshufb       m1,          m0,          [tab_Si]
    movu         [r0 + 512],  m1
    movu         [r0 + 528],  m1
    movu         [r0 + 544],  m1
    movu         [r0 + 560],  m1

    pxor         m0,          m0

    pshufb       m1,          m1,          m0
    punpcklbw    m1,          m0

    movu         m2,          [r1]

    pshufb       m3,          m2,          m0
    punpcklbw    m3,          m0

    psrldq       m4,          m2,          1
    punpcklbw    m4,          m0

    movu         m2,          [r1 + 9]
    punpcklbw    m2,          m0

    psubw        m4,          m3
    psubw        m2,          m3

    psraw        m4,          1
    psraw        m2,          1

    paddw        m4,          m1
    paddw        m2,          m1

    packuswb     m4,          m2

    pextrb       [r0 + 512],  m4,          0
    pextrb       [r0 + 520],  m4,          1
    pextrb       [r0 + 528],  m4,          2
    pextrb       [r0 + 536],  m4,          3
    pextrb       [r0 + 544],  m4,          4
    pextrb       [r0 + 552],  m4,          5
    pextrb       [r0 + 560],  m4,          6
    pextrb       [r0 + 568],  m4,          7

    ; mode 11 [row 0, 1]
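    ; for the negative-angle modes from here on (11-17), the interleaved
    ; reference vector is shifted up one sample pair with pslldq whenever the
    ; integer offset steps back, and the next projected above-reference sample
    ; is inserted at the bottom with pinsrb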

    movu         m0,         [r1 + 16]
    pinsrb       m0,         [r1], 0
    palignr      m1,         m0,          1
    punpcklbw    m2,         m0,          m1

    pmaddubsw    m3,         m2,          [r3 + 30 * 16]
    pmulhrsw     m3,         m7

    pmaddubsw    m4,         m2,          [r3 + 28 * 16]
    pmulhrsw     m4,         m7

    packuswb     m3,         m4
    movu         [r0 + 576], m3

    ; mode 11 [row 2, 3]

    pmaddubsw    m3,         m2,          [r3 + 26 * 16]
    pmulhrsw     m3,         m7

    pmaddubsw    m4,         m2,          [r3 + 24 * 16]
    pmulhrsw     m4,         m7

    packuswb     m3,         m4
    movu         [r0 + 592], m3

    ; mode 11 [row 4, 5]

    pmaddubsw    m3,         m2,          [r3 + 22 * 16]
    pmulhrsw     m3,         m7

    pmaddubsw    m4,         m2,          [r3 + 20 * 16]
    pmulhrsw     m4,         m7

    packuswb     m5,         m3,         m4
    movu         [r0 + 608], m5

    ; mode 12 [row 0, 1]

    pmaddubsw    m4,         m2,          [r3 + 27 * 16]
    pmulhrsw     m4,         m7

    packuswb     m4,         m3
    movu         [r0 + 640], m4

    ; mode 11 [row 6, 7]

    pmaddubsw    m3,         m2,          [r3 + 18 * 16]
    pmulhrsw     m3,         m7

    pmaddubsw    m4,         m2,          [r3 + 16 * 16]
    pmulhrsw     m4,         m7

    packuswb     m3,         m4
    movu         [r0 + 624], m3

    ; mode 12 [row 2, 3]

    pmaddubsw    m3,         m2,          [r3 + 17 * 16]
    pmulhrsw     m3,         m7

    pmaddubsw    m4,         m2,          [r3 + 12 * 16]
    pmulhrsw     m4,         m7

    packuswb     m3,         m4
    movu         [r0 + 656], m3

    ; mode 12 [row 4, 5]

    pmaddubsw    m3,         m2,          [r3 + 7 * 16]
    pmulhrsw     m3,         m7

    pmaddubsw    m4,         m2,          [r3 + 2 * 16]
    pmulhrsw     m4,         m7

    packuswb     m3,         m4
    movu         [r0 + 672], m3

    ; mode 12 [row 6, 7]

    pslldq       m3,         m2,          2
    pinsrb       m3,         [r1 + 0],    1
    pinsrb       m3,         [r1 + 6],    0

    pmaddubsw    m4,         m3,          [r3 + 29 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m3,          [r3 + 24 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 688], m4

    ; mode 13 [row 0, 1]

    pmaddubsw    m4,         m2,          [r3 + 23 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m2,          [r3 + 14 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 704], m4

    ; mode 13 [row 2, 3]

    pmaddubsw    m4,         m2,          [r3 + 5 * 16]
    pmulhrsw     m4,         m7

    pinsrb       m3,         [r1 + 4],    0
    pmaddubsw    m5,         m3,          [r3 + 28 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 720], m4

    ; mode 13 [row 4, 5]

    pmaddubsw    m4,         m3,          [r3 + 19 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m3,          [r3 + 10 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 736], m4

    ; mode 13 [row 6, 7]

    pmaddubsw    m4,         m3,          [r3 + 1 * 16]
    pmulhrsw     m4,         m7

    pslldq       m5,         m3,          2
    pinsrb       m5,         [r1 + 4],    1
    pinsrb       m5,         [r1 + 7],    0

    pmaddubsw    m5,         [r3 + 24 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 752], m4

    ; mode 14 [row 0, 1]

    pmaddubsw    m4,         m2,          [r3 + 19 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m2,          [r3 + 6 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 768], m4

    ; mode 14 [row 2, 3]

    pinsrb       m3,         [r1 + 2],    0

    pmaddubsw    m4,         m3,          [r3 + 25 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m3,          [r3 + 12 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 784], m4

    ; mode 14 [row 4, 5]

    pslldq       m1,         m3,          2
    pinsrb       m1,         [r1 + 2],    1
    pinsrb       m1,         [r1 + 5],    0

    pmaddubsw    m4,         m1,          [r3 + 31 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m1,          [r3 + 18 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 800], m4

    ; mode 14 [row 6, 7]

    pmaddubsw    m4,         m1,          [r3 + 5 * 16]
    pmulhrsw     m4,         m7

    pslldq       m1,         2
    pinsrb       m1,         [r1 + 5],    1
    pinsrb       m1,         [r1 + 7],    0

    pmaddubsw    m5,         m1,          [r3 + 24 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 816], m4

    ; mode 15 [row 0, 1]

    pmaddubsw    m4,         m2,          [r3 + 15 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m3,          [r3 + 30 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 832], m4

    ; mode 15 [row 2, 3]

    pmaddubsw    m4,         m3,          [r3 + 13 * 16]
    pmulhrsw     m4,         m7

    pslldq       m1,         m3,          2
    pinsrb       m1,         [r1 + 2],    1
    pinsrb       m1,         [r1 + 4],    0

    pmaddubsw    m5,         m1,          [r3 + 28 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 848], m4

    ; mode 15 [row 4, 5]

    pmaddubsw    m4,         m1,          [r3 + 11 * 16]
    pmulhrsw     m4,         m7

    pslldq       m1,         2
    pinsrb       m1,         [r1 + 4],    1
    pinsrb       m1,         [r1 + 6],    0

    pmaddubsw    m5,         m1,          [r3 + 26 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 864], m4

    ; mode 15 [row 6, 7]

    pmaddubsw    m4,         m1,          [r3 + 9 * 16]
    pmulhrsw     m4,         m7

    pslldq       m1,         2
    pinsrb       m1,         [r1 + 6],    1
    pinsrb       m1,         [r1 + 8],    0

    pmaddubsw    m1,          [r3 + 24 * 16]
    pmulhrsw     m1,         m7

    packuswb     m4,         m1
    movu         [r0 + 880], m4

    ; mode 16 [row 0, 1]

    pmaddubsw    m4,         m2,          [r3 + 11 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m3,          [r3 + 22 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 896], m4

    ; mode 16 [row 2, 3]

    pmaddubsw    m4,         m3,          [r3 + 1 * 16]
    pmulhrsw     m4,         m7

    pslldq       m3,         2
    pinsrb       m3,         [r1 + 2],    1
    pinsrb       m3,         [r1 + 3],    0

    pmaddubsw    m5,         m3,          [r3 + 12 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 912], m4

    ; mode 16 [row 4, 5]

    pslldq       m3,         2
    pinsrb       m3,         [r1 + 3],    1
    pinsrb       m3,         [r1 + 5],    0

    pmaddubsw    m4,         m3,          [r3 + 23 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m5,         m3,          [r3 + 2 * 16]
    pmulhrsw     m5,         m7

    packuswb     m4,         m5
    movu         [r0 + 928], m4

    ; mode 16 [row 6, 7]

    pslldq       m3,         2
    pinsrb       m3,         [r1 + 5],    1
    pinsrb       m3,         [r1 + 6],    0

    pmaddubsw    m4,         m3,          [r3 + 13 * 16]
    pmulhrsw     m4,         m7

    pslldq       m3,         2
    pinsrb       m3,         [r1 + 6],    1
    pinsrb       m3,         [r1 + 8],    0

    pmaddubsw    m3,         [r3 + 24 * 16]
    pmulhrsw     m3,         m7

    packuswb     m4,         m3
    movu         [r0 + 944], m4

    ; mode 17 [row 0, 1]

    pmaddubsw    m4,         m2,          [r3 + 6 * 16]
    pmulhrsw     m4,         m7

    pslldq       m2,         2
    pinsrb       m2,         [r1 + 0],    1
    pinsrb       m2,         [r1 + 1],    0

    pmaddubsw    m3,         m2,          [r3 + 12 * 16]
    pmulhrsw     m3,         m7

    packuswb     m4,         m3
    movu         [r0 + 960], m4

    ; mode 17 [row 2, 3]

    pslldq       m2,         2
    pinsrb       m2,         [r1 + 1],    1
    pinsrb       m2,         [r1 + 2],    0

    pmaddubsw    m4,         m2,          [r3 + 18 * 16]
    pmulhrsw     m4,         m7

    pslldq       m2,         2
    pinsrb       m2,         [r1 + 2],    1
    pinsrb       m2,         [r1 + 4],    0

    pmaddubsw    m3,         m2,          [r3 + 24 * 16]
    pmulhrsw     m3,         m7

    packuswb     m4,         m3
    movu         [r0 + 976], m4

    ; mode 17 [row 4, 5]

    pslldq       m2,         2
    pinsrb       m2,         [r1 + 4],    1
    pinsrb       m2,         [r1 + 5],    0

    pmaddubsw    m4,         m2,          [r3 + 30 * 16]
    pmulhrsw     m4,         m7

    pmaddubsw    m3,         m2,          [r3 + 4 * 16]
    pmulhrsw     m3,         m7

    packuswb     m4,         m3
    movu         [r0 + 992], m4

    ; mode 17 [row 6, 7]

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 5],    1
    pinsrb       m2,          [r1 + 6],    0

    pmaddubsw    m4,          m2,          [r3 + 10 * 16]
    pmulhrsw     m4,          m7

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 6],    1
    pinsrb       m2,          [r1 + 7],    0

    pmaddubsw    m3,          m2,          [r3 + 16 * 16]
    pmulhrsw     m3,          m7

    packuswb     m4,          m3
    movu         [r0 + 1008], m4

    ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
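    ; (45-degree diagonal: each successive row is the previous one shifted right by one
    ;  byte with the next sample of the second reference half ([r2 + k + 16]) inserted at
    ;  the front; punpcklqdq packs two 8-byte rows into each 16-byte store)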

    movh          m1,          [r2]

    pslldq        m2,          m1,         1
    pinsrb        m2,          [r2 + 1 + 16],   0
    punpcklqdq    m1,          m2
    movu          [r0 + 1024], m1

    pslldq        m2,          1
    pinsrb        m2,          [r2 + 2 + 16],   0

    pslldq        m0,          m2,          1
    pinsrb        m0,          [r2 + 3 + 16],   0
    punpcklqdq    m2,          m0
    movu          [r0 + 1040], m2

    pslldq        m0,          1
    pinsrb        m0,          [r2 + 4 + 16],   0

    pslldq        m2,          m0,              1
    pinsrb        m2,          [r2 + 5 + 16],   0
    punpcklqdq    m0,          m2
    movu          [r0 + 1056], m0

    pslldq        m2,          1
    pinsrb        m2,          [r2 + 6 + 16],   0

    pslldq        m0,           m2,             1
    pinsrb        m0,          [r2 + 7 + 16],   0
    punpcklqdq    m2,          m0
    movu          [r0 + 1072], m2

    ; mode 19 [row 0, 1]

    movu         m0,          [r1]
    palignr      m1,          m0,          1
    punpcklbw    m0,          m1

    pmaddubsw    m1,          m0,          [r3 + 6 * 16]
    pmulhrsw     m1,          m7

    pslldq       m2,          m0,          2
    pinsrb       m2,          [r1],        1
    pinsrb       m2,          [r1 + 1 + 16], 0

    pmaddubsw    m3,          m2,          [r3 + 12 * 16]
    pmulhrsw     m3,          m7

    packuswb     m1,          m3
    movu         [r0 + 1088], m1
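    ; note: modes 19-25 reuse the same fraction sequences as modes 17-11 (the angle
    ; magnitudes match); only the sources differ - the interleave is built from [r1] and
    ; the samples pulled in with pinsrb come from the other reference half at [r1 + k + 16].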

    ; mode 19 [row 2, 3]

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 1 + 16], 1
    pinsrb       m2,          [r1 + 2 + 16], 0

    pmaddubsw    m4,          m2,          [r3 + 18 * 16]
    pmulhrsw     m4,          m7

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 2 + 16],    1
    pinsrb       m2,          [r1 + 4 + 16],    0

    pmaddubsw    m5,          m2,          [r3 + 24 * 16]
    pmulhrsw     m5,          m7

    packuswb     m4,          m5
    movu         [r0 + 1104], m4

    ; mode 19 [row 4, 5]

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 4 + 16], 1
    pinsrb       m2,          [r1 + 5 + 16], 0

    pmaddubsw    m4,          m2,          [r3 + 30 * 16]
    pmulhrsw     m4,          m7

    pmaddubsw    m5,          m2,          [r3 + 4 * 16]
    pmulhrsw     m5,          m7

    packuswb     m4,          m5
    movu         [r0 + 1120], m4

    ; mode 19 [row 6, 7]

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 5 + 16], 1
    pinsrb       m2,          [r1 + 6 + 16], 0

    pmaddubsw    m4,          m2,          [r3 + 10 * 16]
    pmulhrsw     m4,          m7

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 6 + 16], 1
    pinsrb       m2,          [r1 + 7 + 16], 0

    pmaddubsw    m2,          [r3 + 16 * 16]
    pmulhrsw     m2,          m7

    packuswb     m4,          m2
    movu         [r0 + 1136], m4

    ; mode 20 [row 0, 1]

    pmaddubsw    m3,          m0,          [r3 + 11 * 16]
    pmulhrsw     m3,          m7

    pslldq       m1,          m0,          2
    pinsrb       m1,          [r1 + 0],    1
    pinsrb       m1,          [r1 + 2 + 16], 0

    pmaddubsw    m4,          m1,          [r3 + 22 * 16]
    pmulhrsw     m4,          m7

    packuswb     m3,          m4
    movu         [r0 + 1152], m3

    ; mode 20 [row 2, 3]

    pmaddubsw    m3,          m1,          [r3 + 1 * 16]
    pmulhrsw     m3,          m7

    pslldq       m2,          m1,          2
    pinsrb       m2,          [r1 + 2 + 16], 1
    pinsrb       m2,          [r1 + 3 + 16], 0

    pmaddubsw    m4,          m2,          [r3 + 12 * 16]
    pmulhrsw     m4,          m7

    packuswb     m3,          m4
    movu         [r0 + 1168], m3

    ; mode 20 [row 4, 5]

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 3 + 16], 1
    pinsrb       m2,          [r1 + 5 + 16], 0

    pmaddubsw    m3,          m2,          [r3 + 23 * 16]
    pmulhrsw     m3,          m7

    pmaddubsw    m4,          m2,          [r3 + 2 * 16]
    pmulhrsw     m4,          m7

    packuswb     m3,          m4
    movu         [r0 + 1184], m3

    ; mode 20 [row 6, 7]

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 5 + 16], 1
    pinsrb       m2,          [r1 + 6 + 16], 0

    pmaddubsw    m3,          m2,          [r3 + 13 * 16]
    pmulhrsw     m3,          m7

    pslldq       m2,          2
    pinsrb       m2,          [r1 + 6 + 16], 1
    pinsrb       m2,          [r1 + 8 + 16], 0

    pmaddubsw    m4,          m2,          [r3 + 24 * 16]
    pmulhrsw     m4,          m7

    packuswb     m3,          m4
    movu         [r0 + 1200], m3

    ; mode 21 [row 0, 1]

    pmaddubsw    m2,          m0,          [r3 + 15 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m1,          [r3 + 30 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1216], m2

    ; mode 21 [row 2, 3]

    pmaddubsw    m2,          m1,          [r3 + 13 * 16]
    pmulhrsw     m2,          m7

    pslldq       m3,          m1,          2
    pinsrb       m3,          [r1 + 2 + 16], 1
    pinsrb       m3,          [r1 + 4 + 16], 0

    pmaddubsw    m4,          m3,          [r3 + 28 * 16]
    pmulhrsw     m4,          m7

    packuswb     m2,          m4
    movu         [r0 + 1232], m2

    ; mode 21 [row 4, 5]

    pmaddubsw    m2,          m3,          [r3 + 11 * 16]
    pmulhrsw     m2,          m7

    pslldq       m3,          2
    pinsrb       m3,          [r1 + 4 + 16], 1
    pinsrb       m3,          [r1 + 6 + 16], 0

    pmaddubsw    m4,          m3,          [r3 + 26 * 16]
    pmulhrsw     m4,          m7

    packuswb     m2,          m4
    movu         [r0 + 1248], m2

    ; mode 21 [row 6, 7]

    pmaddubsw    m2,          m3,          [r3 + 9 * 16]
    pmulhrsw     m2,          m7

    pslldq       m3,          2
    pinsrb       m3,          [r1 + 6 + 16], 1
    pinsrb       m3,          [r1 + 8 + 16], 0

    pmaddubsw    m4,          m3,          [r3 + 24 * 16]
    pmulhrsw     m4,          m7

    packuswb     m2,          m4
    movu         [r0 + 1264], m2

    ; mode 22 [row 0, 1]

    pmaddubsw    m2,          m0,          [r3 + 19 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m4,          m0,          [r3 + 6 * 16]
    pmulhrsw     m4,          m7

    packuswb     m2,          m4
    movu         [r0 + 1280], m2

    ; mode 22 [row 2, 3]

    pmaddubsw    m2,          m1,          [r3 + 25 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m1,          [r3 + 12 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1296], m2

    ; mode 22 [row 4, 5]

    pslldq       m1,          2
    pinsrb       m1,          [r1 + 5 + 16], 0
    pinsrb       m1,          [r1 + 2 + 16], 1

    pmaddubsw    m2,          m1,          [r3 + 31 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m1,          [r3 + 18 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1312], m2

    ; mode 22 [row 6, 7]

    pmaddubsw    m2,          m1,          [r3 + 5 * 16]
    pmulhrsw     m2,          m7

    pslldq       m1,          2
    pinsrb       m1,          [r1 + 5 + 16], 1
    pinsrb       m1,          [r1 + 7 + 16], 0

    pmaddubsw    m1,          [r3 + 24 * 16]
    pmulhrsw     m1,          m7

    packuswb     m2,          m1
    movu         [r0 + 1328], m2

    ; mode 23 [row 0, 1]

    pmaddubsw    m2,          m0,          [r3 + 23 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m0,          [r3 + 14 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1344], m2

    ; mode 23 [row 2, 3]

    pmaddubsw    m2,          m0,          [r3 + 5 * 16]
    pmulhrsw     m2,          m7

    pslldq       m1,          m0,          2
    pinsrb       m1,          [r1], 1
    pinsrb       m1,          [r1 + 4 + 16], 0

    pmaddubsw    m3,          m1,          [r3 + 28 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1360], m2

    ; mode 23 [row 4, 5]

    pmaddubsw    m2,          m1,          [r3 + 19 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m1,          [r3 + 10 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1376], m2

    ; mode 23 [row 6, 7]

    pmaddubsw    m2,          m1,          [r3 + 1 * 16]
    pmulhrsw     m2,          m7

    pslldq       m3,          m1,          2
    pinsrb       m3,          [r1 + 4 + 16], 1
    pinsrb       m3,          [r1 + 7 + 16], 0

    pmaddubsw    m3,          [r3 + 24 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1392], m2

    ; mode 24 [row 0, 1]

    pmaddubsw    m2,          m0,          [r3 + 27 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m5,          m0,          [r3 + 22 * 16]
    pmulhrsw     m5,          m7

    packuswb     m2,          m5
    movu         [r0 + 1408], m2

    ; mode 24 [row 2, 3]

    pmaddubsw    m2,          m0,          [r3 + 17 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m0,          [r3 + 12 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1424], m2

    ; mode 24 [row 4, 5]

    pmaddubsw    m2,          m0,          [r3 + 7 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m0,          [r3 + 2 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1440], m2

    ; mode 24 [row 6, 7]

    pinsrb       m1,          [r1 + 6 + 16], 0

    pmaddubsw    m2,          m1,          [r3 + 29 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m1,          [r3 + 24 * 16]
    pmulhrsw     m1,          m7

    packuswb     m2,          m1
    movu         [r0 + 1456], m2

    ; mode 25 [row 0, 1]

    pmaddubsw    m2,          m0,          [r3 + 30 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m1,          m0,          [r3 + 28 * 16]
    pmulhrsw     m1,          m7

    packuswb     m2,          m1
    movu         [r0 + 1472], m2

    ; mode 25 [row 2, 3]

    pmaddubsw    m2,          m0,          [r3 + 26 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m1,          m0,          [r3 + 24 * 16]
    pmulhrsw     m1,          m7

    packuswb     m2,          m1
    movu         [r0 + 1488], m2

    ; mode 25 [row 4, 5]

    pmaddubsw    m1,          m0,          [r3 + 20 * 16]
    pmulhrsw     m1,          m7

    packuswb     m5,          m1
    movu         [r0 + 1504], m5

    ; mode 25 [row 6, 7]

    pmaddubsw    m2,          m0,          [r3 + 18 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m1,          m0,          [r3 + 16 * 16]
    pmulhrsw     m1,          m7

    packuswb     m2,          m1
    movu         [r0 + 1520], m2

    ; mode 26

    movu         m0,          [r1 + 1]

    pshufb       m1,          m0,          [tab_Si]
    movu         [r0 + 1536], m1
    movu         [r0 + 1552], m1
    movu         [r0 + 1568], m1
    movu         [r0 + 1584], m1
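    ; the block below applies the vertical-mode boundary smoothing to column 0 only:
    ; pred[y][0] = clip(top[0] + ((left[y] - topLeft) >> 1)), computed in 16-bit lanes
    ; (psubw/psraw/paddw, clipped by packuswb) and written one byte per row with pextrb.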

    pxor         m5,          m5

    pshufb       m1,          m1,          m5
    punpcklbw    m1,          m5

    movu         m2,          [r1 + 16]
    pinsrb       m2,          [r1], 0

    pshufb       m3,          m2,          m5
    punpcklbw    m3,          m5

    psrldq       m4,          m2,          1
    punpcklbw    m4,          m5

    movu         m2,          [r1 + 9 + 16]
    punpcklbw    m2,          m5

    psubw        m4,          m3
    psubw        m2,          m3

    psraw        m4,          1
    psraw        m2,          1

    paddw        m4,          m1
    paddw        m2,          m1

    packuswb     m4,          m2

    pextrb       [r0 + 1536], m4,          0
    pextrb       [r0 + 1544], m4,          1
    pextrb       [r0 + 1552], m4,          2
    pextrb       [r0 + 1560], m4,          3
    pextrb       [r0 + 1568], m4,          4
    pextrb       [r0 + 1576], m4,          5
    pextrb       [r0 + 1584], m4,          6
    pextrb       [r0 + 1592], m4,          7

    ; mode 27 [row 0, 1]

    palignr      m6,          m0,          1
    punpcklbw    m4,          m0,          m6

    pmaddubsw    m1,          m4,          [r3 + 2 * 16]
    pmulhrsw     m1,          m7

    pmaddubsw    m2,          m4,          [r3 + 4 * 16]
    pmulhrsw     m2,          m7

    packuswb     m1,          m2
    movu         [r0 + 1600], m1
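    ; note: the positive-angle modes (27-34) read only the top reference; m4 holds the
    ; interleaved (ref[x], ref[x+1]) pairs for integer offset 0, and the larger offsets
    ; needed by later rows are built further down (m5, m1, m6) via palignr/punpcklbw.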

    ; mode 27 [row 2, 3]

    pmaddubsw    m1,          m4,          [r3 + 6 * 16]
    pmulhrsw     m1,          m7

    pmaddubsw    m2,          m4,          [r3 + 8 * 16]
    pmulhrsw     m2,          m7

    packuswb     m1,          m2
    movu         [r0 + 1616], m1

    ; mode 27 [row 4, 5]

    pmaddubsw    m3,          m4,          [r3 + 10 * 16]
    pmulhrsw     m3,          m7

    pmaddubsw    m2,          m4,          [r3 + 12 * 16]
    pmulhrsw     m2,          m7

    packuswb     m1,          m3,          m2
    movu         [r0 + 1632], m1

    ; mode 27 [row 6, 7]

    pmaddubsw    m1,          m4,          [r3 + 14 * 16]
    pmulhrsw     m1,          m7

    pmaddubsw    m2,          m4,          [r3 + 16 * 16]
    pmulhrsw     m2,          m7

    packuswb     m1,          m2
    movu         [r0 + 1648], m1

    ; mode 28 [row 0, 1]

    pmaddubsw    m1,          m4,          [r3 + 5 * 16]
    pmulhrsw     m1,          m7

    packuswb     m1,          m3
    movu         [r0 + 1664], m1

    ; mode 28 [row 2, 3]

    pmaddubsw    m1,          m4,          [r3 + 15 * 16]
    pmulhrsw     m1,          m7

    pmaddubsw    m2,          m4,          [r3 + 20 * 16]
    pmulhrsw     m2,          m7

    packuswb     m1,          m2
    movu         [r0 + 1680], m1

    ; mode 28 [row 4, 5]

    pmaddubsw    m1,          m4,          [r3 + 25 * 16]
    pmulhrsw     m1,          m7

    pmaddubsw    m2,          m4,          [r3 + 30 * 16]
    pmulhrsw     m2,          m7

    packuswb     m1,          m2
    movu         [r0 + 1696], m1

    ; mode 28 [row 6, 7]

    palignr      m1,          m0,          2
    punpcklbw    m5,          m6,          m1

    pmaddubsw    m2,          m5,          [r3 + 3 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m5,          [r3 + 8 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1712], m2

    ; mode 29 [row 0, 1]

    pmaddubsw    m2,          m4,          [r3 + 9 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m4,          [r3 + 18 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1728], m2

    ; mode 29 [row 2, 3]

    pmaddubsw    m2,          m4,          [r3 + 27 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m5,          [r3 + 4 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1744], m2

    ; mode 29 [row 4, 5]

    pmaddubsw    m2,          m5,          [r3 + 13 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m5,          [r3 + 22 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1760], m2

    ; mode 29 [row 6, 7]

    pmaddubsw    m2,          m5,          [r3 + 31 * 16]
    pmulhrsw     m2,          m7

    palignr      m6,          m0,          3
    punpcklbw    m1,          m6

    pmaddubsw    m3,          m1,          [r3 + 8 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1776], m2

    ; mode 32 [row 2]

    movh         [r0 + 1936], m2
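    ; (rows whose projections land on the same reference offset and fraction are computed
    ;  once and stored into every mode that needs them: mode 29 row 6 doubles as mode 32
    ;  row 2 here, and the movhps stores below feed mode 33 rows 1-3)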

    ; mode 30 [row 0, 1]

    pmaddubsw    m2,          m4,          [r3 + 13 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m4,          [r3 + 26 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1792], m2

    ; mode 30 [row 2, 3]

    pmaddubsw    m2,          m5,          [r3 + 7 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m5,          [r3 + 20 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1808], m2

    ; mode 33 [row 1]

    movhps       [r0 + 1992], m2

    ; mode 30 [row 4, 5]

    pmaddubsw    m2,          m1,          [r3 + 1 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m1,          [r3 + 14 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1824], m2

    ; mode 33 [row 2]

    movhps       [r0 + 2000], m2

    ; mode 30 [row 6, 7]

    pmaddubsw    m2,          m1,          [r3 + 27 * 16]
    pmulhrsw     m2,          m7

    psrldq       m0,          4
    punpcklbw    m6,          m0

    pmaddubsw    m3,          m6,          [r3 + 8 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1840], m2

    ; mode 33 [row 3]

    movhps       [r0 + 2008], m2

    ; mode 31 [row 0, 1]

    pmaddubsw    m2,          m4,          [r3 + 17 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m5,          [r3 + 2 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1856], m2

    ; mode 31 [row 2, 3]

    pmaddubsw    m2,          m5,          [r3 + 19 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m1,          [r3 + 4 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1872], m2

    ; mode 31 [row 4, 5]

    pmaddubsw    m2,          m1,          [r3 + 21 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m6,          [r3 + 6 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1888], m2

    ; mode 31 [row 6, 7]

    pmaddubsw    m2,          m6,          [r3 + 23 * 16]
    pmulhrsw     m2,          m7

    movu         m3,          [r1 + 6]
    punpcklbw    m0,          m3

    pmaddubsw    m3,          m0,          [r3 + 8 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1904], m2

    ; mode 32 [row 0, 1]

    pmaddubsw    m2,          m4,          [r3 + 21 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m5,          [r3 + 10 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1920], m2

    ; mode 32 [row 3]

    pmaddubsw    m2,          m1,          [r3 + 20 * 16]
    pmulhrsw     m2,          m7

    pxor         m3,          m3

    packuswb     m2,          m3
    movh         [r0 + 1944], m2

    ; mode 32 [row 4, 5]

    pmaddubsw    m2,          m6,          [r3 + 9 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m6,          [r3 + 30 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1952], m2

    ; mode 33 [row 4, 5]

    pmaddubsw    m2,          m0,          [r3 + 2 * 16]
    pmulhrsw     m2,          m7

    pmaddubsw    m3,          m0,          [r3 + 28 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 2016], m2

    ; mode 32 [row 6]

    pmaddubsw    m2,          m0,          [r3 + 19 * 16]
    pmulhrsw     m2,          m7

    ; mode 32 [row 7]

    movu         m0,          [r1 + 6]
    palignr      m3,          m0,          1
    punpcklbw    m0,          m3

    pmaddubsw    m3,          m0,          [r3 + 8 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 1968], m2

    ; mode 33 [row 6, 7]

    pmaddubsw    m2,          m0,          [r3 + 22 * 16]
    pmulhrsw     m2,          m7

    movu         m0,          [r1 + 7]
    palignr      m3,          m0,          1
    punpcklbw    m0,          m3

    pmaddubsw    m3,          m0,          [r3 + 16 * 16]
    pmulhrsw     m3,          m7

    packuswb     m2,          m3
    movu         [r0 + 2032], m2

    ; mode 33 [row 0]

    pmaddubsw    m2,          m4,          [r3 + 26 * 16]
    pmulhrsw     m2,          m7

    pxor         m3,          m3

    packuswb     m2,          m3
    movh         [r0 + 1984], m2

    ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
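    ; (pure +32 angle, no interpolation: each row is [r2 + 2] advanced by one more byte,
    ;  with punpcklqdq pairing two 8-byte rows per 16-byte store)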

    movu         m0,          [r2 + 2]
    palignr      m1,          m0,          1
    punpcklqdq   m2,          m0,          m1
    movu         [r0 + 2048], m2

    palignr      m1,          m0,          2
    palignr      m2,          m0,          3
    punpcklqdq   m1,          m2
    movu         [r0 + 2064], m1

    palignr      m1,          m0,          4
    palignr      m2,          m0,          5
    punpcklqdq   m1,          m2
    movu         [r0 + 2080], m1

    palignr      m1,          m0,          6
    palignr      m2,          m0,          7
    punpcklqdq   m1,          m2
    movu         [r0 + 2096], m1
RET

;--------------------------------------------------------------------------------
; void all_angs_pred_16x16(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;--------------------------------------------------------------------------------
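; dest layout: mode M (2..34) occupies 16 rows of 16 bytes starting at (M - 2) * 256,
; so row R of mode M is written to [r0 + ((M - 2) * 16 + R) * 16] (cf. the explicit
; (3-2)*16*16 + 15*16 store for mode 3 row 15 below).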
INIT_XMM sse4
cglobal all_angs_pred_16x16, 3,4,8
    ; mode 2

    movu      m0,               [r2 + 2 + 32]
    movu      [r0 + 0 * 16],    m0

    movu      m1,               m0

    movu      m6,              [r2 + 18 + 32]
    palignr   m5,              m6,             m0,    1
    movu     [r0 + 1 * 16],    m5

    movu      m4,               m5

    palignr   m5,              m6,             m0,    2
    movu      [r0 + 2 * 16],   m5
    palignr   m5,              m6,             m0,    3
    movu      [r0 + 3 * 16],   m5
    palignr   m5,              m6,             m0,    4
    movu      [r0 + 4 * 16],   m5
    palignr   m5,              m6,             m0,    5
    movu      [r0 + 5 * 16],   m5
    palignr   m5,              m6,             m0,    6
    movu      [r0 + 6 * 16],   m5
    palignr   m5,              m6,             m0,    7
    movu      [r0 + 7 * 16],   m5

    movu      m7,               m5

    palignr   m5,              m6,             m0,    8
    movu      [r0 + 8 * 16],   m5

    movu      m2,              m5

    palignr   m5,              m6,             m0,    9
    movu      [r0 + 9 * 16],   m5

    palignr   m3,              m6,             m0,    10
    movu      [r0 + 10 * 16],  m3
    palignr   m3,              m6,             m0,    11
    movu      [r0 + 11 * 16],  m3
    palignr   m3,              m6,             m0,    12
    movu      [r0 + 12 * 16],  m3

    ; mode 3  [row 15]
    movu      [r0 + (3-2)*16*16 + 15 * 16], m3

    palignr   m3,              m6,             m0,    13
    movu      [r0 + 13 * 16],   m3
    palignr   m3,              m6,             m0,    14
    movu      [r0 + 14 * 16],   m3
    palignr   m3,              m6,             m0,    15
    movu      [r0 + 15 * 16],   m3
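    ; note: mode 2 rows are plain one-byte advances along the reference at [r2 + 2 + 32];
    ; the copies kept in m1, m4, m7 and m2 are reused below as interleave sources for the
    ; mode 3-8 interpolation instead of reloading from memory.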

    ; mode 3 [row 0]
    lea           r3,    [ang_table]
    movu          m3,    [pw_1024]
    movu          m0,    [r2 + 1 + 32]
    punpcklbw     m0,    m1

    ; mode 17 [row 8 - second half]
    pmaddubsw     m1,                   m0,    [r3 + 22 * 16]
    pmulhrsw      m1,                   m3
    packuswb      m1,                   m1
    movh          [r0 + 248 * 16 + 8],  m1
    ; mode 17 [row 8 - second half] end
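    ; note: the "[row N - second half]" stores scattered through this function write only
    ; the high eight pixels of a row (offset N * 16 + 8).  They are computed here, while
    ; the interleaved source they need is already live in a register; the remaining half
    ; of each such row is produced elsewhere in the function from its own source window.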

    pmaddubsw     m1,    m0,        [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    punpcklbw     m7,    m2
    pmaddubsw     m2,    m7,        [r3 + 26 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 16 * 16],   m1

    ; mode 6 [row 1]
    movu          [r0 + 65 * 16],   m1

    ; mode 4 [row 0]
    pmaddubsw     m1,             m0,         [r3 + 21 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 21 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 32 * 16], m1

    ; mode 5 [row 0]
    pmaddubsw     m1,             m0,         [r3 + 17 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 17 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 48 * 16], m1

    ; mode 6 [row 0]
    pmaddubsw     m1,             m0,         [r3 + 13 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 13 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 64 * 16], m1

    ; mode 7 [row 0]
    pmaddubsw     m1,             m0,        [r3 + 9 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,        [r3 + 9 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 80 * 16], m1

    ; mode 7 [row 1]
    pmaddubsw     m1,             m0,         [r3 + 18 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 18 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 81 * 16], m1

    ; mode 7 [row 2]
    pmaddubsw     m1,             m0,         [r3 + 27 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 27 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 82 * 16], m1

    ; mode 8 [row 0]
    pmaddubsw     m1,             m0,        [r3 + 5 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,        [r3 + 5 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 96 * 16], m1

    ; mode 8 [row 1]
    pmaddubsw     m1,             m0,         [r3 + 10 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 10 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 97 * 16], m1

    ; mode 8 [row 2]
    pmaddubsw     m1,             m0,         [r3 + 15 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 15 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 98 * 16], m1

    ; mode 8 [row 3]
    pmaddubsw     m1,             m0,         [r3 + 20 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m2,             m7,         [r3 + 20 * 16]
    pmulhrsw      m2,             m3
    packuswb      m1,             m2
    movu          [r0 + 99 * 16], m1

    ; mode 8 [row 4]
    pmaddubsw     m1,              m0,         [r3 + 25 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m2,              m7,         [r3 + 25 * 16]
    pmulhrsw      m2,              m3
    packuswb      m1,              m2
    movu          [r0 + 100 * 16], m1

    ; mode 8 [row 5]
    pmaddubsw     m1,              m0,         [r3 + 30 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m2,              m7,         [r3 + 30 * 16]
    pmulhrsw      m2,              m3
    packuswb      m1,              m2
    movu          [r0 + 101 * 16], m1

    ; mode 15 [row 13 - second half]
    pmaddubsw     m1,                  m0,     [r3 + 18 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 221 * 16 + 8], m1
    ; mode 15 [row 13 - second half] end

    ; mode 15 [row 14 - second half]
    pmaddubsw     m1,                  m0,     [r3 + 1 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 222 * 16 + 8], m1
    ; mode 15 [row 14 - second half] end

    ; mode 16 [row 10 - second half]
    pmaddubsw     m1,                  m0,    [r3 + 25 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 234 * 16 + 8], m1
    ; mode 16 [row 10 - second half] end

    ; mode 16 [row 11 - second half]
    pmaddubsw     m1,                  m0,    [r3 + 4 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 235 * 16 + 8], m1
    ; mode 16 [row 11 - second half] end

    ; mode 3 [row 1]
    movu          m6,    [r3 + 20 * 16]
    movu          m0,    [r2 + 2 + 32]
    punpcklbw     m0,    m4

    ; mode 17 [row 7 - second half]
    pmaddubsw     m1,     m0,          [r3 + 16 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 247 * 16 + 8], m1
    ; mode 17 [row 7 - second half] end

    pmaddubsw     m1,             m0,          m6
    pmulhrsw      m1,             m3
    movu          m2,             [r2 + 10 + 32]
    punpcklbw     m2,             m5
    pmaddubsw     m4,             m2,          m6
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 17 * 16], m1

    ; mode 6 [row 3]
    movu          [r0 + 67 * 16], m1

    ; mode 4 [row 1]
    pmaddubsw     m1,             m0,         [r3 + 10 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 10 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 33 * 16], m1

    ; mode 4 [row 2]
    pmaddubsw     m1,             m0,         [r3 + 31 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 31 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 34 * 16], m1

    ; mode 7 [row 6]
    movu          [r0 + 86 * 16], m1

    ; mode 5 [row 1]
    pmaddubsw     m1,             m0,        [r3 + 2 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 2 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 49 * 16], m1

    ; mode 5 [row 2]
    pmaddubsw     m1,             m0,         [r3 + 19 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 19 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 50 * 16], m1

    ; mode 6 [row 2]
    pmaddubsw     m1,             m0,        [r3 + 7 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 7 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 66 * 16], m1

    ; mode 7 [row 3]
    pmaddubsw     m1,             m0,        [r3 + 4 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 4 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 83 * 16], m1

    ; mode 7 [row 4]
    pmaddubsw     m1,             m0,         [r3 + 13 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 13 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 84 * 16], m1

    ; mode 8 [row 8]
    movu          [r0 + 104 * 16], m1

    ; mode 7 [row 5]
    pmaddubsw     m1,             m0,         [r3 + 22 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 22 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 85 * 16], m1

    ; mode 8 [row 6]
    pmaddubsw     m1,              m0,      [r3 + 3 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,      [r3 + 3 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 102 * 16], m1

    ; mode 8 [row 7]
    pmaddubsw     m1,              m0,        [r3 + 8 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,        [r3 + 8 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 103 * 16], m1

    ; mode 8 [row 9]
    pmaddubsw     m1,              m0,         [r3 + 18 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,         [r3 + 18 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 105 * 16], m1

    ; mode 8 [row 10]
    pmaddubsw     m1,              m0,         [r3 + 23 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,         [r3 + 23 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 106 * 16], m1

    ; mode 8 [row 11]
    pmaddubsw     m1,              m0,         [r3 + 28 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,         [r3 + 28 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 107 * 16], m1

    ; mode 3 [row 2]
    movu          m0,    [r2 + 3 + 32]
    movd          m1,    [r2 + 19 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1

    ; mode 17 [row 6 - second half]
    pmaddubsw     m1,                  m0,     [r3 + 10 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 246 * 16 + 8], m1
    ; mode 17 [row 6 - second half] end

    pmaddubsw     m1,             m0,          [r3 + 14 * 16]
    pmulhrsw      m1,             m3
    movu          m2,             [r2 + 11 + 32]
    movd          m4,             [r2 + 27 + 32]
    palignr       m4,             m2,          1
    punpcklbw     m2,             m4
    pmaddubsw     m4,             m2,          [r3 + 14 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 18 * 16], m1

    ; mode 6 [row 5]
    movu          [r0 + 69 * 16], m1

    ; mode 4 [row 3]
    pmaddubsw     m1,             m0,         [r3 + 20 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 20 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 35 * 16], m1

    ; mode 5 [row 3]
    pmaddubsw     m1,             m0,        [r3 + 4 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 4 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 51 * 16], m1

    ; mode 5 [row 4]
    pmaddubsw     m1,             m0,         [r3 + 21 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 21 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 52 * 16], m1

    ; mode 6 [row 4]
    pmaddubsw     m1,             m0,        [r3 + 1 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 1 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 68 * 16], m1

    ; mode 6 [row 6]
    pmaddubsw     m1,             m0,      [r3 + 27 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,      [r3 + 27 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 70 * 16], m1

    ; mode 7 [row 7]
    pmaddubsw     m1,             m0,        [r3 + 8 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 8 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 87 * 16], m1

    ; mode 7 [row 8]
    pmaddubsw     m1,             m0,         [r3 + 17 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 17 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 88 * 16], m1

    ; mode 7 [row 9]
    pmaddubsw     m1,             m0,       [r3 + 26 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,       [r3 + 26 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 89 * 16], m1

    ; mode 8 [row 12]
    pmaddubsw     m1,              m0,        [r3 + 1 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,        [r3 + 1 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 108 * 16], m1

    ; mode 8 [row 13]
    pmaddubsw     m1,              m0,      [r3 + 6 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,      [r3 + 6 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 109 * 16], m1

    ; mode 8 [row 14]
    pmaddubsw     m1,              m0,         [r3 + 11 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,         [r3 + 11 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 110 * 16], m1

    ; mode 8 [row 15]
    pmaddubsw     m1,              m0,         [r3 + 16 * 16]
    pmulhrsw      m1,              m3
    pmaddubsw     m4,              m2,         [r3 + 16 * 16]
    pmulhrsw      m4,              m3
    packuswb      m1,              m4
    movu          [r0 + 111 * 16], m1

    ; mode 3 [row 3]
    movu          m0,              [r2 + 4 + 32]
    movd          m1,              [r2 + 20 + 32]
    palignr       m1,              m0,          1
    punpcklbw     m0,              m1

    ; mode 17 [row 4 - second half]
    pmaddubsw     m1,                  m0,    [r3 + 30 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 244 * 16 + 8], m1
    ; mode 17 [row 4 - second half] end

    ; mode 17 [row 5 - second half]
    pmaddubsw     m1,                  m0,    [r3 + 4 * 16]
    pmulhrsw      m1,                  m3
    packuswb      m1,                  m1
    movh          [r0 + 245 * 16 + 8], m1
    ; mode 17 [row 5 - second half] end

    pmaddubsw     m1,             m0,          [r3 + 8 * 16]
    pmulhrsw      m1,             m3
    movu          m2,             [r2 + 12 + 32]
    movd          m4,             [r2 + 28 + 32]
    palignr       m4,             m2,          1
    punpcklbw     m2,             m4
    pmaddubsw     m4,             m2,          [r3 + 8 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 19 * 16], m1

    ; mode 6 [row 7]
    movu          [r0 + 71 * 16], m1

    ; mode 4 [row 4]
    pmaddubsw     m1,             m0,        [r3 + 9 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 9 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 36 * 16], m1

    ; mode 4 [row 5]
    pmaddubsw     m1,             m0,        [r3 + 30 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 30 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 37 * 16], m1

    ; mode 7 [row 13]
    movu          [r0 + 93 * 16], m1

    ; mode 5 [row 5]
    pmaddubsw     m1,             m0,        [r3 + 6 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,        [r3 + 6 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 53 * 16], m1

    ; mode 5 [row 6]
    pmaddubsw     m1,             m0,         [r3 + 23 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 23 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 54 * 16], m1

    ; mode 6 [row 8]
    pmaddubsw     m1,             m0,         [r3 + 21 * 16]
    pmulhrsw      m1,             m3
    pmaddubsw     m4,             m2,         [r3 + 21 * 16]
    pmulhrsw      m4,             m3
    packuswb      m1,             m4
    movu          [r0 + 72 * 16], m1

    ; mode 7 [row 12]
    movu          [r0 + 92 * 16], m1

    ; mode 7 [row 10]
    pmaddubsw     m1,    m0,      [r3 + 3 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 3 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 90 * 16], m1

    ; mode 7 [row 11]
    pmaddubsw     m1,    m0,      [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 12 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 91 * 16], m1

    ; mode 3 [row 4]
    movu          m0,    [r2 + 5 + 32]
    movd          m1,    [r2 + 20 + 32]
    palignr       m1,    m0,         1
    punpcklbw     m0,    m1

    ; mode 17 [row 3 - second half]
    pmaddubsw     m1,     m0,           [r3 + 24 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 243 * 16 + 8],  m1
    ; mode 17 [row 3 - second half] end

    pmaddubsw     m1,    m0,          [r3 + 2 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 13 + 32]
    movd          m4,    [r2 + 29 + 32]
    palignr       m4,    m2,          1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,          [r3 + 2 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 20 * 16], m1

    ; mode 6 [row 9]
    movu          [r0 + 73 * 16], m1

    ; mode 4 [row 6]
    movu          m6,    [r3 + 19 * 16]
    pmaddubsw     m1,    m0,      m6
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      m6
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 38 * 16], m1

    ; mode 3 [row 5]
    pmaddubsw     m1,    m0,      [r3 + 28 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 28 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 21 * 16], m1

    ; mode 6 [row 11]
    movu          [r0 + 75 * 16], m1

    ; mode 5 [row 7]
    pmaddubsw     m1,    m0,      [r3 + 8 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 8 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 55 * 16], m1

    ; mode 5 [row 8]
    pmaddubsw     m1,    m0,      [r3 + 25 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 25 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 56 * 16], m1

    ; mode 6 [row 10]
    pmaddubsw     m1,    m0,      [r3 + 15 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 15 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 74 * 16], m1

    ; mode 7 [row 14]
    pmaddubsw     m1,    m0,      [r3 + 7 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 7 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 94 * 16], m1

    ; mode 7 [row 15]
    pmaddubsw     m1,    m0,      [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 16 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 95 * 16], m1

    ; mode 3 [row 6]
    movu          m0,    [r2 + 6 + 32]
    movd          m1,    [r2 + 22 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1

    ; mode 17 [row 2 - second half]
    pmaddubsw     m1,     m0,          [r3 + 18 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 242 * 16 + 8],  m1
    ; mode 17 [row 2 - second half] end

    pmaddubsw     m1,    m0,          [r3 + 22 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 14 + 32]
    movd          m4,    [r2 + 30 + 32]
    palignr       m4,    m2,          1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,          [r3 + 22 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 22 * 16], m1

    ; mode 6 [row 13]
    movu          [r0 + 77 * 16], m1

    ; mode 4 [row 7]
    pmaddubsw     m1,    m0,      [r3 + 8 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 8 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 39 * 16], m1

    ; mode 4 [row 8]
    pmaddubsw     m1,    m0,       [r3 + 29 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,       [r3 + 29 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 40 * 16], m1

    ; mode 5 [row 9]
    pmaddubsw     m1,    m0,      [r3 + 10 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 10 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 57 * 16], m1

    ; mode 5 [row 10]
    pmaddubsw     m1,    m0,      [r3 + 27 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 27 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 58 * 16], m1

    ; mode 6 [row 12]
    pmaddubsw     m1,    m0,      [r3 + 9 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 9 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 76 * 16], m1

    ; mode 3 [row 7]
    movu          m0,    [r2 + 7 + 32]
    movd          m1,    [r2 + 27 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1

    ; mode 17 [row 1 - second half]
    pmaddubsw     m1,     m0,           [r3 + 12 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 241 * 16 + 8],  m1
    ; mode 17 [row 1 - second half] end

    pmaddubsw     m1,    m0,          [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 15 + 32]
    movd          m4,    [r2 + 25 + 32]
    palignr       m4,    m2,          1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,          [r3 + 16 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 23 * 16], m1

    ; mode 6 [row 15]
    movu          [r0 + 79 * 16], m1

    ; mode 4 [row 9]
    pmaddubsw     m1,    m0,      [r3 + 18 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 18 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 41 * 16], m1

    ; mode 5 [row 11]
    pmaddubsw     m1,    m0,      [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 12 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 59 * 16], m1

    ; mode 5 [row 12]
    pmaddubsw     m1,    m0,      [r3 + 29 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 29 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 60 * 16], m1

    ; mode 6 [row 14]
    pmaddubsw     m1,    m0,      [r3 + 3 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 3 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 78 * 16], m1

    ; mode 3 [row 8]
    movu          m0,    [r2 + 8 + 32]
    movd          m1,    [r2 + 24 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,          [r3 + 10 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 16 + 32]
    psrldq        m4,    m2,         1
    pinsrb        m4,    [r2 + 32],  15
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,          [r3 + 10 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 24 * 16], m1

    ; mode 4 [row 10]
    pmaddubsw     m1,    m0,      [r3 + 7 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 7 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 42 * 16], m1

    ; mode 4 [row 11]
    pmaddubsw     m1,    m0,      [r3 + 28 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 28 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 43 * 16], m1

    ; mode 5 [row 13]
    pmaddubsw     m1,    m0,      [r3 + 14 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 14 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 61 * 16], m1

    ; mode 5 [row 14]
    pmaddubsw     m1,    m0,      [r3 + 31 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 31 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 62 * 16], m1

    ; mode 3 [row 9]
    movu          m0,    [r2 +  9 + 32]
    movd          m1,    [r2 + 16 + 32]
    palignr       m1,    m0,         1
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,         [r3 + 4 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 17 + 32]
    movd          m4,    [r2 + 33 + 32]
    palignr       m4,    m2,         1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,         [r3 + 4 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 25 * 16], m1

    ; mode 4 [row 12]
    pmaddubsw     m1,    m0,      [r3 + 17 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 17 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 44 * 16], m1

    ; mode 3 [row 10]
    pmaddubsw     m1,    m0,          [r3 + 30 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,          [r3 + 30 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 26 * 16], m1

    ; mode 5 [row 15]
    pmaddubsw     m1,    m0,      [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 16 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 63 * 16], m1

    ; mode 3 [row 11]
    movu          m0,    [r2 + 10 + 32]
    movd          m1,    [r2 + 26 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,          [r3 + 24 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 18 + 32]
    movd          m4,    [r2 + 34 + 32]
    palignr       m4,    m2,         1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,         [r3 + 24 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,                 m4
    movu          [r0 + 27 * 16],     m1

    ; mode 4 [row 13]
    pmaddubsw     m1,    m0,      [r3 + 6 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 6 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 45 * 16], m1

    ; mode 4 [row 14]
    pmaddubsw     m1,    m0,      [r3 + 27 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 27 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 46 * 16], m1

    ; mode 3 [row 12]
    movu          m0,    [r2 + 11 + 32]
    movd          m1,    [r2 + 27 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,          [r3 + 18 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 19 + 32]
    movd          m4,    [r2 + 35 + 32]
    palignr       m4,    m2,          1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,          [r3 + 18 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 28 * 16], m1

    ; mode 4 [row 15]
    pmaddubsw     m1,    m0,      [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m2,      [r3 + 16 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 47 * 16], m1

    ; mode 3 [row 13]
    movu          m0,    [r2 + 12 + 32]
    movd          m1,    [r2 + 28 + 32]
    palignr       m1,    m0,          1
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,          [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 20 + 32]
    movd          m4,    [r2 + 36 + 32]
    palignr       m4,    m2,          1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,          [r3 + 12 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,             m4
    movu          [r0 + 29 * 16], m1

    ; mode 3 [row 14]
    movu          m0,    [r2 + 13 + 32]
    movd          m1,    [r2 + 29 + 32]
    palignr       m1,    m0,         1
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,         [r3 + 6 * 16]
    pmulhrsw      m1,    m3
    movu          m2,    [r2 + 21 + 32]
    movd          m4,    [r2 + 37 + 32]
    palignr       m4,    m2,         1
    punpcklbw     m2,    m4
    pmaddubsw     m4,    m2,         [r3 + 6 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,                m4
    movu          [r0 + 30 * 16],    m1

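    ; Mode 9 has the smallest positive angle (+2), so the fraction simply
    ; grows by 2 per row and a single interleaved source (m0 for the low
    ; half, m7 for the high half) serves every row.  Row 15 reaches a
    ; whole-sample offset (fraction 0), so it is stored directly as the
    ; reference shifted by one byte before the weighted rows are produced.
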
    ; mode 9
    movu          m0,    [r1 + 1 + 32]
    movd          m1,    [r1 + 17 + 32]
    palignr       m1,    m0,         1

    ; mode 9 [row 15]
    movu          [r0 + 127 * 16],  m1

    ; mode 9 [row 0]
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,        [r3 + 2 * 16]
    pmulhrsw      m1,    m3
    movu          m7,    [r1 +  9 + 32]
    palignr       m2,    m7,        1
    punpcklbw     m7,    m2
    pmaddubsw     m2,    m7,        [r3 + 2 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 112 * 16],  m1

    ; mode 9 [row 1]
    pmaddubsw     m1,    m0,        [r3 + 4 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 4 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 113 * 16],  m1

    ; mode 9 [row 2]
    pmaddubsw     m1,    m0,        [r3 + 6 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 6 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 114 * 16],  m1

    ; mode 9 [row 3]
    pmaddubsw     m1,    m0,        [r3 + 8 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 8 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 115 * 16],  m1

    ; mode 9 [row 4]
    pmaddubsw     m1,    m0,        [r3 + 10 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 10 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 116 * 16],  m1

    ; mode 9 [row 5]
    pmaddubsw     m1,    m0,        [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 12 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 117 * 16],  m1

    ; mode 9 [row 6]
    pmaddubsw     m1,    m0,        [r3 + 14 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 14 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 118 * 16],  m1

    ; mode 9 [row 7]
    pmaddubsw     m1,    m0,        [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 16 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 119 * 16],  m1

    ; mode 9 [row 8]
    pmaddubsw     m1,    m0,        [r3 + 18 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 18 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 120 * 16],  m1

    ; mode 9 [row 9]
    pmaddubsw     m1,    m0,        [r3 + 20 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 20 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 121 * 16],  m1

    ; mode 9 [row 10]
    pmaddubsw     m1,    m0,        [r3 + 22 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 22 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 122 * 16],  m1

    ; mode 9 [row 11]
    pmaddubsw     m1,    m0,        [r3 + 24 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 24 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 123 * 16],  m1

    ; mode 9 [row 12]
    pmaddubsw     m1,    m0,        [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 26 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 124 * 16],  m1

    ; mode 9 [row 13]
    pmaddubsw     m1,    m0,         [r3 + 28 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,         [r3 + 28 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 125 * 16],  m1

    ; mode 9 [row 14]
    pmaddubsw     m1,    m0,        [r3 + 30 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 30 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 126 * 16],  m1

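    ; Mode 10 is purely horizontal: every one of its 16 stored rows
    ; (128-143) is the same reference vector, so a plain copy is all that
    ; is needed before the boundary filter below.
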
    ; mode 10
    movu         m1,               [r1 + 1 + 32]
    movu         [r0 + 128 * 16],  m1
    movu         [r0 + 129 * 16],  m1
    movu         [r0 + 130 * 16],  m1
    movu         [r0 + 131 * 16],  m1
    movu         [r0 + 132 * 16],  m1
    movu         [r0 + 133 * 16],  m1
    movu         [r0 + 134 * 16],  m1
    movu         [r0 + 135 * 16],  m1
    movu         [r0 + 136 * 16],  m1
    movu         [r0 + 137 * 16],  m1
    movu         [r0 + 138 * 16],  m1
    movu         [r0 + 139 * 16],  m1
    movu         [r0 + 140 * 16],  m1
    movu         [r0 + 141 * 16],  m1
    movu         [r0 + 142 * 16],  m1
    movu         [r0 + 143 * 16],  m1

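    ; Boundary filter for mode 10: the first byte of every stored row is
    ; replaced by base + ((ref2[x] - corner) >> 1), saturated to 8 bits,
    ; where base is broadcast from the first sample of the row vector
    ; above, corner is the byte at [r1] and ref2[x] runs over the 16
    ; samples at [r1 + 1].  The arithmetic is done in 16-bit lanes so the
    ; subtraction and shift cannot wrap.
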
    pxor         m0,          m0
    pshufb       m1,          m1,         m0
    punpcklbw    m1,          m0
    pinsrb       m2,          [r1], 0
    pshufb       m2,          m2,         m0
    punpcklbw    m2,          m0
    movu         m4,          [r1 + 1]
    punpcklbw    m5,          m4,         m0
    punpckhbw    m4,          m0
    psubw        m5,          m2
    psubw        m4,          m2
    psraw        m5,          1
    psraw        m4,          1
    paddw        m5,          m1
    paddw        m4,          m1
    packuswb     m5,          m4

    pextrb       [r0 + 128 * 16],  m5,          0
    pextrb       [r0 + 129 * 16],  m5,          1
    pextrb       [r0 + 130 * 16],  m5,          2
    pextrb       [r0 + 131 * 16],  m5,          3
    pextrb       [r0 + 132 * 16],  m5,          4
    pextrb       [r0 + 133 * 16],  m5,          5
    pextrb       [r0 + 134 * 16],  m5,          6
    pextrb       [r0 + 135 * 16],  m5,          7
    pextrb       [r0 + 136 * 16],  m5,          8
    pextrb       [r0 + 137 * 16],  m5,          9
    pextrb       [r0 + 138 * 16],  m5,          10
    pextrb       [r0 + 139 * 16],  m5,          11
    pextrb       [r0 + 140 * 16],  m5,          12
    pextrb       [r0 + 141 * 16],  m5,          13
    pextrb       [r0 + 142 * 16],  m5,          14
    pextrb       [r0 + 143 * 16],  m5,          15

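    ; Mode 11 is the first negative angle (-2): the reference starts one
    ; sample earlier, so the corner byte at [r1] is inserted in front of
    ; the row, and the per-row fractions count down 30, 28, ..., 2.
    ; Row 15 again falls on a whole sample and is stored as a plain copy.
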
    ; mode 11
    movu          m0,               [r1 + 32]
    pinsrb        m0,               [r1], 0

    ; mode 11 [row 15]
    movu          [r0 + 159 * 16],  m0

    ; mode 11 [row 0]
    movu          m1,    [r1 + 1 + 32]
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,        [r3 + 30 * 16]
    pmulhrsw      m1,    m3
    movu          m7,    [r1 + 8 + 32]
    movu          m2,    [r1 + 9 + 32]
    punpcklbw     m7,    m2
    pmaddubsw     m2,    m7,        [r3 + 30 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 144 * 16],  m1

    ; mode 11 [row 1]
    pmaddubsw     m1,    m0,        [r3 + 28 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 28 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 145 * 16],  m1

    ; mode 11 [row 2]
    pmaddubsw     m1,    m0,        [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 26 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 146 * 16],  m1

    ; mode 11 [row 3]
    pmaddubsw     m1,    m0,         [r3 + 24 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 24 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 147 * 16],  m1

    ; mode 11 [row 4]
    pmaddubsw     m1,    m0,        [r3 + 22 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 22 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 148 * 16],  m1

    ; mode 11 [row 5]
    pmaddubsw     m1,    m0,        [r3 + 20 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 20 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 149 * 16],  m1

    ; mode 11 [row 6]
    pmaddubsw     m1,    m0,        [r3 + 18 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 18 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 150 * 16],  m1

    ; mode 11 [row 7]
    pmaddubsw     m1,    m0,        [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 16 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 151 * 16],  m1

    ; mode 11 [row 8]
    pmaddubsw     m1,    m0,        [r3 + 14 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 14 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 152 * 16],  m1

    ; mode 11 [row 9]
    pmaddubsw     m1,    m0,        [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 12 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 153 * 16],  m1

    ; mode 11 [row 10]
    pmaddubsw     m1,    m0,        [r3 + 10 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 10 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 154 * 16],  m1

    ; mode 11 [row 11]
    pmaddubsw     m1,    m0,        [r3 + 8 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 8 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 155 * 16],  m1

    ; mode 11 [row 12]
    pmaddubsw     m1,    m0,        [r3 + 6 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 6 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 156 * 16],  m1

    ; mode 11 [row 13]
    pmaddubsw     m1,    m0,        [r3 + 4 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 4 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 157 * 16],  m1

    ; mode 11 [row 14]
    pmaddubsw     m1,    m0,        [r3 + 2 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 2 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 158 * 16],  m1

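    ; Modes 12-17 use steeper negative angles (-5, -9, -13, -17, -21, -26).
    ; Their early rows still read the same stretch of the main reference,
    ; so they are all produced here from one interleaved source (m0/m7),
    ; each with its own fraction: row 0 uses 32 minus the angle magnitude
    ; (27, 23, 19, 15, 11, 6) and mode 12 then steps down by 5 per row,
    ; mode 13 by 9, mode 14 by 13, and so on.
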
    ; mode 12 [row 0]
    movu          m0,    [r2 + 32]
    pinsrb        m0,    [r2], 0
    movu          m1,    [r2 + 1 + 32]
    punpcklbw     m0,    m1
    pmaddubsw     m1,    m0,        [r3 + 27 * 16]
    pmulhrsw      m1,    m3
    movu          m7,    [r2 + 8 + 32]
    movd          m2,    [r2 + 24 + 32]
    palignr       m2,    m7,        1
    punpcklbw     m7,    m2
    pmaddubsw     m2,    m7,        [r3 + 27 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 160 * 16],  m1

    ; mode 12 [row 1]
    pmaddubsw     m1,    m0,        [r3 + 22 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 22 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 161 * 16],  m1

    ; mode 12 [row 2]
    pmaddubsw     m1,    m0,        [r3 + 17 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 17 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 162 * 16],  m1

    ; mode 12 [row 3]
    pmaddubsw     m1,    m0,        [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 12 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 163 * 16],  m1

    ; mode 12 [row 4]
    pmaddubsw     m1,    m0,        [r3 + 7 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 7 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 164 * 16],  m1

    ; mode 12 [row 5]
    pmaddubsw     m1,    m0,        [r3 + 2 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 2 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 165 * 16],  m1

    ; mode 13 [row 0]
    pmaddubsw     m1,    m0,        [r3 + 23 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 23 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 176 * 16],  m1

    ; mode 13 [row 1]
    pmaddubsw     m1,    m0,        [r3 + 14 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 14 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 177 * 16],  m1

    ; mode 13 [row 2]
    pmaddubsw     m1,    m0,        [r3 + 5 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 5 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 178 * 16],  m1

    ; mode 14 [row 0]
    pmaddubsw     m1,    m0,        [r3 + 19 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 19 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 192 * 16],  m1

    ; mode 14 [row 1]
    pmaddubsw     m1,    m0,        [r3 + 6 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 6 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 193 * 16],  m1

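    ; Mode 17 row 0 needs the same integer offset and the same fraction
    ; (6/32) as mode 14 row 1, so the packed result still sitting in m1 is
    ; simply stored a second time.
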
    ; mode 17 [row 0]
    movu          [r0 + 240 * 16],  m1

    ; mode 15 [row 0]
    pmaddubsw     m1,    m0,        [r3 + 15 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 15 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 208 * 16],  m1

    ; mode 15 [row 15 - second half]
    pmaddubsw     m1,    m0,           [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                  m1
    movh          [r0 + 223 * 16 + 8], m1
    ; mode 15 [row 15 - second half] end

    ; mode 16 [row 0]
    pmaddubsw     m1,    m0,        [r3 + 11 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m2,    m7,        [r3 + 11 * 16]
    pmulhrsw      m2,    m3
    packuswb      m1,               m2
    movu          [r0 + 224 * 16],  m1

    ; mode 17 [row 9 - second half]
    pmaddubsw     m1,     m0,          [r3 + 28 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 249 * 16 + 8],  m1
    ; mode 17 [row 9 - second half] end

    ; mode 17 [row 10 - second half]
    pmaddubsw     m1,     m0,          [r3 + 2 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 250 * 16 + 8],  m1
    ; mode 17 [row 10 - second half] end

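    ; From here on mode 17's angle runs past the start of the main
    ; reference, so an extended reference is built on the fly: pslldq by 2
    ; shifts the interleaved pairs up by one sample and the two pinsrb
    ; instructions splice in the next projected sample from the opposite
    ; reference (the non-uniform offsets 1, 2, 4, 5, ... follow the
    ; inverse-angle projection).  The "second half" stores scattered
    ; through this region reuse whichever source (m0 or a partially
    ; shifted m6) already holds the samples needed for the right half of a
    ; later row, so most rows are assembled from two 8-byte movh stores.
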
    ; mode 17 [row 1 - first half]
    pslldq        m6,     m0,          2
    pinsrb        m6,     [r2],        1
    pinsrb        m6,     [r2 + 1],    0
    pmaddubsw     m1,     m6,          [r3 + 12 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,               m1
    movh          [r0 + 241 * 16],  m1

    ; mode 17 [row 11 - second half]
    pmaddubsw     m1,     m6,          [r3 + 8 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 251 * 16 + 8],  m1
    ; mode 17 [row 11 - second half] end

    ; mode 17 [row 2 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 1],    1
    pinsrb        m6,     [r2 + 2],    0
    pmaddubsw     m1,     m6,          [r3 + 18 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 242 * 16],     m1

    ; mode 17 [row 12 - second half]
    pmaddubsw     m1,     m6,           [r3 + 14 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 252 * 16 + 8],  m1
    ; mode 17 [row 12 - second half] end

    ; mode 17 [row 3 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 2],    1
    pinsrb        m6,     [r2 + 4],    0
    pmaddubsw     m1,     m6,          [r3 + 24 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,               m1
    movh          [r0 + 243 * 16],  m1

    ; mode 17 [row 13 - second half]
    pmaddubsw     m1,     m6,           [r3 + 20 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 253 * 16 + 8],  m1
    ; mode 17 [row 13 - second half] end

    ; mode 17 [row 4 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 4],    1
    pinsrb        m6,     [r2 + 5],    0
    pmaddubsw     m1,     m6,          [r3 + 30 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 244 * 16],     m1

    ; mode 17 [row 5 - first half]
    pmaddubsw     m1,     m6,          [r3 + 4 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,               m1
    movh          [r0 + 245 * 16],  m1

    ; mode 17 [row 14 - second half]
    pmaddubsw     m1,     m6,          [r3 + 26 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 254 * 16 + 8], m1
    ; mode 17 [row 14 - second half] end

    ; mode 17 [row 6 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 5],    1
    pinsrb        m6,     [r2 + 6],    0
    pmaddubsw     m1,     m6,          [r3 + 10 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 246 * 16],     m1

    ; mode 17 [row 7 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 6],    1
    pinsrb        m6,     [r2 + 7],    0
    pmaddubsw     m1,     m6,          [r3 + 16 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 247 * 16],     m1

    ; mode 17 [row 8 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 7],    1
    pinsrb        m6,     [r2 + 9],    0
    pmaddubsw     m1,     m6,          [r3 + 22 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 248 * 16],     m1

    ; mode 17 [row 9 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 +  9],    1
    pinsrb        m6,     [r2 + 10],    0
    pmaddubsw     m1,     m6,           [r3 + 28 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 249 * 16],      m1

    ; mode 17 [row 10 - first half]
    pmaddubsw     m1,     m6,          [r3 + 2 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                  m1
    movh          [r0 + 250 * 16],     m1

    ; mode 17 [row 11 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 10],    1
    pinsrb        m6,     [r2 + 11],    0
    pmaddubsw     m1,     m6,           [r3 + 8 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 251 * 16],      m1

    ; mode 17 [row 12 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 11],    1
    pinsrb        m6,     [r2 + 12],    0
    pmaddubsw     m1,     m6,           [r3 + 14 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 252 * 16],      m1

    ; mode 17 [row 13 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 12],    1
    pinsrb        m6,     [r2 + 14],    0
    pmaddubsw     m1,     m6,           [r3 + 20 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 253 * 16],      m1

    ; mode 17 [row 14 - first half]
    pslldq        m6,     2
    pinsrb        m6,     [r2 + 14],    1
    pinsrb        m6,     [r2 + 15],    0
    pmaddubsw     m1,     m6,           [r3 + 26 * 16]
    pmulhrsw      m1,     m3
    packuswb      m1,                   m1
    movh          [r0 + 254 * 16],      m1

    ; mode 16 [row 12 - second half]
    pmaddubsw     m1,    m0,            [r3 + 15 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                   m1
    movh          [r0 + 236 * 16 + 8],  m1
    ; mode 16 [row 12 - second half] end

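    ; Mode 12 resumes at row 6, where its integer offset finally crosses
    ; the corner: the low-half source is shifted and the projected sample
    ; at [r2 + 6] is inserted, while the high half is rebuilt from
    ; [r2 + 7 + 32].  Rows 6-11 then share this pair of sources.
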
    ; mode 12 [row 6]
    pslldq        m2,    m0,            2
    pinsrb        m2,    [r2], 1
    pinsrb        m2,    [r2 + 6],      0
    pmaddubsw     m1,    m2,            [r3 + 29 * 16]
    pmulhrsw      m1,    m3
    movu          m0,    [r2 + 7 + 32]
    psrldq        m4,    m0,            1
    punpcklbw     m0,    m4
    pmaddubsw     m4,    m0,            [r3 + 29 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,                   m4
    movu          [r0 + 166 * 16],      m1

    ; mode 12 [row 7]
    pmaddubsw     m1,    m2,        [r3 + 24 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 24 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 167 * 16],  m1

    ; mode 12 [row 8]
    pmaddubsw     m1,    m2,        [r3 + 19 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 19 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 168 * 16],  m1

    ; mode 12 [row 9]
    pmaddubsw     m1,    m2,        [r3 + 14 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 14 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 169 * 16],  m1

    ; mode 12 [row 10]
    pmaddubsw     m1,    m2,        [r3 + 9 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 9 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 170 * 16],  m1

    ; mode 12 [row 11]
    pmaddubsw     m1,    m2,        [r3 + 4 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 4 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,    m4
    movu          [r0 + 171 * 16],  m1

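    ; Modes 13 and 14 need the same shifted source, only with their own
    ; projected corner-side sample: a single pinsrb swaps in [r2 + 4] for
    ; mode 13 (m7) and [r2 + 2] for mode 14 (m5), so the reconstruction
    ; work above is shared.  The same m5/m0 pair also covers the early
    ; rows of modes 15 and 16.
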
    ; mode 13 [row 3]
    pinsrb        m7,    m2,        [r2 +  4],   0
    pmaddubsw     m1,    m7,        [r3 + 28 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 28 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 179 * 16],  m1

    ; mode 13 [row 4]
    pmaddubsw     m1,    m7,        [r3 + 19 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 19 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 180 * 16],  m1

    ; mode 13 [row 5]
    pmaddubsw     m1,    m7,        [r3 + 10 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 10 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 181 * 16],  m1

    ; mode 13 [row 6]
    pmaddubsw     m1,    m7,        [r3 + 1 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 1 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 182 * 16],  m1

    ; mode 14 [row 2]
    pinsrb        m5,    m7,        [r2 +  2],   0
    pmaddubsw     m1,    m5,        [r3 + 25 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 25 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 194 * 16],  m1

    ; mode 14 [row 3]
    pmaddubsw     m1,    m5,        [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 12 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 195 * 16],  m1

    ; mode 15 [row 1]
    pmaddubsw     m1,    m5,        [r3 + 30 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 30 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 209 * 16],  m1

    ; mode 15 [row 2]
    pmaddubsw     m1,    m5,        [r3 + 13 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 13 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 210 * 16],  m1

    ; mode 16 [row 1]
    pmaddubsw     m1,    m5,        [r3 + 22 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 22 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 225 * 16],  m1

    ; mode 16 [row 2]
    pmaddubsw     m1,    m5,        [r3 + 1 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m4,    m0,        [r3 + 1 * 16]
    pmulhrsw      m4,    m3
    packuswb      m1,               m4
    movu          [r0 + 226 * 16],  m1

    ; mode 16 [row 13 - second half]
    pmaddubsw     m1,    m5,           [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                  m1
    movh          [r0 + 237 * 16 + 8], m1
    ; mode 16 [row 13 - second half] end

    ; mode 16 [row 14 - second half]
    pmaddubsw     m1,    m5,           [r3 + 5 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                  m1
    movh          [r0 + 238 * 16 + 8], m1
    ; mode 16 [row 14 - second half] end

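    ; Left halves of mode 16 rows 3-15: with an inverse angle of roughly
    ; one and a half samples per row, a new projected sample has to be
    ; spliced in for almost every row, hence the run of pslldq/pinsrb
    ; pairs fetching [r2 + 3], [r2 + 5], [r2 + 6], [r2 + 8], ...  The
    ; matching right halves are filled in separately wherever a source
    ; holding those samples is live.
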
    ; mode 16 [row 3]
    pslldq        m6,    m5,         2
    pinsrb        m6,    [r2 + 2],   1
    pinsrb        m6,    [r2 + 3],   0
    pmaddubsw     m1,    m6,         [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 227 * 16],   m1

    ; mode 16 [row 15 - second half]
    pmaddubsw     m1,    m6,          [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                  m1
    movh          [r0 + 239 * 16 + 8], m1
    ; mode 16 [row 15 - second half] end

    ; mode 16 [row 4 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 3],   1
    pinsrb        m6,    [r2 + 5],   0
    pmaddubsw     m1,    m6,         [r3 + 23 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 228 * 16],   m1

    ; mode 16 [row 5 - first half]
    pmaddubsw     m1,    m6,        [r3 + 2 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 229 * 16],  m1

    ; mode 16 [row 6 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 5],   1
    pinsrb        m6,    [r2 + 6],   0
    pmaddubsw     m1,    m6,         [r3 + 13 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 230 * 16],   m1

    ; mode 16 [row 7 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 6],   1
    pinsrb        m6,    [r2 + 8],   0
    pmaddubsw     m1,    m6,         [r3 + 24 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 231 * 16],   m1

    ; mode 16 [row 8 - first half]
    pmaddubsw     m1,    m6,        [r3 + 3 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 232 * 16],  m1

    ; mode 16 [row 9 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 8],   1
    pinsrb        m6,    [r2 + 9],   0
    pmaddubsw     m1,    m6,        [r3 + 14 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 233 * 16],  m1

    ; mode 16 [row 10 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 +  9], 1
    pinsrb        m6,    [r2 + 11], 0
    pmaddubsw     m1,    m6,        [r3 + 25 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 234 * 16],  m1

    ; mode 16 [row 11 - first half]
    pmaddubsw     m1,    m6,        [r3 + 4 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 235 * 16],  m1

    ; mode 16 [row 12 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 11], 1
    pinsrb        m6,    [r2 + 12], 0
    pmaddubsw     m1,    m6,        [r3 + 15 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 236 * 16],  m1

    ; mode 16 [row 13 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 12],   1
    pinsrb        m6,    [r2 + 14],   0
    pmaddubsw     m1,    m6,        [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 237 * 16],  m1

    ; mode 16 [row 14 - first half]
    pmaddubsw     m1,    m6,        [r3 + 5 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 238 * 16],  m1

    ; mode 16 [row 15 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 14],   1
    pinsrb        m6,    [r2 + 15],   0
    pmaddubsw     m1,    m6,          [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,               m1
    movh          [r0 + 239 * 16],  m1

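    ; Mode 14 row 4 moves its reference one more projected sample along
    ; (inserting [r2 + 5] in front of [r2 + 2]) and rebuilds its high half
    ; from [r2 + 6 + 32]; that freshly interleaved m4 also supplies the
    ; right half of mode 16 row 3, which is stored first before the
    ; mode 14 rows continue.
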
    ; mode 14 [row 4]
    pslldq        m5,    2
    pinsrb        m5,    [r2 + 2],   1
    pinsrb        m5,    [r2 + 5],   0
    movu          m4,    [r2 + 6 + 32]
    psrldq        m0,    m4,         1
    punpcklbw     m4,    m0

    ; mode 16 [row 3 - second half]
    pmaddubsw     m1,    m4,        [r3 + 12 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                  m1
    movh          [r0 + 227 * 16 + 8], m1
    ; mode 16 [row 3 - second half] end

    ; mode 14 [row 4] (continued)
    pmaddubsw     m1,    m5,        [r3 + 31 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m0,    m4,        [r3 + 31 * 16]
    pmulhrsw      m0,    m3
    packuswb      m1,               m0
    movu          [r0 + 196 * 16],  m1

    ; mode 14 [row 5]
    pmaddubsw     m1,    m5,        [r3 + 18 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m0,    m4,        [r3 + 18 * 16]
    pmulhrsw      m0,    m3
    packuswb      m1,               m0
    movu          [r0 + 197 * 16],  m1

    ; mode 14 [row 6]
    pmaddubsw     m1,    m5,         [r3 + 5 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m0,    m4,         [r3 + 5 * 16]
    pmulhrsw      m0,    m3
    packuswb      m1,               m0
    movu          [r0 + 198 * 16],  m1

    ; mode 15 [row 3]
    mova          m6,    m5
    pinsrb        m6,    [r2 + 4],   0
    pmaddubsw     m1,    m6,         [r3 + 28 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m0,    m4,         [r3 + 28 * 16]
    pmulhrsw      m0,    m3
    packuswb      m1,                m0
    movu          [r0 + 211 * 16],   m1

    ; mode 15 [row 4]
    pmaddubsw     m1,    m6,         [r3 + 11 * 16]
    pmulhrsw      m1,    m3
    pmaddubsw     m0,    m4,         [r3 + 11 * 16]
    pmulhrsw      m0,    m3
    packuswb      m1,                m0
    movu          [r0 + 212 * 16],   m1

    ; mode 15 [row 5 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 4],   1
    pinsrb        m6,    [r2 + 6],   0
    pmaddubsw     m1,    m6,         [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 213 * 16],   m1

    ; mode 15 [row 6 - first half]
    pmaddubsw     m1,    m6,         [r3 + 9 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 214 * 16],   m1

    ; mode 15 [row 7 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 6],   1
    pinsrb        m6,    [r2 + 8],   0
    pmaddubsw     m1,    m6,         [r3 + 24 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 215 * 16],   m1

    ; mode 15 [row 8 - first half]
    pmaddubsw     m1,    m6,         [r3 + 7 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 216 * 16],   m1

    ; mode 15 [row 9 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 8],   1
    pinsrb        m6,    [r2 + 9],   0
    pmaddubsw     m1,    m6,         [r3 + 22 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 217 * 16],   m1

    ; mode 15 [row 10 - first half]
    pmaddubsw     m1,    m6,         [r3 + 5 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 218 * 16],   m1

    ; mode 15 [row 11 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 +  9],   1
    pinsrb        m6,    [r2 + 11],   0
    pmaddubsw     m1,    m6,         [r3 + 20 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 219 * 16],   m1

    ; mode 15 [row 12 - first half]
    pmaddubsw     m1,    m6,         [r3 + 3 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 220 * 16],   m1

    ; mode 15 [row 13 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 11],   1
    pinsrb        m6,    [r2 + 13],   0
    pmaddubsw     m1,    m6,         [r3 + 18 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 221 * 16],   m1

    ; mode 15 [row 14 - first half]
    pmaddubsw     m1,    m6,         [r3 + 1 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 222 * 16],   m1

    ; mode 15 [row 15 - first half]
    pslldq        m6,    2
    pinsrb        m6,    [r2 + 13],   1
    pinsrb        m6,    [r2 + 15],   0
    pmaddubsw     m1,    m6,         [r3 + 16 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                m1
    movh          [r0 + 223 * 16],   m1

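    ; Mode 14 row 7 advances the projected reference again (inserting
    ; [r2 + 7] in front of [r2 + 5]), and the interleaved high-half source
    ; built here from [r2 + 5 + 32] is immediately reused for the pending
    ; right half of mode 15 row 5.
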
    ; mode 14 [row 7]
    pslldq        m5,    2
    pinsrb        m5,    [r2 + 5],   1
    pinsrb        m5,    [r2 + 7],   0
    movu          m0,    [r2 + 5 + 32]
    psrldq        m6,    m0,          1
    punpcklbw     m0,    m6

    ; mode 15 [row 5 - second half]
    pmaddubsw     m1,    m0,           [r3 + 26 * 16]
    pmulhrsw      m1,    m3
    packuswb      m1,                  m1
    movh          [r0 + 213 * 16 + 8], m1
    ; mode 15 [row 5 - seco