changeset 2036:1c7ce772c0b7

Merged in deepthidevaki/xhevc_deepthid (pull request #184) intrinsics in IntraAng
author Steve Borho <steve@borho.org>
date Thu, 06 Jun 2013 11:59:23 -0500
parents 6657cc220ef5 (current diff) 18f485d4e06f (diff)
children 9567c46f3511
files
diffstat 1 files changed, 710 insertions(+), 416 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/vec/intrapred.inc	Thu Jun 06 19:41:53 2013 +0530
+++ b/source/common/vec/intrapred.inc	Thu Jun 06 11:59:23 2013 -0500
@@ -2664,77 +2664,110 @@ void xPredIntraAng8x8(int bitDepth, pixe
 #undef CALCROW
 #endif /* if HIGH_BIT_DEPTH */
 
+//16x16
 #if HIGH_BIT_DEPTH
 #else
 #define PREDANG_CALCROW_VER(X) { \
         LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
         LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
         CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        compress(row11L, row11H).store(pDst + ((X)*dstStride)); \
+        /*compress(row11L, row11H).store(pDst + ((X)*dstStride));*/ \
+        itmp = _mm_packus_epi16(row11L, row11H); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
 }
 
 #define PREDANG_CALCROW_HOR(X, rowx) { \
         LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
         LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
         CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        rowx = compress(row11L, row11H); \
+        /*rowx = compress(row11L, row11H);*/  \
+        rowx = _mm_packus_epi16(row11L, row11H); \
 }
 
 // ROWL/H is a Vec8s variable, X is the index in of data to be loaded
 #define LOADROW(ROWL, ROWH, X) { \
-        tmp.load(refMain + 1 + (X)); \
-        ROWL = extend_low(tmp); \
-        ROWH = extend_high(tmp); \
+        /*tmp.load(refMain + 1 + (X)); */ \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
+        /* ROWL = extend_low(tmp);*/  \
+        ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        /*ROWH = extend_high(tmp);*/  \
+        ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
 }
 
 #define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
-        v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1; \
-        RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
-        RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5; \
+        /*v_deltaPos += v_ipAngle; \
+        v_deltaFract = v_deltaPos & thirty1;*/ \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        /*RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
+        RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5;*/ \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, ROW1L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, ROW2L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        RESL = _mm_srai_epi16(it2, 5); \
+        \
+        it2 = _mm_mullo_epi16(it1, ROW1H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, ROW2H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        RESH = _mm_srai_epi16(it2, 5); \
 }
 
 #define  BLND2_16(R1, R2) { \
-        tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); \
-        tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2); \
-        R1 = tmp1; \
-        R2 = tmp2; \
+        /*tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); */ \
+        itmp1 = _mm_unpacklo_epi8(R1, R2); \
+        /*tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2);*/ \
+        itmp2 = _mm_unpackhi_epi8(R1, R2); \
+        R1 = itmp1; \
+        R2 = itmp2; \
 }
 
 #define MB4(R1, R2, R3, R4) { \
         BLND2_16(R1, R2) \
         BLND2_16(R3, R4) \
-        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3); \
-        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3); \
-        R1 = tmp1; \
-        R3 = tmp2; \
-        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
-        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4); \
-        R2 = tmp1; \
-        R4 = tmp2; \
+        /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3);*/  \
+        itmp1 = _mm_unpacklo_epi16(R1, R3); \
+        /* tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3);*/ \
+        itmp2 = _mm_unpackhi_epi16(R1, R3); \
+        R1 = itmp1; \
+        R3 = itmp2; \
+        /*R1 = tmp1; \
+        R3 = tmp2;*/ \
+        /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
+        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4);*/ \
+        itmp1 = _mm_unpacklo_epi16(R2, R4); \
+        itmp2 = _mm_unpackhi_epi16(R2, R4); \
+        R2 = itmp1; \
+        R4 = itmp2; \
+        /*R2 = tmp1; \
+        R4 = tmp2;*/ \
 }
 
 #define BLND2_4(R1, R2) { \
-        tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
-        tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); \
-        R1 = tmp1; \
-        R2 = tmp2; \
+        /* tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
+        tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); */ \
+        itmp1 = _mm_unpacklo_epi32(R1, R2); \
+        itmp2 = _mm_unpackhi_epi32(R1, R2); \
+        R1 = itmp1; \
+        R2 = itmp2; \
+        /*R1 = tmp1; \
+        R2 = tmp2; */\
 }
 
 #define BLND2_2(R1, R2) { \
-        tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
-        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
-        tmp1.store(pDst);   pDst += dstStride; \
-        tmp2.store(pDst);   pDst += dstStride; \
-}
-
-#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
-        MB4(R1, R2, R3, R4) \
-        MB4(R5, R6, R7, R8) \
-        BLND2_4(R1, R5); \
-        BLND2_4(R2, R6); \
-        BLND2_4(R3, R7); \
-        BLND2_4(R4, R8); \
+        /*tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2);*/ \
+        itmp1 = _mm_unpacklo_epi64(R1, R2); \
+        itmp2 = _mm_unpackhi_epi64(R1, R2); \
+        /*tmp1.store(pDst); */ \
+        _mm_storeu_si128((__m128i*)pDst, itmp1); \
+        pDst += dstStride; \
+        /*tmp2.store(pDst);*/ \
+        _mm_storeu_si128((__m128i*)pDst, itmp2); \
+        pDst += dstStride; \
 }
 
 #define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
@@ -2746,20 +2779,12 @@ void xPredIntraAng8x8(int bitDepth, pixe
         PREDANG_CALCROW_HOR(5 + X, R6) \
         PREDANG_CALCROW_HOR(6 + X, R7) \
         PREDANG_CALCROW_HOR(7 + X, R8) \
-        MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
-}
-
-#define MB16 { \
-        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0) \
-        CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8) \
-        BLND2_2(R1, R9) \
-        BLND2_2(R5, R13) \
-        BLND2_2(R3, R11) \
-        BLND2_2(R7, R15) \
-        BLND2_2(R2, R10) \
-        BLND2_2(R6, R14) \
-        BLND2_2(R4, R12) \
-        BLND2_2(R8, R16) \
+        MB4(R1, R2, R3, R4) \
+        MB4(R5, R6, R7, R8) \
+        BLND2_4(R1, R5); \
+        BLND2_4(R2, R6); \
+        BLND2_4(R3, R7); \
+        BLND2_4(R4, R8); \
 }
 
 void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
@@ -2887,23 +2912,25 @@ void xPredIntraAng16x16(int bitDepth, pi
         else
         {
             Vec16uc v_main;
-            v_main.load(refMain + 1);
-            v_main.store(pDst);
-            v_main.store(pDst + dstStride);
-            v_main.store(pDst + (2 * dstStride));
-            v_main.store(pDst + (3 * dstStride));
-            v_main.store(pDst + (4 * dstStride));
-            v_main.store(pDst + (5 * dstStride));
-            v_main.store(pDst + (6 * dstStride));
-            v_main.store(pDst + (7 * dstStride));
-            v_main.store(pDst + (8 * dstStride));
-            v_main.store(pDst + (9 * dstStride));
-            v_main.store(pDst + (10 * dstStride));
-            v_main.store(pDst + (11 * dstStride));
-            v_main.store(pDst + (12 * dstStride));
-            v_main.store(pDst + (13 * dstStride));
-            v_main.store(pDst + (14 * dstStride));
-            v_main.store(pDst + (15 * dstStride));
+//            v_main.load(refMain + 1);
+            v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
+
+            _mm_storeu_si128((__m128i*)pDst, v_main);
+            _mm_storeu_si128((__m128i*)(pDst + dstStride), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (2 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (3 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (4 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (5 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (6 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (7 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (8 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (9 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (10 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (11 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (12 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (13 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (14 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (15 * dstStride)), v_main);
 
             Vec16uc v_temp;
             Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
@@ -2953,106 +2980,131 @@ void xPredIntraAng16x16(int bitDepth, pi
         refMain[0] = refMain0;
 
         Vec16uc tmp;
-        tmp.load(refMain);        //-1,0,1,2
-        tmp.store(pDst);
-        tmp.load(--refMain);     //-2,-1,0,1
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
+        __m128i itmp;
+//        tmp.load(refMain);        //-1,0,1,2
+//        tmp.store(pDst);
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
         pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
         pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
         pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
         pDst += dstStride;
-        tmp.store(pDst);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+/*
         tmp.load(--refMain);
         pDst += dstStride;
         tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
+        ... 14 times more
+*/
         return;
     }
     else if (intraPredAngle == 32)
     {
         Vec8s tmp;
-
-        tmp.load(refMain + 2);
-        tmp.store(pDst);
-        tmp.load(refMain + 3);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 4);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 5);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 6);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 7);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 8);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 9);
+        __m128i itmp;
+        refMain += 2;
+
+//        tmp.load(refMain++);
+//        tmp.store(pDst);
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+/*
+        tmp.load(refMain++);
         pDst += dstStride;
         tmp.store(pDst);
-        tmp.load(refMain + 10);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 11);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 12);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 13);
+        ... 14 times more
+*/
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
         pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 14);
-        pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 15);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
         pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 16);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
         pDst += dstStride;
-        tmp.store(pDst);
-        tmp.load(refMain + 17);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
         pDst += dstStride;
-        tmp.store(pDst);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
         return;
     }
     else
@@ -3066,7 +3118,18 @@ void xPredIntraAng16x16(int bitDepth, pi
             Vec16uc tmp1, tmp2;
             v_deltaPos = 0;
             v_ipAngle = intraPredAngle;
-            MB16;
+            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+//            MB16;
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
         }
         else
         {
@@ -3076,6 +3139,7 @@ void xPredIntraAng16x16(int bitDepth, pi
             Vec8s tmp1, tmp2;
             v_deltaPos = 0;
             v_ipAngle = intraPredAngle;
+            __m128i itmp, it1, it2, it3, i16;
 
             PREDANG_CALCROW_VER(0);
             PREDANG_CALCROW_VER(1);
@@ -3104,105 +3168,220 @@ void xPredIntraAng16x16(int bitDepth, pi
 #undef BLND2_16
 #undef BLND2_2
 #undef BLND2_4
-#undef MB16
-#undef MB8
 #undef MB4
 #undef CALC_BLND_8ROWS
 #endif /* if HIGH_BIT_DEPTH */
 
+//32x32
 #if HIGH_BIT_DEPTH
 #else
 #define PREDANG_CALCROW_VER(X) { \
-        v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1; \
-        LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
-        LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
-        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        compress(row11L, row11H).store(pDst + ((X)*dstStride)); \
-        LOADROW(row11L, row11H, GETAP(lookIdx, X) + 16); \
-        LOADROW(row12L, row12H, GETAP(lookIdx, X) + 17); \
-        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        compress(row11L, row11H).store(pDst + ((X)*dstStride) + 16); \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
+        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
+        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11L = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11H = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(row11L, row11H); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
+        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
+        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11L = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11H = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(row11L, row11H); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
 }
 
 #define PREDANG_CALCROW_VER_MODE2(X) { \
-        v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1; \
-        CALCROW(res1, res2, row11, row12, row21, row22); \
-        compress(res1, res2).store(pDst + ((X)*dstStride)); \
-        CALCROW(res1, res2, row13, row14, row23, row24); \
-        compress(res1, res2).store(pDst + ((X)*dstStride) + 16); \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row21); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res1 = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row12); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row22); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res2 = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(res1, res2); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row13); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row23); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res1 = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row14); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row24); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res2 = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(res1, res2); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
 }
 
 #define PREDANG_CALCROW_HOR(X, rowx) { \
-        LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
-        LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
-        v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1; \
-        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        rowx = compress(row11L, row11H); \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
+        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
+        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11L = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11H = _mm_srai_epi16(it2, 5); \
+  \
+        rowx = _mm_packus_epi16(row11L, row11H); \
 }
 
 #define PREDANG_CALCROW_HOR_MODE2(rowx) { \
-        v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1; \
-        CALCROW(res1, res2, row11L, row11H, row12L, row12H); \
-        rowx = compress(res1, res2); \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res1 = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res2 = _mm_srai_epi16(it2, 5); \
+  \
+        rowx = _mm_packus_epi16(res1, res2); \
 }
 
 // ROWL/H is a Vec8s variable, X is the index in of data to be loaded
 #define LOADROW(ROWL, ROWH, X) { \
-        tmp.load(refMain + 1 + (X)); \
+/*        tmp.load(refMain + 1 + (X)); \
         ROWL = extend_low(tmp); \
-        ROWH = extend_high(tmp); \
-}
-
-#define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
-        RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
-        RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5; \
-}
-
-#define  BLND2_16(R1, R2) { \
-        tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); \
-        tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2); \
-        R1 = tmp1; \
-        R2 = tmp2; \
-}
-
-#define MB4(R1, R2, R3, R4) { \
-        BLND2_16(R1, R2) \
-        BLND2_16(R3, R4) \
-        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3); \
-        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3); \
-        R1 = tmp1; \
-        R3 = tmp2; \
-        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
-        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4); \
-        R2 = tmp1; \
-        R4 = tmp2; \
-}
-
-#define BLND2_4(R1, R2) { \
-        tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
-        tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); \
-        R1 = tmp1; \
-        R2 = tmp2; \
+        ROWH = extend_high(tmp); */\
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
+        ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
 }
 
 #define BLND2_2(R1, R2) { \
-        tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+/*        tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
         tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
         tmp1.store(pDst);   pDst += dstStride; \
-        tmp2.store(pDst);   pDst += dstStride; \
+        tmp2.store(pDst);   pDst += dstStride; */\
+        itmp1 = _mm_unpacklo_epi64(R1, R2); \
+        itmp2 = _mm_unpackhi_epi64(R1, R2); \
+        _mm_storeu_si128((__m128i*)pDst, itmp1); \
+        pDst += dstStride; \
+        _mm_storeu_si128((__m128i*)pDst, itmp2); \
+        pDst += dstStride; \
 }
 
 #define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
-        MB4(R1, R2, R3, R4) \
-        MB4(R5, R6, R7, R8) \
-        BLND2_4(R1, R5); \
-        BLND2_4(R2, R6); \
-        BLND2_4(R3, R7); \
-        BLND2_4(R4, R8); \
+        itmp1 = _mm_unpacklo_epi8(R1, R2); \
+        itmp2 = _mm_unpackhi_epi8(R1, R2); \
+        R1 = itmp1; \
+        R2 = itmp2; \
+        itmp1 = _mm_unpacklo_epi8(R3, R4); \
+        itmp2 = _mm_unpackhi_epi8(R3, R4); \
+        R3 = itmp1; \
+        R4 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R1, R3); \
+        itmp2 = _mm_unpackhi_epi16(R1, R3); \
+        R1 = itmp1; \
+        R3 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R2, R4); \
+        itmp2 = _mm_unpackhi_epi16(R2, R4); \
+        R2 = itmp1; \
+        R4 = itmp2; \
+        itmp1 = _mm_unpacklo_epi8(R5, R6); \
+        itmp2 = _mm_unpackhi_epi8(R5, R6); \
+        R5 = itmp1; \
+        R6 = itmp2; \
+        itmp1 = _mm_unpacklo_epi8(R7, R8); \
+        itmp2 = _mm_unpackhi_epi8(R7, R8); \
+        R7 = itmp1; \
+        R8 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R5, R7); \
+        itmp2 = _mm_unpackhi_epi16(R5, R7); \
+        R5 = itmp1; \
+        R7 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R6, R8); \
+        itmp2 = _mm_unpackhi_epi16(R6, R8); \
+        R6 = itmp1; \
+        R8 = itmp2; \
+        itmp1 = _mm_unpacklo_epi32(R1, R5); \
+        itmp2 = _mm_unpackhi_epi32(R1, R5); \
+        R1 = itmp1; \
+        R5 = itmp2; \
+  \
+        itmp1 = _mm_unpacklo_epi32(R2, R6); \
+        itmp2 = _mm_unpackhi_epi32(R2, R6); \
+        R2 = itmp1; \
+        R6 = itmp2; \
+  \
+        itmp1 = _mm_unpacklo_epi32(R3, R7); \
+        itmp2 = _mm_unpackhi_epi32(R3, R7); \
+        R3 = itmp1; \
+        R7 = itmp2; \
+  \
+        itmp1 = _mm_unpacklo_epi32(R4, R8); \
+        itmp2 = _mm_unpackhi_epi32(R4, R8); \
+        R4 = itmp1; \
+        R8 = itmp2; \
 }
 
 #define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
@@ -3225,81 +3404,6 @@ void xPredIntraAng16x16(int bitDepth, pi
         PREDANG_CALCROW_HOR_MODE2(R7) \
 }
 
-#define MB16 { \
-        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0) \
-        PREDANG_CALCROW_HOR(7 + 0, R8) \
-        MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
-        CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8) \
-        PREDANG_CALCROW_HOR(7 + 8, R16) \
-        MB8(R9, R10, R11, R12, R13, R14, R15, R16) \
-        BLND2_2(R1, R9) \
-        BLND2_2(R5, R13) \
-        BLND2_2(R3, R11) \
-        BLND2_2(R7, R15) \
-        BLND2_2(R2, R10) \
-        BLND2_2(R6, R14) \
-        BLND2_2(R4, R12) \
-        BLND2_2(R8, R16) \
-}
-
-#define MB16RESCUE { \
-        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16) \
-        PREDANG_CALCROW_HOR(7 + 16, R8) \
-        MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
-        CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24) \
-        R16.load(refMain + 1 + GETAP(lookIdx, 31)); \
-        MB8(R9, R10, R11, R12, R13, R14, R15, R16) \
-        BLND2_2(R1, R9) \
-        BLND2_2(R5, R13) \
-        BLND2_2(R3, R11) \
-        BLND2_2(R7, R15) \
-        BLND2_2(R2, R10) \
-        BLND2_2(R6, R14) \
-        BLND2_2(R4, R12) \
-        BLND2_2(R8, R16) \
-}
-
-#define MB16MODE2 { \
-        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) \
-        PREDANG_CALCROW_HOR_MODE2(R8) \
-        MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
-        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16) \
-        MB8(R9, R10, R11, R12, R13, R14, R15, R16) \
-        BLND2_2(R1, R9) \
-        BLND2_2(R5, R13) \
-        BLND2_2(R3, R11) \
-        BLND2_2(R7, R15) \
-        BLND2_2(R2, R10) \
-        BLND2_2(R6, R14) \
-        BLND2_2(R4, R12) \
-        BLND2_2(R8, R16) \
-}
-
-#define BROADSTORE(X) { \
-        tmp1 = permute16uc<X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X>(v_temp); \
-        tmp1.store(pDst + (X * dstStride));     /*rowX*/ \
-        tmp1.store(pDst + (X * dstStride) + 16); \
-}
-
-#define BROADSTORE16ROWS { \
-        BROADSTORE(0) \
-        BROADSTORE(1) \
-        BROADSTORE(2) \
-        BROADSTORE(3) \
-        BROADSTORE(4) \
-        BROADSTORE(5) \
-        BROADSTORE(6) \
-        BROADSTORE(7) \
-        BROADSTORE(8) \
-        BROADSTORE(9) \
-        BROADSTORE(10) \
-        BROADSTORE(11) \
-        BROADSTORE(12) \
-        BROADSTORE(13) \
-        BROADSTORE(14) \
-        BROADSTORE(15) \
-}
-
 #if 0 /* disable temporarily, to save MSVC build times */
 void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
 {
@@ -3356,10 +3460,107 @@ void xPredIntraAng32x32(int /*bitDepth*/
             Vec16uc v_temp, tmp1;
 
             v_temp.load(refMain + 1);
-            BROADSTORE16ROWS;
+            /*BROADSTORE16ROWS;*/
+            tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+            tmp1.store(pDst + (0 * dstStride));
+            tmp1.store(pDst + (0 * dstStride) + 16);
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+            tmp1.store(pDst + (1 * dstStride));
+            tmp1.store(pDst + (1 * dstStride) + 16);
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+            tmp1.store(pDst + (2 * dstStride));
+            tmp1.store(pDst + (2 * dstStride) + 16);
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+            tmp1.store(pDst + (3 * dstStride));
+            tmp1.store(pDst + (3 * dstStride) + 16);
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+            tmp1.store(pDst + (4 * dstStride));
+            tmp1.store(pDst + (4 * dstStride) + 16);
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+            tmp1.store(pDst + (5 * dstStride));
+            tmp1.store(pDst + (5 * dstStride) + 16);
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+            tmp1.store(pDst + (6 * dstStride));
+            tmp1.store(pDst + (6 * dstStride) + 16);
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+            tmp1.store(pDst + (7 * dstStride));
+            tmp1.store(pDst + (7 * dstStride) + 16);
+            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+            tmp1.store(pDst + (8 * dstStride));
+            tmp1.store(pDst + (8 * dstStride) + 16);
+            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+            tmp1.store(pDst + (9 * dstStride));
+            tmp1.store(pDst + (9 * dstStride) + 16);
+            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+            tmp1.store(pDst + (10 * dstStride));
+            tmp1.store(pDst + (10 * dstStride) + 16);
+            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+            tmp1.store(pDst + (11 * dstStride));
+            tmp1.store(pDst + (11 * dstStride) + 16);
+            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+            tmp1.store(pDst + (12 * dstStride));
+            tmp1.store(pDst + (12 * dstStride) + 16);
+            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+            tmp1.store(pDst + (13 * dstStride));
+            tmp1.store(pDst + (13 * dstStride) + 16);
+            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+            tmp1.store(pDst + (14 * dstStride));
+            tmp1.store(pDst + (14 * dstStride) + 16);
+            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+            tmp1.store(pDst + (15 * dstStride));
+            tmp1.store(pDst + (15 * dstStride) + 16);
+
             pDst += 16 * dstStride;
             v_temp.load(refMain + 1 + 16);
-            BROADSTORE16ROWS;
+            /*BROADSTORE16ROWS;*/
+            tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+            tmp1.store(pDst + (0 * dstStride));
+            tmp1.store(pDst + (0 * dstStride) + 16);
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+            tmp1.store(pDst + (1 * dstStride));
+            tmp1.store(pDst + (1 * dstStride) + 16);
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+            tmp1.store(pDst + (2 * dstStride));
+            tmp1.store(pDst + (2 * dstStride) + 16);
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+            tmp1.store(pDst + (3 * dstStride));
+            tmp1.store(pDst + (3 * dstStride) + 16);
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+            tmp1.store(pDst + (4 * dstStride));
+            tmp1.store(pDst + (4 * dstStride) + 16);
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+            tmp1.store(pDst + (5 * dstStride));
+            tmp1.store(pDst + (5 * dstStride) + 16);
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+            tmp1.store(pDst + (6 * dstStride));
+            tmp1.store(pDst + (6 * dstStride) + 16);
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+            tmp1.store(pDst + (7 * dstStride));
+            tmp1.store(pDst + (7 * dstStride) + 16);
+            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+            tmp1.store(pDst + (8 * dstStride));
+            tmp1.store(pDst + (8 * dstStride) + 16);
+            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+            tmp1.store(pDst + (9 * dstStride));
+            tmp1.store(pDst + (9 * dstStride) + 16);
+            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+            tmp1.store(pDst + (10 * dstStride));
+            tmp1.store(pDst + (10 * dstStride) + 16);
+            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+            tmp1.store(pDst + (11 * dstStride));
+            tmp1.store(pDst + (11 * dstStride) + 16);
+            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+            tmp1.store(pDst + (12 * dstStride));
+            tmp1.store(pDst + (12 * dstStride) + 16);
+            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+            tmp1.store(pDst + (13 * dstStride));
+            tmp1.store(pDst + (13 * dstStride) + 16);
+            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+            tmp1.store(pDst + (14 * dstStride));
+            tmp1.store(pDst + (14 * dstStride) + 16);
+            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+            tmp1.store(pDst + (15 * dstStride));
+            tmp1.store(pDst + (15 * dstStride) + 16);
         }
         else
         {
@@ -3821,6 +4022,7 @@ void xPredIntraAng32x32(int /*bitDepth*/
             Pel * original_pDst = pDst;
             v_deltaPos = 0;
             v_ipAngle = intraPredAngle;
+            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
 
             switch (intraPredAngle)
             {
@@ -3828,82 +4030,245 @@ void xPredIntraAng32x32(int /*bitDepth*/
                 LOADROW(row11L, row11H, -1)
                 LOADROW(row12L, row12H,  0)
                 R16 = compress(row11L, row11H);
-                MB16MODE2
-
-                    v_deltaPos += v_ipAngle;
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos += v_ipAngle;
                 v_deltaFract = v_deltaPos & thirty1;
                 row12L = row11L;
                 row12H = row11H;
                 LOADROW(row11L, row11H, -2)
                 R16 = compress(row11L, row11H);
                 pDst = original_pDst + 16;
-                MB16MODE2
-
-                    pDst = original_pDst + (16 * dstStride);
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                pDst = original_pDst + (16 * dstStride);
                 refMain += 16;
                 v_deltaPos = 0;
                 v_ipAngle = intraPredAngle;
                 LOADROW(row11L, row11H, -1)
                 LOADROW(row12L, row12H,  0)
                 R16 = compress(row11L, row11H);
-                MB16MODE2
-
-                    v_deltaPos += v_ipAngle;
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos += v_ipAngle;
                 v_deltaFract = v_deltaPos & thirty1;
                 row12L = row11L;
                 row12H = row11H;
                 LOADROW(row11L, row11H, -2)
                 R16 = compress(row11L, row11H);
                 pDst = original_pDst + (16 * dstStride) + 16;
-                MB16MODE2
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
                 return;
 
             case  2:
                 LOADROW(row11L, row11H, 0)
                 LOADROW(row12L, row12H, 1)
                 R16 = compress(row12L, row12H);
-                MB16MODE2
-
-                    v_deltaPos += v_ipAngle;
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos += v_ipAngle;
                 v_deltaFract = v_deltaPos & thirty1;
                 row11L = row12L;
                 row11H = row12H;
                 LOADROW(row12L, row12H, 2)
                 R16 = compress(row12L, row12H);
                 pDst = original_pDst + 16;
-                MB16MODE2
-
-                    pDst = original_pDst + (16 * dstStride);
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                pDst = original_pDst + (16 * dstStride);
                 refMain += 16;
                 v_deltaPos = 0;
                 v_ipAngle = intraPredAngle;
                 LOADROW(row11L, row11H, 0)
                 LOADROW(row12L, row12H, 1)
                 R16 = compress(row12L, row12H);
-                MB16MODE2
-
-                    v_deltaPos += v_ipAngle;
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos += v_ipAngle;
                 v_deltaFract = v_deltaPos & thirty1;
                 row11L = row12L;
                 row11H = row12H;
                 LOADROW(row12L, row12H, 2)
                 R16 = compress(row12L, row12H);
                 pDst = original_pDst + (16 * dstStride) + 16;
-                MB16MODE2
+//                MB16MODE2
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
                 return;
             }
 
-            MB16;
+//            MB16;
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            PREDANG_CALCROW_HOR(7 + 0, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            PREDANG_CALCROW_HOR(7 + 8, R16)
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+
             pDst = original_pDst + 16;
-            MB16RESCUE;
+//            MB16RESCUE;
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+            PREDANG_CALCROW_HOR(7 + 16, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+            /*R16.load(refMain + 1 + GETAP(lookIdx, 31)); */
+            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
 
             pDst = original_pDst + (16 * dstStride);
             refMain += 16;
             v_deltaPos = 0;
             v_ipAngle = intraPredAngle;
-            MB16;
+//            MB16;
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            PREDANG_CALCROW_HOR(7 + 0, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            PREDANG_CALCROW_HOR(7 + 8, R16)
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
             pDst = original_pDst + (16 * dstStride) + 16;
-            MB16RESCUE;
+//            MB16RESCUE;
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+            PREDANG_CALCROW_HOR(7 + 16, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+            /*R16.load(refMain + 1 + GETAP(lookIdx, 31)); */
+            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
         }
         else
         {
@@ -3914,6 +4279,7 @@ void xPredIntraAng32x32(int /*bitDepth*/
             Vec8s tmp1, tmp2, res1, res2;
             v_deltaPos = 0;
             v_ipAngle = intraPredAngle;
+            __m128i itmp, it1, it2, it3, i16;
 
             switch (intraPredAngle)
             {
@@ -3922,21 +4288,10 @@ void xPredIntraAng32x32(int /*bitDepth*/
                 LOADROW(row21, row22,  0)
                 LOADROW(row13, row14, 15)
                 LOADROW(row23, row24, 16)
-                PREDANG_CALCROW_VER_MODE2(0);
-                PREDANG_CALCROW_VER_MODE2(1);
-                PREDANG_CALCROW_VER_MODE2(2);
-                PREDANG_CALCROW_VER_MODE2(3);
-                PREDANG_CALCROW_VER_MODE2(4);
-                PREDANG_CALCROW_VER_MODE2(5);
-                PREDANG_CALCROW_VER_MODE2(6);
-                PREDANG_CALCROW_VER_MODE2(7);
-                PREDANG_CALCROW_VER_MODE2(8);
-                PREDANG_CALCROW_VER_MODE2(9);
-                PREDANG_CALCROW_VER_MODE2(10);
-                PREDANG_CALCROW_VER_MODE2(11);
-                PREDANG_CALCROW_VER_MODE2(12);
-                PREDANG_CALCROW_VER_MODE2(13);
-                PREDANG_CALCROW_VER_MODE2(14);
+                for (int i = 0; i <= 14; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
 
                 //deltaFract == 0 for 16th row
                 v_deltaPos += v_ipAngle;
@@ -3951,22 +4306,10 @@ void xPredIntraAng32x32(int /*bitDepth*/
 
                 LOADROW(row11, row12, -2)
                 LOADROW(row13, row14, 14)
-
-                PREDANG_CALCROW_VER_MODE2(16);
-                PREDANG_CALCROW_VER_MODE2(17);
-                PREDANG_CALCROW_VER_MODE2(18);
-                PREDANG_CALCROW_VER_MODE2(19);
-                PREDANG_CALCROW_VER_MODE2(20);
-                PREDANG_CALCROW_VER_MODE2(21);
-                PREDANG_CALCROW_VER_MODE2(22);
-                PREDANG_CALCROW_VER_MODE2(23);
-                PREDANG_CALCROW_VER_MODE2(24);
-                PREDANG_CALCROW_VER_MODE2(25);
-                PREDANG_CALCROW_VER_MODE2(26);
-                PREDANG_CALCROW_VER_MODE2(27);
-                PREDANG_CALCROW_VER_MODE2(28);
-                PREDANG_CALCROW_VER_MODE2(29);
-                PREDANG_CALCROW_VER_MODE2(30);
+                for (int i = 16; i <= 30; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
 
                 compress(row11, row12).store(pDst + (31 * dstStride));
                 compress(row13, row14).store(pDst + (31 * dstStride) + 16);
@@ -3979,21 +4322,10 @@ void xPredIntraAng32x32(int /*bitDepth*/
                 LOADROW(row21, row22, 1)
                 LOADROW(row13, row14, 16)
                 LOADROW(row23, row24, 17)
-                PREDANG_CALCROW_VER_MODE2(0);
-                PREDANG_CALCROW_VER_MODE2(1);
-                PREDANG_CALCROW_VER_MODE2(2);
-                PREDANG_CALCROW_VER_MODE2(3);
-                PREDANG_CALCROW_VER_MODE2(4);
-                PREDANG_CALCROW_VER_MODE2(5);
-                PREDANG_CALCROW_VER_MODE2(6);
-                PREDANG_CALCROW_VER_MODE2(7);
-                PREDANG_CALCROW_VER_MODE2(8);
-                PREDANG_CALCROW_VER_MODE2(9);
-                PREDANG_CALCROW_VER_MODE2(10);
-                PREDANG_CALCROW_VER_MODE2(11);
-                PREDANG_CALCROW_VER_MODE2(12);
-                PREDANG_CALCROW_VER_MODE2(13);
-                PREDANG_CALCROW_VER_MODE2(14);
+                for (int i = 0; i <= 14; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
 
                 //deltaFract == 0 for 16th row
                 v_deltaPos += v_ipAngle;
@@ -4008,22 +4340,10 @@ void xPredIntraAng32x32(int /*bitDepth*/
 
                 LOADROW(row21, row22, 2)
                 LOADROW(row23, row24, 18)
-
-                PREDANG_CALCROW_VER_MODE2(16);
-                PREDANG_CALCROW_VER_MODE2(17);
-                PREDANG_CALCROW_VER_MODE2(18);
-                PREDANG_CALCROW_VER_MODE2(19);
-                PREDANG_CALCROW_VER_MODE2(20);
-                PREDANG_CALCROW_VER_MODE2(21);
-                PREDANG_CALCROW_VER_MODE2(22);
-                PREDANG_CALCROW_VER_MODE2(23);
-                PREDANG_CALCROW_VER_MODE2(24);
-                PREDANG_CALCROW_VER_MODE2(25);
-                PREDANG_CALCROW_VER_MODE2(26);
-                PREDANG_CALCROW_VER_MODE2(27);
-                PREDANG_CALCROW_VER_MODE2(28);
-                PREDANG_CALCROW_VER_MODE2(29);
-                PREDANG_CALCROW_VER_MODE2(30);
+                for (int i = 16; i <= 30; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
 
                 compress(row21, row22).store(pDst + (31 * dstStride));
                 compress(row23, row24).store(pDst + (31 * dstStride) + 16);
@@ -4031,37 +4351,10 @@ void xPredIntraAng32x32(int /*bitDepth*/
                 return;
             }
 
-            PREDANG_CALCROW_VER(0);
-            PREDANG_CALCROW_VER(1);
-            PREDANG_CALCROW_VER(2);
-            PREDANG_CALCROW_VER(3);
-            PREDANG_CALCROW_VER(4);
-            PREDANG_CALCROW_VER(5);
-            PREDANG_CALCROW_VER(6);
-            PREDANG_CALCROW_VER(7);
-            PREDANG_CALCROW_VER(8);
-            PREDANG_CALCROW_VER(9);
-            PREDANG_CALCROW_VER(10);
-            PREDANG_CALCROW_VER(11);
-            PREDANG_CALCROW_VER(12);
-            PREDANG_CALCROW_VER(13);
-            PREDANG_CALCROW_VER(14);
-            PREDANG_CALCROW_VER(15);
-            PREDANG_CALCROW_VER(16);
-            PREDANG_CALCROW_VER(17);
-            PREDANG_CALCROW_VER(18);
-            PREDANG_CALCROW_VER(19);
-            PREDANG_CALCROW_VER(20);
-            PREDANG_CALCROW_VER(21);
-            PREDANG_CALCROW_VER(22);
-            PREDANG_CALCROW_VER(23);
-            PREDANG_CALCROW_VER(24);
-            PREDANG_CALCROW_VER(25);
-            PREDANG_CALCROW_VER(26);
-            PREDANG_CALCROW_VER(27);
-            PREDANG_CALCROW_VER(28);
-            PREDANG_CALCROW_VER(29);
-            PREDANG_CALCROW_VER(30);
+            for (int i = 0; i <= 30; i++)
+            {
+                PREDANG_CALCROW_VER(i);
+            }
 
             tmp.load(refMain + GETAP(lookIdx, 31) + 1);
             tmp.store(pDst + 31 * dstStride);
@@ -4070,6 +4363,7 @@ void xPredIntraAng32x32(int /*bitDepth*/
         }
     }
 }
+
 #endif /* disable temporarily */
 
 #endif /* if HIGH_BIT_DEPTH */