changeset 1694:813e9c90cbf6

Merged in deepthidevaki/xhevc_deepthid (pull request #170) IntraPredAng 16x16 with all modes
author Steve Borho <steve@borho.org>
date Fri, 24 May 2013 11:36:26 -0500
parents e3ed785d727f (current diff) 73316a3addc5 (diff)
children d14951b6324a
files
diffstat 2 files changed, 479 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/source/VectorClass/vectori128.h	Fri May 24 18:06:33 2013 +0530
+++ b/source/VectorClass/vectori128.h	Fri May 24 11:36:26 2013 -0500
@@ -118,7 +118,7 @@ public:
     }
     // Member function to change a single bit
     // Note: This function is inefficient. Use load function if changing more than one bit
-    Vec128b const & set_bit(uint32_t index, int value) {
+    ALWAYSINLINE Vec128b const & set_bit(uint32_t index, int value) {
         static const union {
             uint64_t i[4];
             __m128i  x[2];
@@ -137,7 +137,7 @@ public:
     }
     // Member function to get a single bit
     // Note: This function is inefficient. Use store function if reading more than one bit
-    int get_bit(uint32_t index) const {
+    ALWAYSINLINE int get_bit(uint32_t index) const {
         union {
             __m128i x;
             uint8_t i[16];
@@ -149,7 +149,7 @@ public:
     }
     // Extract a single element. Use store function if extracting more than one element.
     // Operator [] can only read an element, not write.
-    bool operator [] (uint32_t index) const {
+    ALWAYSINLINE bool operator [] (uint32_t index) const {
         return get_bit(index) != 0;
     }
 };
@@ -1669,41 +1669,41 @@ static ALWAYSINLINE Vec8us min(Vec8us co
 class Vec4i : public Vec128b {
 public:
     // Default constructor:
-    Vec4i() {
+    ALWAYSINLINE Vec4i() {
     };
     // Constructor to broadcast the same value into all elements:
-    Vec4i(int i) {
+    ALWAYSINLINE Vec4i(int i) {
         xmm = _mm_set1_epi32(i);
     };
     // Constructor to build from all elements:
-    Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) {
+    ALWAYSINLINE Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) {
         xmm = _mm_setr_epi32(i0, i1, i2, i3);
     };
     // Constructor to convert from type __m128i used in intrinsics:
-    Vec4i(__m128i const & x) {
+    ALWAYSINLINE Vec4i(__m128i const & x) {
         xmm = x;
     };
     // Assignment operator to convert from type __m128i used in intrinsics:
-    Vec4i & operator = (__m128i const & x) {
+    ALWAYSINLINE Vec4i & operator = (__m128i const & x) {
         xmm = x;
         return *this;
     };
     // Type cast operator to convert to __m128i used in intrinsics
-    operator __m128i() const {
+    ALWAYSINLINE operator __m128i() const {
         return xmm;
     };
     // Member function to load from array (unaligned)
-    Vec4i & load(void const * p) {
+    ALWAYSINLINE Vec4i & load(void const * p) {
         xmm = _mm_loadu_si128((__m128i const*)p);
         return *this;
     }
     // Member function to load from array (aligned)
-    Vec4i & load_a(void const * p) {
+    ALWAYSINLINE Vec4i & load_a(void const * p) {
         xmm = _mm_load_si128((__m128i const*)p);
         return *this;
     }
     // Partial load. Load n elements and set the rest to 0
-    Vec4i & load_partial(int n, void const * p) {
+    ALWAYSINLINE Vec4i & load_partial(int n, void const * p) {
         switch (n) {
         case 0:
             *this = 0;  break;
@@ -1722,7 +1722,7 @@ public:
         return *this;
     }
     // Partial store. Store n elements
-    void store_partial(int n, void * p) const {
+    ALWAYSINLINE void store_partial(int n, void * p) const {
         union {        
             int32_t i[4];
             int64_t q[2];
@@ -1745,13 +1745,13 @@ public:
         }
     }
     // cut off vector to n elements. The last 4-n elements are set to zero
-    Vec4i & cutoff(int n) {
+    ALWAYSINLINE Vec4i & cutoff(int n) {
         *this = Vec16c(xmm).cutoff(n * 4);
         return *this;
     }
     // Member function to change a single element in vector
     // Note: This function is inefficient. Use load function if changing more than one element
-    Vec4i const & insert(uint32_t index, int32_t value) {
+    ALWAYSINLINE Vec4i const & insert(uint32_t index, int32_t value) {
         static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0};
         __m128i broad = _mm_set1_epi32(value);  // broadcast value into all elements
         __m128i mask  = _mm_loadu_si128((__m128i const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position
@@ -1759,14 +1759,14 @@ public:
         return *this;
     };
     // Member function extract a single element from vector
-    int32_t extract(uint32_t index) const {
+    ALWAYSINLINE int32_t extract(uint32_t index) const {
         int32_t x[4];
         store(x);
         return x[index & 3];
     }
     // Extract a single element. Use store function if extracting more than one element.
     // Operator [] can only read an element, not write.
-    int32_t operator [] (uint32_t index) const {
+    ALWAYSINLINE int32_t operator [] (uint32_t index) const {
         return extract(index);
     }
 };
@@ -2322,10 +2322,10 @@ static ALWAYSINLINE Vec4ui min(Vec4ui co
 class Vec2q : public Vec128b {
 public:
     // Default constructor:
-    Vec2q() {
+    ALWAYSINLINE Vec2q() {
     };
     // Constructor to broadcast the same value into all elements:
-    Vec2q(int64_t i) {
+    ALWAYSINLINE Vec2q(int64_t i) {
 #if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
         // MS compiler has no _mm_set1_epi64x
 #if defined(__x86_64__)                                    // 64 bit mode
@@ -2354,7 +2354,7 @@ public:
 #endif
     };
     // Constructor to build from all elements:
-    Vec2q(int64_t i0, int64_t i1) {
+    ALWAYSINLINE Vec2q(int64_t i0, int64_t i1) {
 #if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
         // MS compiler has no _mm_set_epi64x     // !!
 #if defined(__x86_64__)                                    // 64 bit mode
@@ -2376,30 +2376,30 @@ public:
 #endif
     };
     // Constructor to convert from type __m128i used in intrinsics:
-    Vec2q(__m128i const & x) {
+    ALWAYSINLINE Vec2q(__m128i const & x) {
         xmm = x;
     };
     // Assignment operator to convert from type __m128i used in intrinsics:
-    Vec2q & operator = (__m128i const & x) {
+    ALWAYSINLINE Vec2q & operator = (__m128i const & x) {
         xmm = x;
         return *this;
     };
     // Type cast operator to convert to __m128i used in intrinsics
-    operator __m128i() const {
+    ALWAYSINLINE operator __m128i() const {
         return xmm;
     }
     // Member function to load from array (unaligned)
-    Vec2q & load(void const * p) {
+    ALWAYSINLINE Vec2q & load(void const * p) {
         xmm = _mm_loadu_si128((__m128i const*)p);
         return *this;
     }
     // Member function to load from array (aligned)
-    Vec2q & load_a(void const * p) {
+    ALWAYSINLINE Vec2q & load_a(void const * p) {
         xmm = _mm_load_si128((__m128i const*)p);
         return *this;
     }
     // Partial load. Load n elements and set the rest to 0
-    Vec2q & load_partial(int n, void const * p) {
+    ALWAYSINLINE Vec2q & load_partial(int n, void const * p) {
         switch (n) {
         case 0:
             *this = 0;  break;
@@ -2414,7 +2414,7 @@ public:
         return *this;
     }
     // Partial store. Store n elements
-    void store_partial(int n, void * p) const {
+    ALWAYSINLINE void store_partial(int n, void * p) const {
         switch (n) {
         case 1:
             int64_t q[2];
@@ -2427,13 +2427,13 @@ public:
         }
     }
     // cut off vector to n elements. The last 2-n elements are set to zero
-    Vec2q & cutoff(int n) {
+    ALWAYSINLINE Vec2q & cutoff(int n) {
         *this = Vec16c(xmm).cutoff(n * 8);
         return *this;
     }
     // Member function to change a single element in vector
     // Note: This function is inefficient. Use load function if changing more than one element
-    Vec2q const & insert(uint32_t index, int64_t value) {
+    ALWAYSINLINE Vec2q const & insert(uint32_t index, int64_t value) {
 #if INSTRSET >= 5 && defined(__x86_64__)  // SSE4.1 supported, 64 bit mode
         if (index == 0) {
             xmm = _mm_insert_epi64(xmm,value,0);
@@ -2464,14 +2464,14 @@ public:
         return *this;
     }
     // Member function extract a single element from vector
-    int64_t extract(uint32_t index) const {
+    ALWAYSINLINE int64_t extract(uint32_t index) const {
         int64_t x[2];
         store(x);
         return x[index & 1];
     }
     // Extract a single element. Use store function if extracting more than one element.
     // Operator [] can only read an element, not write.
-    int64_t operator [] (uint32_t index) const {
+    ALWAYSINLINE int64_t operator [] (uint32_t index) const {
         return extract(index);
     }
 };
--- a/source/encoder/vec/intrapred.inc	Fri May 24 18:06:33 2013 +0530
+++ b/source/encoder/vec/intrapred.inc	Fri May 24 11:36:26 2013 -0500
@@ -1923,12 +1923,12 @@ void xPredIntraAng8x8(int bitDepth, pixe
 
             v_temp.load(refMain + 1);
             Vec8s row0;
-            row0 = permute16uc<0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1, 0,-1>(v_temp);
+            row0 = permute16uc<0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1>(v_temp);
             v_side -= v_side_0;
             v_side = v_side >> 1;
             row0 = row0 + v_side;
             row0 = min(max(0, row0), (1 << bitDepth) - 1);
-            
+
             pDst[0 * dstStride] = row0[0];
             pDst[1 * dstStride] = row0[1];
             pDst[2 * dstStride] = row0[2];
@@ -1937,7 +1937,6 @@ void xPredIntraAng8x8(int bitDepth, pixe
             pDst[5 * dstStride] = row0[5];
             pDst[6 * dstStride] = row0[6];
             pDst[7 * dstStride] = row0[7];
-
         }
     }
     else if (intraPredAngle == -32)
@@ -2243,24 +2242,466 @@ void xPredIntraAng8x8(int bitDepth, pixe
     }
 }
 
+#undef PREDANG_CALCROW_VER
+#undef PREDANG_CALCROW_HOR
+#undef LOADROW
+#undef CALCROW
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+        LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
+        LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
+        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
+        compress(row11L, row11H).store(pDst + ((X)*dstStride)); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+        LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
+        LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
+        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
+        rowx = compress(row11L, row11H); \
+}
+
+// ROWL/H is a Vec8s variable, X is the index in of data to be loaded
+#define LOADROW(ROWL, ROWH, X) { \
+        tmp.load(refMain + 1 + (X)); \
+        ROWL = extend_low(tmp); \
+        ROWH = extend_high(tmp); \
+}
+
+#define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
+        v_deltaPos += v_ipAngle; \
+        v_deltaFract = v_deltaPos & thirty1; \
+        RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
+        RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5; \
+}
+
+#define  BLND2_16(R1, R2) { \
+        tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); \
+        tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2); \
+        R1 = tmp1; \
+        R2 = tmp2; \
+}
+
+#define MB4(R1, R2, R3, R4) { \
+        BLND2_16(R1, R2) \
+        BLND2_16(R3, R4) \
+        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3); \
+        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3); \
+        R1 = tmp1; \
+        R3 = tmp2; \
+        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
+        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4); \
+        R2 = tmp1; \
+        R4 = tmp2; \
+}
+
+#define BLND2_4(R1, R2) { \
+        tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
+        tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); \
+        R1 = tmp1; \
+        R2 = tmp2; \
+}
+
+#define BLND2_2(R1, R2) { \
+        tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
+        tmp1.store(pDst);   pDst += dstStride; \
+        tmp2.store(pDst);   pDst += dstStride; \
+}
+
+#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
+        MB4(R1, R2, R3, R4) \
+        MB4(R5, R6, R7, R8) \
+        BLND2_4(R1, R5); \
+        BLND2_4(R2, R6); \
+        BLND2_4(R3, R7); \
+        BLND2_4(R4, R8); \
+}
+
+#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
+        PREDANG_CALCROW_HOR(0 + X, R1) \
+        PREDANG_CALCROW_HOR(1 + X, R2) \
+        PREDANG_CALCROW_HOR(2 + X, R3) \
+        PREDANG_CALCROW_HOR(3 + X, R4) \
+        PREDANG_CALCROW_HOR(4 + X, R5) \
+        PREDANG_CALCROW_HOR(5 + X, R6) \
+        PREDANG_CALCROW_HOR(6 + X, R7) \
+        PREDANG_CALCROW_HOR(7 + X, R8) \
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
+}
+
+#define MB16 { \
+        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0) \
+        CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8) \
+        BLND2_2(R1, R9) \
+        BLND2_2(R5, R13) \
+        BLND2_2(R3, R11) \
+        BLND2_2(R7, R15) \
+        BLND2_2(R2, R10) \
+        BLND2_2(R6, R14) \
+        BLND2_2(R4, R12) \
+        BLND2_2(R8, R16) \
+}
+
+void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+    int k;
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); //no planar and dc
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+
+    pixel* refMain;
+    pixel* refSide;
+
+    // Initialise the Main and Left reference array.
+    if (intraPredAngle < 0)
+    {
+        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
+        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
+
+        // Extend the Main reference to the left.
+        int invAngleSum    = 128;     // rounding for (shift by 8)
+        if (intraPredAngle != -32)
+            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+            {
+                invAngleSum += invAngle;
+                refMain[k] = refSide[invAngleSum >> 8];
+            }
+    }
+    else
+    {
+        refMain = modeVer ? refAbove : refLeft;
+        refSide = modeVer ? refLeft  : refAbove;
+    }
+
+    // bfilter will always be true for blocksize 8
+    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
+    {
+        if (modeHor)
+        {
+            Vec16uc v_temp;
+            Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+
+            v_temp.load(refSide + 1);
+            Vec8s v_side;
+            v_side = extend_low(v_temp);
+
+            Vec8s row01, row02, ref(refMain[1]);
+            v_side -= v_side_0;
+            v_side = v_side >> 1;
+            row01 = ref + v_side;
+            row01 = min(max(0, row01), (1 << bitDepth) - 1);
+
+            v_side = extend_high(v_temp);
+            v_side -= v_side_0;
+            v_side = v_side >> 1;
+            row02 = ref + v_side;
+            row02 = min(max(0, row02), (1 << bitDepth) - 1);
+
+            Vec16uc tmp1;
+            tmp1 = compress(row01, row02);
+            tmp1.store(pDst);                //row0
+
+            v_temp.load(refMain + 1);
+
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+            tmp1.store(pDst + (1 * dstStride)); //row1
+
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+            tmp1.store(pDst + (2 * dstStride)); //row2
+
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+            tmp1.store(pDst + (3 * dstStride)); //row3
+
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+            tmp1.store(pDst + (4 * dstStride)); //row4
+
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+            tmp1.store(pDst + (5 * dstStride)); //row5
+
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+            tmp1.store(pDst + (6 * dstStride)); //row6
+
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+            tmp1.store(pDst + (7 * dstStride)); //row7
+
+            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+            tmp1.store(pDst + (8 * dstStride)); //row8
+
+            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+            tmp1.store(pDst + (9 * dstStride)); //row9
+
+            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+            tmp1.store(pDst + (10 * dstStride)); //row10
+
+            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+            tmp1.store(pDst + (11 * dstStride)); //row11
+
+            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+            tmp1.store(pDst + (12 * dstStride)); //row12
+
+            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+            tmp1.store(pDst + (13 * dstStride)); //row13
+
+            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+            tmp1.store(pDst + (14 * dstStride)); //row14
+
+            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+            tmp1.store(pDst + (15 * dstStride)); //row15
+        }
+        else
+        {
+            Vec16uc v_main;
+            v_main.load(refMain + 1);
+            v_main.store(pDst);
+            v_main.store(pDst + dstStride);
+            v_main.store(pDst + (2 * dstStride));
+            v_main.store(pDst + (3 * dstStride));
+            v_main.store(pDst + (4 * dstStride));
+            v_main.store(pDst + (5 * dstStride));
+            v_main.store(pDst + (6 * dstStride));
+            v_main.store(pDst + (7 * dstStride));
+            v_main.store(pDst + (8 * dstStride));
+            v_main.store(pDst + (9 * dstStride));
+            v_main.store(pDst + (10 * dstStride));
+            v_main.store(pDst + (11 * dstStride));
+            v_main.store(pDst + (12 * dstStride));
+            v_main.store(pDst + (13 * dstStride));
+            v_main.store(pDst + (14 * dstStride));
+            v_main.store(pDst + (15 * dstStride));
+
+            Vec16uc v_temp;
+            Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+
+            v_temp.load(refSide + 1);
+            Vec8s v_side;
+            v_side = extend_low(v_temp);
+
+            Vec8s row0, ref(refMain[1]);
+            v_side -= v_side_0;
+            v_side = v_side >> 1;
+            row0 = ref + v_side;
+            row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+            pDst[0 * dstStride] = row0[0];
+            pDst[1 * dstStride] = row0[1];
+            pDst[2 * dstStride] = row0[2];
+            pDst[3 * dstStride] = row0[3];
+            pDst[4 * dstStride] = row0[4];
+            pDst[5 * dstStride] = row0[5];
+            pDst[6 * dstStride] = row0[6];
+            pDst[7 * dstStride] = row0[7];
+
+            v_side = extend_high(v_temp);
+            v_side -= v_side_0;
+            v_side = v_side >> 1;
+            row0 = ref + v_side;
+            row0 = min(max(0, row0), (1 << bitDepth) - 1);
+            pDst[8 * dstStride] = row0[0];
+            pDst[9 * dstStride] = row0[1];
+            pDst[10 * dstStride] = row0[2];
+            pDst[11 * dstStride] = row0[3];
+            pDst[12 * dstStride] = row0[4];
+            pDst[13 * dstStride] = row0[5];
+            pDst[14 * dstStride] = row0[6];
+            pDst[15 * dstStride] = row0[7];
+        }
+    }
+    else if (intraPredAngle == -32)
+    {
+        Vec16uc v_refSide;
+        v_refSide.load(refSide);
+        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+        pixel refMain0 = refMain[0];
+
+        v_refSide.store(refMain - 15);
+        refMain[0] = refMain0;
+
+        Vec16uc tmp;
+        tmp.load(refMain);        //-1,0,1,2
+        tmp.store(pDst);
+        tmp.load(--refMain);     //-2,-1,0,1
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        return;
+    }
+    else if (intraPredAngle == 32)
+    {
+        Vec8s tmp;
+
+        tmp.load(refMain + 2);
+        tmp.store(pDst);
+        tmp.load(refMain + 3);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 4);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 5);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 6);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 7);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 8);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 9);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 10);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 11);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 12);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 13);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 14);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 15);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 16);
+        pDst += dstStride;
+        tmp.store(pDst);
+        tmp.load(refMain + 17);
+        pDst += dstStride;
+        tmp.store(pDst);
+        return;
+    }
+    else
+    {
+        if (modeHor)
+        {
+            Vec8s row11L, row12L, row11H, row12H;
+            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+            Vec16uc tmp;
+            Vec16uc R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+            Vec16uc tmp1, tmp2;
+            v_deltaPos = 0;
+            v_ipAngle = intraPredAngle;
+            MB16;
+        }
+        else
+        {
+            Vec8s row11L, row12L, row11H, row12H;
+            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+            Vec16uc tmp;
+            Vec8s tmp1, tmp2;
+            v_deltaPos = 0;
+            v_ipAngle = intraPredAngle;
+
+            PREDANG_CALCROW_VER(0);
+            PREDANG_CALCROW_VER(1);
+            PREDANG_CALCROW_VER(2);
+            PREDANG_CALCROW_VER(3);
+            PREDANG_CALCROW_VER(4);
+            PREDANG_CALCROW_VER(5);
+            PREDANG_CALCROW_VER(6);
+            PREDANG_CALCROW_VER(7);
+            PREDANG_CALCROW_VER(8);
+            PREDANG_CALCROW_VER(9);
+            PREDANG_CALCROW_VER(10);
+            PREDANG_CALCROW_VER(11);
+            PREDANG_CALCROW_VER(12);
+            PREDANG_CALCROW_VER(13);
+            PREDANG_CALCROW_VER(14);
+            PREDANG_CALCROW_VER(15);
+        }
+    }
+}
+
 #endif /* if HIGH_BIT_DEPTH */
 
 void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
 {
+#if HIGH_BIT_DEPTH
+#else
     switch (width)
     {
     case 4:
         xPredIntraAng4x4(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
         return;
     case 8:
-#if HIGH_BIT_DEPTH
-        ; // To fix build for HIGH_BIT_DEPTH enabled
-#else
         xPredIntraAng8x8(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
         return;
-#endif
+    case 16:
+        xPredIntraAng16x16(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
+        return;
     }
 
+#endif /* if HIGH_BIT_DEPTH */
+
     int k, l;
     int blkSize        = width;