changeset 12571:37648fca915b

Fix: Performance drop in aq-mode 4. This patch moves the memory handling of the edge information required for aq-mode 4 to the Frame class level, so that the buffers can be reused by the threads.
author Akil Ayyappan<akil@multicorewareinc.com>
date Fri, 11 Oct 2019 12:45:52 +0530
parents f46aa2bc1c34
children 7fc1f6ef2b96
files source/common/frame.cpp source/common/frame.h source/encoder/slicetype.cpp source/encoder/slicetype.h source/test/regression-tests.txt
diffstat 5 files changed, 57 insertions(+-), 40 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/frame.cpp	Tue Sep 24 15:02:05 2019 +0530
+++ b/source/common/frame.cpp	Fri Oct 11 12:45:52 2019 +0530
@@ -58,6 +58,9 @@ Frame::Frame()
     m_classifyFrame = false;
     m_fieldNum = 0;
     m_picStruct = 0;
+    m_edgePic = NULL;
+    m_gaussianPic = NULL;
+    m_thetaPic = NULL;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
@@ -98,6 +101,20 @@ bool Frame::create(x265_param *param, fl
         CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
     }
 
+    if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount && param->rc.aqMode != 0))
+    {
+        uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment
+        uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for 8-tap filter and infinite padding
+        intptr_t m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
+        int maxHeight = numCuInHeight * param->maxCUSize;
+
+        m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
+        m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
+        m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
+    }
+
     if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
@@ -243,4 +260,11 @@ void Frame::destroy()
         X265_FREE_ZERO(m_classifyVariance);
         X265_FREE_ZERO(m_classifyCount);
     }
+
+    if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount && m_param->rc.aqMode != 0))
+    {
+        X265_FREE(m_edgePic);
+        X265_FREE(m_gaussianPic);
+        X265_FREE(m_thetaPic);
+    }
 }
--- a/source/common/frame.h	Tue Sep 24 15:02:05 2019 +0530
+++ b/source/common/frame.h	Fri Oct 11 12:45:52 2019 +0530
@@ -132,6 +132,11 @@ public:
     bool                   m_classifyFrame;
     int                    m_fieldNum;
 
+    /* aq-mode 4 : Gaussian, edge and theta frames for edge information */
+    pixel*                 m_edgePic;
+    pixel*                 m_gaussianPic;
+    pixel*                 m_thetaPic;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
--- a/source/encoder/slicetype.cpp	Tue Sep 24 15:02:05 2019 +0530
+++ b/source/encoder/slicetype.cpp	Fri Oct 11 12:45:52 2019 +0530
@@ -85,12 +85,22 @@ inline uint32_t acEnergyPlane(Frame *cur
 
 } // end anonymous namespace
 
-void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3, intptr_t stride, int height, int width)
+void edgeFilter(Frame *curFrame, x265_param* param)
 {
+    int height = curFrame->m_fencPic->m_picHeight;
+    int width = curFrame->m_fencPic->m_picWidth;
+    intptr_t stride = curFrame->m_fencPic->m_stride;
+    uint32_t numCuInHeight = (height + param->maxCUSize - 1) / param->maxCUSize;
+    int maxHeight = numCuInHeight * param->maxCUSize;
+
+    memset(curFrame->m_edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+    memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+    memset(curFrame->m_thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+
     pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
-    pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
-    pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
-    pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel *edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel *refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
 
     for (int i = 0; i < height; i++)
     {
@@ -103,7 +113,7 @@ void edgeFilter(Frame *curFrame, pixel *
 
     //Applying Gaussian filter on the picture
     src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
-    refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
     pixel pixelValue = 0;
 
     for (int rowNum = 0; rowNum < height; rowNum++)
@@ -148,7 +158,7 @@ void edgeFilter(Frame *curFrame, pixel *
     float gradientH = 0, gradientV = 0, radians = 0, theta = 0;
     float gradientMagnitude = 0;
     pixel blackPixel = 0;
-    edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
     //Applying Sobel filter on the gaussian filtered picture
     for (int rowNum = 0; rowNum < height; rowNum++)
     {
@@ -198,8 +208,10 @@ inline void findAvgAngle(const pixel* bl
     angle = sum / (size*size);
 }
 
-uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame,pixel *edgeImage, pixel *edgeTheta, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
+uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
 {
+    pixel *edgeImage = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
     intptr_t srcStride = curFrame->m_fencPic->m_stride;
     intptr_t blockOffsetLuma = blockX + (blockY * srcStride);
     int plane = 0; // Sobel filter is applied only on Y component
@@ -478,31 +490,14 @@ void LookaheadTLD::calcAdaptiveQuantFram
             }
             else
             {
-#define AQ_EDGE_BIAS 0.5
-#define EDGE_INCLINATION 45
-
-                pixel *edgePic = NULL;
-                pixel *gaussianPic = NULL;
-                pixel *thetaPic = NULL;
-
-                if (param->rc.aqMode == X265_AQ_EDGE)
-                {
-                    uint32_t numCuInHeight = (maxRow + param->maxCUSize - 1) / param->maxCUSize;
-                    int maxHeight = numCuInHeight * param->maxCUSize;
-                    intptr_t stride = curFrame->m_fencPic->m_stride;
-                    edgePic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)));
-                    gaussianPic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)));
-                    thetaPic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)));
-                    memset(edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
-                    memset(gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
-                    memset(thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
-                    edgeFilter(curFrame, edgePic, gaussianPic, thetaPic, stride, maxRow, maxCol);
-                }                  
-
                 int blockXY = 0, inclinedEdge = 0;
                 double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
                 double bias_strength = 0.f;
                 double strength = 0.f;
+
+                if (param->rc.aqMode == X265_AQ_EDGE)
+                    edgeFilter(curFrame, param);
+
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
                 {
                     double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));
@@ -514,9 +509,7 @@ void LookaheadTLD::calcAdaptiveQuantFram
                             energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
                             if (param->rc.aqMode == X265_AQ_EDGE)
                             {
-                                pixel *edgeImage = edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
-                                pixel *edgeTheta = thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
-                                edgeDensity = edgeDensityCu(curFrame, edgeImage, edgeTheta, avgAngle, blockX, blockY, param->rc.qgSize);
+                                edgeDensity = edgeDensityCu(curFrame, avgAngle, blockX, blockY, param->rc.qgSize);
                                 if (edgeDensity)
                                 {
                                     qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1);
@@ -549,13 +542,6 @@ void LookaheadTLD::calcAdaptiveQuantFram
                 else
                     strength = param->rc.aqStrength * 1.0397f;
 
-                if (param->rc.aqMode == X265_AQ_EDGE)
-                {
-                    X265_FREE(edgePic);
-                    X265_FREE(gaussianPic);
-                    X265_FREE(thetaPic);
-                }
-
                 blockXY = 0;
                 for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
                 {
--- a/source/encoder/slicetype.h	Tue Sep 24 15:02:05 2019 +0530
+++ b/source/encoder/slicetype.h	Fri Oct 11 12:45:52 2019 +0530
@@ -40,6 +40,8 @@ class Lookahead;
 
 #define LOWRES_COST_MASK  ((1 << 14) - 1)
 #define LOWRES_COST_SHIFT 14
+#define AQ_EDGE_BIAS 0.5
+#define EDGE_INCLINATION 45
 
 /* Thread local data for lookahead tasks */
 struct LookaheadTLD
@@ -92,7 +94,7 @@ struct LookaheadTLD
 protected:
 
     uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
-    uint32_t edgeDensityCu(Frame*curFrame, pixel *edgeImage, pixel *edgeTheta, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
+    uint32_t edgeDensityCu(Frame*curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
     bool     allocWeightedRef(Lowres& fenc);
--- a/source/test/regression-tests.txt	Tue Sep 24 15:02:05 2019 +0530
+++ b/source/test/regression-tests.txt	Fri Oct 11 12:45:52 2019 +0530
@@ -154,7 +154,7 @@ big_buck_bunny_360p24.y4m, --keyint 60 -
 BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 --vbv-maxrate 5000 --vbv-bufsize 5000
 big_buck_bunny_360p24.y4m, --bitrate 500 --fades
 720p50_parkrun_ter.y4m,--preset medium --bitrate 400 --hme
-ducks_take_off_420_1_720p50.y4m,--preset medium --aq-mode 4 --crf 22 --no-cutree
+ducks_take_off_420_720p50.y4m,--preset medium --aq-mode 4 --crf 22 --no-cutree
 ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000