【x265】码率控制模块的简单分析—块级码控工具(AQ和cuTree)
目录
- 1. 自适应QP技术(Adaptive QP)
- 1.2 图像纹理的检测(edgeFilter)
- 1.3 计算AC energy(acEnergyCu)
- 1.4 计算梯度密度(edgeDensityCu)
- 2.宏块树(cuTree)
- 2.1 计算帧损失(singleCost)
- 2.2 计算传播损失(estimateCUPropagate)
- 2.3 计算qp调整量(cuTreeFinish)
- 3.qpOffset的使用
- 4.AQ技术和cuTree技术之间的关联
- 5.hevcAq模式
- 5.1 复杂度的计算(xPreanalyze)
- 5.2 qpOffset的计算(computeCUTreeQpOffset)
x265相关:
【x265】x265编码器参数配置
【x265】预测模块的简单分析—帧内预测
【x265】预测模块的简单分析—帧间预测
1. 自适应QP技术(Adaptive QP)
在x265当中定义了5种AQ模式,较之于x264而言,新增了AQ_EDGE这种模式
#define X265_AQ_NONE 0 // adaptive quantization disabled
#define X265_AQ_VARIANCE 1 // variance mode: offset derived from the current block only
#define X265_AQ_AUTO_VARIANCE 2 // auto-variance mode: block value compared against the frame-wide average
#define X265_AQ_AUTO_VARIANCE_BIASED 3 // auto-variance mode with an additional bias term
#define X265_AQ_EDGE 4 // edge mode: uses edge density / edge orientation of the block
AQ模式的计算位于encoder/slicetype.cpp中,由calcAdaptiveQuantFrame()实现,主要的步骤为:
- 如果使用hevcAq,则使用xPreanalyze()去分析当前帧
- 如果使用常规AQ
(1)如果使用X265_AQ_EDGE模式,则先进行滤波,检测边缘纹理(edgeFilter)
(2)如果使用X265_AQ_AUTO_VARIANCE、X265_AQ_AUTO_VARIANCE_BIASED或X265_AQ_EDGE模式中的一种,则根据全局范围的纹理情况来计算qp_adj
(a)如果是X265_AQ_EDGE模式,会计算边缘密度,依据边缘密度计算块级(默认为16x16)qp_adj
(b)如果非X265_AQ_EDGE模式,根据AC energy计算块级qp_adj
(c)将所有块的qp_adj平均计算,得到平均qp_adj
(3)如果是X265_AQ_VARIANCE,根据strength调整qp
/* Computes the per-block adaptive quantization (AQ) QP offsets of curFrame.
 * Results are written into curFrame->m_lowres: qpAqOffset, qpCuTreeOffset and
 * invQscaleFactor (plus invQscaleFactor8x8 when qgSize == 8).
 * In addition it finalises wp_ssd for each plane when weighted prediction is
 * enabled, and fills blockVariance / frameVariance when bDynamicRefine or
 * bEnableFades is set.
 *
 * curFrame  frame to analyse (source picture read through m_fencPic)
 * param     encoder parameters; the rc.* AQ settings select the code path */
void LookaheadTLD::calcAdaptiveQuantFrame(Frame* curFrame, x265_param* param)
{
/* Actual adaptive quantization */
int maxCol = curFrame->m_fencPic->m_picWidth;
int maxRow = curFrame->m_fencPic->m_picHeight;
int blockCount, loopIncr;
float modeOneConst, modeTwoConst;
/* rc.qgSize is the quantization group size. It selects the analysis block
 * size used below: 8x8 blocks (loopIncr = 8) when qgSize == 8, otherwise
 * 16x16 blocks (loopIncr = 16), together with the matching block count and
 * the constants used by the variance / auto-variance formulas. */
if (param->rc.qgSize == 8)
{
blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
modeOneConst = 11.427f;
modeTwoConst = 8.f;
loopIncr = 8;
}
else
{
blockCount = widthInCU * heightInCU;
modeOneConst = 14.427f;
modeTwoConst = 11.f;
loopIncr = 16;
}
float* quantOffsets = curFrame->m_quantOffsets;
/* Reset the per-plane weighted-prediction accumulators; they are accumulated
 * as a side effect of every acEnergyCu() call made below. */
for (int y = 0; y < 3; y++)
{
curFrame->m_lowres.wp_ssd[y] = 0;
curFrame->m_lowres.wp_sum[y] = 0;
}
// Skip the AQ computation when stats are being read (rc.bStatRead), cuTree is on and the frame is referenced.
// NOTE(review): bStatRead presumably means a multi-pass encode reading a stats file — confirm.
if (!(param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
{
/* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
// AQ disabled, or AQ strength is zero
if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
{
if (param->rc.aqMode && param->rc.aqStrength == 0)
{
if (quantOffsets)
{
// Only the externally supplied per-block offsets are applied
for (int cuxy = 0; cuxy < blockCount; cuxy++)
{
curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
}
}
else
{
// No offsets at all: zero both offset arrays
memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
for (int cuxy = 0; cuxy < blockCount; cuxy++)
curFrame->m_lowres.invQscaleFactor[cuxy] = 256; // factor used for a zero offset
}
}
/* Need variance data for weighted prediction and dynamic refinement*/
if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) // weighted prediction enabled
{
// Called only for the wp_sum / wp_ssd side effect; the return value is unused
for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
}
}
else // AQ is enabled with a non-zero strength
{
// 1. hevcAq path: the whole analysis is delegated to xPreanalyze()
if (param->rc.hevcAq)
{
// New method for calculating variance and qp offset
xPreanalyze(curFrame);
}
else
{ // 2. regular AQ
int blockXY = 0, inclinedEdge = 0;
double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
double bias_strength = 0.f;
double strength = 0.f;
// 3. Edge mode: run edgeFilter() first so the edge / angle pictures are available
if (param->rc.aqMode == X265_AQ_EDGE)
edgeFilter(curFrame, param);
/* NOTE(review): the original annotation lists the defaults as
 * aqMode = X265_AQ_AUTO_VARIANCE, bHistBasedSceneCut = 0, recursionSkipMode = 1;
 * these cannot be verified from this function. */
if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
{
pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
// Copy the edge picture into m_edgeBitPic through planecopy_pp_shr with shift SHIFT_TO_BITPLANE
primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
curFrame->m_fencPic->m_stride, curFrame->m_fencPic->m_picWidth, curFrame->m_fencPic->m_picHeight, SHIFT_TO_BITPLANE);
}
if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
{
double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8))); // bit-depth correction
// 4. First pass: compute a raw per-block value (stored temporarily in qpCuTreeOffset)
//    and accumulate its mean and mean square over the frame
for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
{
for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
{
uint32_t energy, edgeDensity, avgAngle;
// AC energy of the block (also updates wp_sum / wp_ssd)
energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
if (param->rc.aqMode == X265_AQ_EDGE)
{
// Edge mode: measure the edge density of the block; avgAngle receives the block's average edge angle
edgeDensity = edgeDensityCu(curFrame, avgAngle, blockX, blockY, param->rc.qgSize);
if (edgeDensity)
{ // non-zero edge density: derive qp_adj from it
qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1);
//Increasing the QP of a block if its edge orientation lies around the multiples of 45 degree
// i.e. avgAngle within 15 of EDGE_INCLINATION, or within 15 of EDGE_INCLINATION + 90
if ((avgAngle >= EDGE_INCLINATION - 15 && avgAngle <= EDGE_INCLINATION + 15) || (avgAngle >= EDGE_INCLINATION + 75 && avgAngle <= EDGE_INCLINATION + 105))
curFrame->m_lowres.edgeInclined[blockXY] = 1; // mark the block as having an inclined edge
else
curFrame->m_lowres.edgeInclined[blockXY] = 0;
}
else // zero edge density: fall back to the AC energy
{
qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
curFrame->m_lowres.edgeInclined[blockXY] = 0;
}
}
else // non-edge modes
qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
// Store the raw value of this block; it is read back in the second pass
curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
avg_adj += qp_adj;
avg_adj_pow2 += qp_adj * qp_adj;
blockXY++;
}
}
avg_adj /= blockCount;
avg_adj_pow2 /= blockCount;
// Scale the strength by the frame-average raw value
strength = param->rc.aqStrength * avg_adj;
// Shift the average using the mean square and modeTwoConst (8.f for qgSize 8, 11.f otherwise)
avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
bias_strength = param->rc.aqStrength;
}
else // X265_AQ_VARIANCE: the strength is a fixed multiple of aqStrength
strength = param->rc.aqStrength * 1.0397f;
// 5. Second pass: turn the raw values into the final per-block QP offsets
blockXY = 0;
for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
{
for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
{
if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED) // auto-variance with bias term
{
qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
// deviation from the frame average plus a bias term built from modeTwoConst
qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
}
else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE) // auto-variance
{
qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
qp_adj = strength * (qp_adj - avg_adj);
}
else if (param->rc.aqMode == X265_AQ_EDGE) // edge mode
{
inclinedEdge = curFrame->m_lowres.edgeInclined[blockXY];
qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
if (inclinedEdge && (qp_adj - avg_adj > 0))
// Inclined-edge block above the average: strengthen the adjustment by AQ_EDGE_BIAS
qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj));
else
qp_adj = strength * (qp_adj - avg_adj);
}
else
{ // X265_AQ_VARIANCE: offset from the log2 of the block's own energy
uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
}
if (param->bHDR10Opt)
{
// Extra offset chosen from the block's average luma: +3 for the lowest band down to -6 for the highest;
// averages in [434, 501) receive no extra offset
uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
uint32_t lumaAvg = sum / (loopIncr * loopIncr);
if (lumaAvg < 301)
qp_adj += 3;
else if (lumaAvg >= 301 && lumaAvg < 367)
qp_adj += 2;
else if (lumaAvg >= 367 && lumaAvg < 434)
qp_adj += 1;
else if (lumaAvg >= 501 && lumaAvg < 567)
qp_adj -= 1;
else if (lumaAvg >= 567 && lumaAvg < 634)
qp_adj -= 2;
else if (lumaAvg >= 634 && lumaAvg < 701)
qp_adj -= 3;
else if (lumaAvg >= 701 && lumaAvg < 767)
qp_adj -= 4;
else if (lumaAvg >= 767 && lumaAvg < 834)
qp_adj -= 5;
else if (lumaAvg >= 834)
qp_adj -= 6;
}
// Add the externally supplied per-block offset, if any
if (quantOffsets != NULL)
qp_adj += quantOffsets[blockXY];
// Store the final offset and the matching inverse qscale factor
curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
blockXY++;
}
}
}
}
// With 8x8 quantization groups, additionally derive invQscaleFactor8x8 by averaging four invQscaleFactor entries per CU
if (param->rc.qgSize == 8)
{
for (int cuY = 0; cuY < heightInCU; cuY++)
{
for (int cuX = 0; cuX < widthInCU; cuX++)
{
const int cuXY = cuX + cuY * widthInCU;
curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
}
}
}
}
// Weighted prediction: finalise the per-plane statistics
if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
{
// The AQ section above was skipped in this case, so wp_sum / wp_ssd still have to be accumulated here
if (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame))
{
for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
}
int hShift = CHROMA_H_SHIFT(param->internalCsp);
int vShift = CHROMA_V_SHIFT(param->internalCsp);
// Round the picture size to the nearest multiple of 16.
// Note that maxCol / maxRow stay modified for the dynamic-refine section below.
maxCol = ((maxCol + 8) >> 4) << 4;
maxRow = ((maxRow + 8) >> 4) << 4;
int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift };
int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
for (int i = 0; i < 3; i++)
{
uint64_t sum, ssd;
sum = curFrame->m_lowres.wp_sum[i];
ssd = curFrame->m_lowres.wp_ssd[i];
// ssd minus sum^2 / pixelCount, with rounding of the division
curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
}
}
// Dynamic refinement or fade detection: record per-block variance and a frame-level variance figure
if (param->bDynamicRefine || param->bEnableFades)
{
uint64_t blockXY = 0, rowVariance = 0;
curFrame->m_lowres.frameVariance = 0;
for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
{
for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
{
curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
rowVariance += curFrame->m_lowres.blockVariance[blockXY];
blockXY++;
}
// NOTE(review): rowVariance is never reset between rows, so each row adds the running total
// of all rows so far — confirm whether this accumulation is intended.
curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
}
curFrame->m_lowres.frameVariance /= maxRow;
}
}
从上面的代码中看,4种模式对应的qp_adj方式为:
X265_AQ_VARIANCE
(1)strength = param->rc.aqStrength * 1.0397f
(2)qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)))
X265_AQ_AUTO_VARIANCE
(1)qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
(2)strength = param->rc.aqStrength * avg_adj;(avg_adj为qp_adj的均值)
(3)avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj
(4)qp_adj = strength * (qp_adj - avg_adj)
X265_AQ_AUTO_VARIANCE_BIASED
(1)qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
(2)strength = param->rc.aqStrength * avg_adj;(avg_adj为qp_adj的均值)
(3)avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj
(4)bias_strength = param->rc.aqStrength;
(5)qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj))
X265_AQ_EDGE
(1)计算qp_adj初始值
如果edgeDensity不为0:qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1)
如果edgeDensity为0:qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
(2)调整qp_adj的值
如果当前块为edge块,并且qp_adj - avg_adj > 0:qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj))
其他情况:qp_adj = strength * (qp_adj - avg_adj)
总体上看,X265_AQ_VARIANCE模式为aq调整的基础,如果考虑了当前帧中其他的块,变为X265_AQ_AUTO_VARIANCE模式。如果想要更精确地调控,可以增加一些调控因子,演变为X265_AQ_AUTO_VARIANCE_BIASED。如果考虑不局限于AC energy,增加梯度的检测,演变为X265_AQ_EDGE
1.2 图像纹理的检测(edgeFilter)
函数的主要功能是对当前帧进行图像纹理的检测,具体来说,会对输入图像进行高斯滤波,随后进行sobel滤波获得图像的边界纹理
/* Builds the edge information of the luma plane of curFrame.
 *
 * Steps:
 *   1. clear m_edgePic, m_gaussianPic and m_thetaPic (margins included);
 *   2. copy the source luma rows into m_edgePic and m_gaussianPic;
 *   3. smooth the interior of m_gaussianPic with a 5x5 Gaussian kernel;
 *   4. run computeEdge() on the smoothed picture to fill the edge map and
 *      the per-pixel gradient angle map.
 *
 * curFrame  frame whose m_fencPic luma plane is analysed
 * param     only maxCUSize is read, to size the buffers being cleared
 *
 * Fix relative to the previous version: the border test used
 * `rowNum != height - 2` / `colNum != width - 2`, which skipped only one
 * row/column and still filtered the last row/column by reading two pixels
 * beyond the picture. The 5x5 kernel is now applied strictly to pixels that
 * have two valid neighbours on every side (rows 2..height-3, cols 2..width-3);
 * the two outermost rows/columns keep the unfiltered source value. */
void edgeFilter(Frame *curFrame, x265_param* param)
{
    const int height = curFrame->m_fencPic->m_picHeight;
    const int width = curFrame->m_fencPic->m_picWidth;
    const intptr_t stride = curFrame->m_fencPic->m_stride;
    const uint32_t numCuInHeight = (height + param->maxCUSize - 1) / param->maxCUSize;
    const int maxHeight = numCuInHeight * param->maxCUSize;

    // Clear the three working pictures over their full allocated height (CU-aligned rows plus both margins)
    const size_t planeBytes = stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel);
    memset(curFrame->m_edgePic, 0, planeBytes);
    memset(curFrame->m_gaussianPic, 0, planeBytes);
    memset(curFrame->m_thetaPic, 0, planeBytes);

    // All working pictures are addressed from the first visible pixel (past the top/left margins)
    const intptr_t origin = curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    const pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
    pixel *edgePic = curFrame->m_edgePic + origin;
    pixel *refPic = curFrame->m_gaussianPic + origin;
    pixel *edgeTheta = curFrame->m_thetaPic + origin;

    // Seed both the edge picture and the Gaussian picture with the source luma
    for (int row = 0; row < height; row++)
    {
        const intptr_t rowOffset = row * stride;
        memcpy(edgePic + rowOffset, src + rowOffset, width * sizeof(pixel));
        memcpy(refPic + rowOffset, src + rowOffset, width * sizeof(pixel));
    }

    //Applying Gaussian filter on the picture
    /* 5x5 Gaussian filter
                [2   4   5   4   2]
         1      [4   9   12  9   4]
        ---     [5   12  15  12  5]
        159     [4   9   12  9   4]
                [2   4   5   4   2]*/
    static const int gaussianKernel[5][5] =
    {
        { 2,  4,  5,  4, 2 },
        { 4,  9, 12,  9, 4 },
        { 5, 12, 15, 12, 5 },
        { 4,  9, 12,  9, 4 },
        { 2,  4,  5,  4, 2 }
    };

    //Ignoring the border pixels of the picture: the kernel needs two neighbours on each side
    for (int rowNum = 2; rowNum < height - 2; rowNum++)
    {
        for (int colNum = 2; colNum < width - 2; colNum++)
        {
            int weighted = 0;
            for (int ky = 0; ky < 5; ky++)
            {
                const pixel *srcRow = src + (rowNum + ky - 2) * stride + (colNum - 2);
                for (int kx = 0; kx < 5; kx++)
                    weighted += gaussianKernel[ky][kx] * srcRow[kx];
            }
            refPic[rowNum * stride + colNum] = (pixel)(weighted / 159);
        }
    }

    // Gradient filter on the smoothed picture; also fills the angle map (bcalcTheta = true)
    if (!computeEdge(edgePic, refPic, edgeTheta, stride, height, width, true))
        x265_log(NULL, X265_LOG_ERROR, "Failed edge computation!");
}
computeEdge()的代码为
/* Runs a 3x3 gradient filter over refPic and thresholds the result into edgePic.
 *
 * edgePic     output edge map; receives whitePixel where the gradient magnitude
 *             is >= EDGE_THRESHOLD, 0 elsewhere
 * refPic      input picture the gradients are taken from
 * edgeTheta   output gradient angle in degrees, folded into [0, 180]; only
 *             written (and only required to be non-null) when bcalcTheta is set
 * stride      row stride shared by all three pictures
 * height      picture height; the outermost one-pixel border is not processed
 * width       picture width; the outermost one-pixel border is not processed
 * bcalcTheta  whether the angle map is produced
 * whitePixel  value written for edge pixels
 *
 * Returns false when a required buffer is null, true otherwise. */
bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
{
    if (!edgePic || !refPic || (bcalcTheta && !edgeTheta))
        return false;

    const int border = 1;
    const pixel blackPixel = 0;
    const int rowEnd = height - border;
    const int colEnd = width - border;

    //Applying Sobel filter except for border pixels
    for (int y = border; y < rowEnd; y++)
    {
        const intptr_t curRow = y * stride;
        const pixel* above = refPic + (curRow - stride);
        const pixel* level = refPic + curRow;
        const pixel* below = refPic + (curRow + stride);

        for (int x = border; x < colEnd; x++)
        {
            const intptr_t left = x - border;
            const intptr_t right = x + border;
            const intptr_t center = curRow + x;

            /* Horizontal and vertical gradients
                   [ -3   0   3 ]        [-3   -10  -3 ]
               gH =[ -10  0   10]   gV = [ 0    0    0 ]
                   [ -3   0   3 ]        [ 3    10   3 ] */
            const float gradientH = (float)(-3 * above[left] + 3 * above[right] - 10 * level[left] + 10 * level[right] - 3 * below[left] + 3 * below[right]);
            const float gradientV = (float)(-3 * above[left] - 10 * above[x] - 3 * above[right] + 3 * below[left] + 10 * below[x] + 3 * below[right]);
            const float gradientMagnitude = sqrtf(gradientH * gradientH + gradientV * gradientV);

            if (bcalcTheta)
            {
                // Gradient direction in degrees; negative angles are folded by adding 180
                const float radians = atan2(gradientV, gradientH);
                float theta = (float)((radians * 180) / PI);
                if (theta < 0)
                    theta = 180 + theta;
                edgeTheta[center] = (pixel)theta;
            }

            // Threshold the magnitude into a two-level edge map
            edgePic[center] = (pixel)(gradientMagnitude >= EDGE_THRESHOLD ? whitePixel : blackPixel);
        }
    }
    return true;
}
1.3 计算AC energy(acEnergyCu)
函数计算了单个块(16x16或8x8)在所有平面(Y/Cb/Cr)上的AC energy之和
/* Sums the AC energy of one block over every plane of the source picture.
 *
 * curFrame        frame whose m_fencPic planes are read
 * blockX, blockY  luma coordinates of the block's top-left pixel
 * csp             colour space used to derive the chroma shifts
 * qgSize          quantization group size, forwarded to acEnergyPlane()
 *
 * Returns luma energy, plus both chroma energies unless either csp or the
 * picture's own colour space is X265_CSP_I400. */
uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize)
{
    const intptr_t lumaStride = curFrame->m_fencPic->m_stride;
    const intptr_t chromaStride = curFrame->m_fencPic->m_strideC;
    const int hShift = CHROMA_H_SHIFT(csp);
    const int vShift = CHROMA_V_SHIFT(csp);

    // Offsets of the block inside the luma plane and inside each (possibly subsampled) chroma plane
    const intptr_t lumaOffset = blockX + (blockY * lumaStride);
    const intptr_t chromaOffset = (blockX >> hShift) + ((blockY >> vShift) * chromaStride);

    uint32_t energy = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + lumaOffset, lumaStride, 0, csp, qgSize);

    const bool monochrome = (csp == X265_CSP_I400) || (curFrame->m_fencPic->m_picCsp == X265_CSP_I400);
    if (!monochrome)
    {
        for (int plane = 1; plane <= 2; plane++)
            energy += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[plane] + chromaOffset, chromaStride, plane, csp, qgSize);
    }

    x265_emms();
    return energy;
}
计算单通道的AC energy
/* Measures the energy of one block in a single plane (Y, Cb or Cr).
 *
 * The block size depends on the plane and on qgSize:
 *   - luma, or any plane of 4:4:4 content: 8x8 when qgSize == 8, else 16x16,
 *     measured directly on src;
 *   - subsampled chroma: 4x4 when qgSize == 8, else 8x8, first copied into a
 *     small aligned local buffer.
 * The shift handed to acEnergyVar() is log2 of the block's pixel count
 * (4, 6 or 8). */
inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat, uint32_t qgSize)
{
    const bool smallGroup = (qgSize == 8);

    // Luma plane, or chroma that is not subsampled
    if (!plane || colorFormat == X265_CSP_I444)
    {
        const uint64_t sumSsd = smallGroup ? primitives.cu[BLOCK_8x8].var(src, srcStride)
                                           : primitives.cu[BLOCK_16x16].var(src, srcStride);
        return acEnergyVar(curFrame, sumSsd, smallGroup ? 6 : 8, plane);
    }

    // Subsampled chroma with 8x8 quantization groups: 4x4 block
    if (smallGroup)
    {
        ALIGN_VAR_4(pixel, tile[4 * 4]);
        primitives.cu[BLOCK_4x4].copy_pp(tile, 4, src, srcStride);
        const uint64_t sumSsd = primitives.cu[BLOCK_4x4].var(tile, 4);
        return acEnergyVar(curFrame, sumSsd, 4, plane);
    }

    // Subsampled chroma otherwise: 8x8 block
    ALIGN_VAR_8(pixel, tile[8 * 8]);
    primitives.cu[BLOCK_8x8].copy_pp(tile, 8, src, srcStride);
    const uint64_t sumSsd = primitives.cu[BLOCK_8x8].var(tile, 8);
    return acEnergyVar(curFrame, sumSsd, 6, plane);
}
acEnergyVar()的定义如下,根据块的像素和(sum)与平方和(ssd)得到单个块的AC energy,同时把sum和ssd累加到用于加权预测的wp_sum和wp_ssd中
/* Derives the AC energy of a block from a packed sum / sum-of-squares value.
 *
 * curFrame  frame whose m_lowres.wp_sum / wp_ssd accumulators are updated
 * sum_ssd   packed value: low 32 bits are used as the sum, high 32 bits as the ssd
 * shift     right shift applied to sum * sum before it is subtracted from the ssd
 * plane     index of the accumulator to update
 *
 * Returns ssd - ((sum * sum) >> shift), truncated to 32 bits. */
inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
{
    const uint32_t pixelSum = static_cast<uint32_t>(sum_ssd & 0xFFFFFFFFu);
    const uint32_t squareSum = static_cast<uint32_t>(sum_ssd >> 32);

    // Side effect: feed the per-plane accumulators
    curFrame->m_lowres.wp_sum[plane] += pixelSum;
    curFrame->m_lowres.wp_ssd[plane] += squareSum;

    // The product is formed in 64 bits so it cannot overflow before the shift
    const uint64_t meanTerm = (static_cast<uint64_t>(pixelSum) * pixelSum) >> shift;
    return static_cast<uint32_t>(squareSum - meanTerm);
}
1.4 计算梯度密度(edgeDensityCu)
如果当前的模式为X265_AQ_EDGE模式,则会计算梯度密度。具体来说,先求出一个块的平均角度,随后对该块的边缘图像(m_edgePic)计算AC energy,作为该块的边缘密度
/* Measures the edge density of one block and reports its average edge angle.
 *
 * curFrame        frame providing m_edgePic (edge map) and m_thetaPic (angle map)
 * avgAngle        output: average of the angle map over the block
 * blockX, blockY  luma coordinates of the block's top-left pixel
 * qgSize          8 selects an 8x8 block, any other value a 16x16 block
 *
 * Returns the value acEnergyVar() derives from the block of the edge map. */
uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
{
    const intptr_t srcStride = curFrame->m_fencPic->m_stride;

    // Both maps are addressed from the first visible pixel, past the top/left margins
    pixel *edgeImage = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
    const intptr_t blockOffsetLuma = blockX + (blockY * srcStride);

    const int lumaPlane = 0; // Sobel filter is applied only on Y component
    const bool smallGroup = (qgSize == 8);
    const uint32_t blockSize = smallGroup ? qgSize : 16;

    // Average angle first, then the variance-based density of the edge map
    findAvgAngle(edgeTheta + blockOffsetLuma, srcStride, blockSize, avgAngle);

    const uint64_t sumSsd = smallGroup ? primitives.cu[BLOCK_8x8].var(edgeImage + blockOffsetLuma, srcStride)
                                       : primitives.cu[BLOCK_16x16].var(edgeImage + blockOffsetLuma, srcStride);
    const uint32_t density = acEnergyVar(curFrame, sumSsd, smallGroup ? 6 : 8, lumaPlane);

    x265_emms();
    return density;
}
findAvgAngle()的实现如下,通过求一个块中的平均像素角度来估算一个块的角度
//Find the angle of a block by averaging the pixel angles
inline void findAvgAngle(const pixel* block, intptr_t stride, uint32_t size, uint32_t &angle)
{
int sum = 0;
for (uint32_t y = 0; y < size; y++)
{
for (uint32_t x = 0; x < size; x++)
{
sum += block[x];
}
block += stride;
}
angle = sum / (size*size);
}
2.宏块树(cuTree)
在x264当中,有mbtree这一项工具,用于提升宏块级编码效率。在x265当中也有类似的技术,叫做cuTree,两者差不多。具体来说,cuTree位于lookahead模块中,通过将lookahead队列中的帧按照从后向前的顺序进行分析,来获得前序帧中CU相对于后序帧中CU的重要程度,这里的后序帧CU会将前序帧CU作为参考CU。在主线程编码流程中,如果前序帧中的CU重要程度比较高,说明应该为其使用较低的QP(即高质量编码),这样后序帧中的CU就能够获得更好的编码效果
/*
例如,队列中有3个P帧,主线程编码顺序为Pn-1,Pn,Pn+1,如下
... Pn-1 -> Pn -> Pn+1 ...
cuTree分析时的顺序为Pn+1,Pn,Pn-1
*/
cuTree的计算流程位于encoder\slicetype.cpp中,由cuTree()实现,主要工作流程为
(1)计算帧intra和inter cost(singleCost)
(2)计算CU的传播cost(estimateCUPropagate)
(3)根据前面计算的传播cost来评估qp调整量(cuTreeFinish)
/* Runs the cuTree analysis over the lookahead frames.
 * The frames are walked backwards from the last non-B frame; for every
 * (reference, frame) combination the frame cost is estimated with
 * singleCost() and propagated to its references with estimateCUPropagate().
 * Finally cuTreeFinish() is called on the frame the walk ended on.
 *
 * frames     array of lowres frames, indexed 0..numframes
 * numframes  index of the last frame in the array
 * bIntra     when set, the walk goes down to frame 0 (idx = 0), otherwise it stops at frame 1 */
void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
{
int idx = !bIntra;
int lastnonb, curnonb = 1;
int bframes = 0;
x265_emms();
double totalDuration = 0.0;
for (int j = 0; j <= numframes; j++)
totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;
// Average frame duration over the numframes + 1 frames
double averageDuration = totalDuration / (numframes + 1);
int i = numframes;
// Walk back from the end to the first frame that is not a B frame
while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
i--;
lastnonb = i;
/* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
* be applied to the end of a lookahead buffer of any size. However, it's most needed when
* lookahead=0, so that's what's currently implemented. */
// lookaheadDepth == 0: for an intra frame clear propagateCost, copy qpAqOffset into qpCuTreeOffset and return
if (!m_param->lookaheadDepth)
{
if (bIntra)
{
memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
// with qgSize == 8 the offset arrays hold four entries per CU
if (m_param->rc.qgSize == 8)
memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * 4 * sizeof(double));
else
memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
return;
}
std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
}
else
{
if (lastnonb < idx)
return;
memset(frames[lastnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
}
CostEstimateGroup estGroup(*this, frames);
// Main backward pass computing the propagate costs
while (i-- > idx)
{
curnonb = i;
// Find the previous non-B frame; it becomes the current anchor
while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
curnonb--;
if (curnonb < idx)
break;
// 1. Cost of lastnonb when predicted from curnonb
estGroup.singleCost(curnonb, lastnonb, lastnonb);
memset(frames[curnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
bframes = lastnonb - curnonb - 1;
// B-pyramid: the middle B frame acts as a reference for the other B frames
if (m_param->bBPyramid && bframes > 1)
{
int middle = (bframes + 1) / 2 + curnonb;
estGroup.singleCost(curnonb, lastnonb, middle);
memset(frames[middle]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
while (i > curnonb)
{
// B frames after the middle reference (middle, lastnonb); those before it reference (curnonb, middle)
int p0 = i > middle ? middle : curnonb;
int p1 = i < middle ? middle : lastnonb;
if (i != middle)
{
estGroup.singleCost(p0, p1, i);
estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
}
i--;
}
estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
}
else
{
// every frame i between curnonb and lastnonb is a B frame referencing both
while (i > curnonb)
{
estGroup.singleCost(curnonb, lastnonb, i);
estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
i--;
}
}
// 2. Propagate the cost of lastnonb to curnonb
estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
lastnonb = curnonb;
}
if (!m_param->lookaheadDepth)
{
estGroup.singleCost(0, lastnonb, lastnonb);
estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
}
// 3. Turn the accumulated propagate cost into QP adjustments
cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
}
2.1 计算帧损失(singleCost)
singleCost()中调用了estimateFrameCost()计算帧的cost
/* Estimates the cost of frame b with references p0 / p1 by forwarding to
 * estimateFrameCost(). The thread-local data slot used is
 * m_tld[m_pool->m_numWorkers] when a thread pool exists, m_tld[0] otherwise. */
int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty)
{
    int tldIndex = 0;
    if (m_lookahead.m_pool)
        tldIndex = m_lookahead.m_pool->m_numWorkers;

    return estimateFrameCost(m_lookahead.m_tld[tldIndex], p0, p1, b, intraPenalty);
}
estimateFrameCost()的定义如下
/*
Estimates the cost of frame b, where p0 is the index of the forward (L0)
reference, p1 the index of the backward (L1) reference and b the index of the
current frame.
p0 == p1 == b means no reference is used; p1 == b means only a forward
reference is used.
The result is cached per frame in costEst[b - p0][p1 - b], i.e. the cost of
frame b when predicted from p0 and p1.
NOTE(review): the original annotation adds that blocks of an I frame cost the
intra cost, of a P frame min(intra, inter) and of a B frame the inter cost;
that selection happens in estimateCUCost() and is not verifiable from this
function.
When bIntraPenalty is set the returned score is increased in proportion to
intraMbs[b - p0].
*/
int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty)
{
Lowres* fenc = m_frames[b];
x265_param* param = m_lookahead.m_param;
int64_t score = 0;
// Reuse the cached cost when one is stored (costEst >= 0 and the first rowSatds entry is not -1)
if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
score = fenc->costEst[b - p0][p1 - b];
else
{
bool bDoSearch[2];
// A list needs a motion search while its first stored MV still holds 0x7FFF
// (presumably the "not searched yet" marker — confirm)
bDoSearch[0] = fenc->lowresMvs[0][b - p0][0].x == 0x7FFF; // forward (L0) search needed
bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFF; // backward (L1) search needed
#if CHECKED_BUILD
X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0][0].x == 0x7FFE), "motion search batch duplication L0\n");
X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFE), "motion search batch duplication L1\n");
if (bDoSearch[0]) fenc->lowresMvs[0][b - p0][0].x = 0x7FFE;
if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b][0].x = 0x7FFE;
#endif
fenc->weightedRef[b - p0].isWeighted = false;
// Weighted prediction: analyse the weights against the forward reference before searching
if (param->bEnableWeightedPred && bDoSearch[0])
tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);
fenc->costEst[b - p0][p1 - b] = 0;
fenc->costEstAq[b - p0][p1 - b] = 0;
// Cooperative (multi-slice) path: only outside batch mode and with more than one cooperative slice
if (!m_batchMode && m_lookahead.m_numCoopSlices > 1 && ((p1 > b) || bDoSearch[0] || bDoSearch[1]))
{
/* Use cooperative mode if a thread pool is available and the cost estimate is
* going to need motion searches or bidir measurements */
memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);
m_lock.acquire();
X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
m_coop.p0 = p0;
m_coop.p1 = p1;
m_coop.b = b;
m_coop.bDoSearch[0] = bDoSearch[0];
m_coop.bDoSearch[1] = bDoSearch[1];
m_jobTotal = m_lookahead.m_numCoopSlices;
m_jobAcquired = 0;
m_lock.release();
tryBondPeers(*m_lookahead.m_pool, m_jobTotal);
processTasks(-1);
waitForExit();
// Sum the per-slice results into the frame totals
for (int i = 0; i < m_lookahead.m_numCoopSlices; i++)
{
fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst;
fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq;
if (p1 == b) // no backward reference: also accumulate the intra block count
fenc->intraMbs[b - p0] += m_slice[i].intraMbs;
}
}
else
{
/* Calculate MVs for 1/16th resolution*/
bool lastRow;
if (param->bEnableHME)
{
lastRow = true;
for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)
{
for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);
lastRow = false;
}
}
/*
Estimate the cost of every CU, visiting the CUs in reverse raster order
(bottom-right CU first, top-left CU last).
NOTE(review): the original annotation attributes this order to the rationale
given in x264's slicetype_slice_cost (the MVs are later used as predictors in
the main encode and reverse order improves their overall quality); the
remaining explanations in the original note were the annotator's own guesses
and are not verifiable from this code.
*/
lastRow = true;
for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
{
fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);
lastRow = false;
}
}
score = fenc->costEst[b - p0][p1 - b];
// Frames with a backward reference get their cost scaled by 100 / (130 + bFrameBias)
if (b != p1)
score = score * 100 / (130 + param->bFrameBias);
fenc->costEst[b - p0][p1 - b] = score;
}
if (bIntraPenalty)
// arbitrary penalty for I-blocks after B-frames
score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8);
return score;
}
estimateCUCost()的定义如下
void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
{
Lowres *fref0 = m_frames[p0];
Lowres *fref1 = m_frames[p1];
Lowres *fenc = m_frames[b];
ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;
// 如果是hme,块大小为4x4,否则为8x8
const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
const int bBidir = (b < p1);
const int cuXY = cuX + cuY * widthInCU;
const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
const int cuSize = X265_LOWRES_CU_SIZE;
const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);
if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
int lowresPenalty = 4;
int listDist[2] = { b - p0, p1 - b};
MV mvmin, mvmax;
int bcost = tld.me.COST_MAX;
int listused = 0;
// TODO: restrict to slices boundaries
// establish search bounds that don't cross extended frame boundaries
mvmin.x = (int32_t)(-cuX * cuSize - 8);
mvmin.y = (int32_t)(-cuY * cuSize - 8);
mvmax.x = (int32_t)((widthInCU - cuX - 1) * cuSize + 8);
mvmax.y = (int32_t)((heightInCU - cuY - 1) * cuSize + 8);
for (int i = 0; i < 1 + bBidir; i++)
{
int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
int skipCost = INT_MAX;
// 不进行搜索,直接对比cost
if (!bDoSearch[i])
{
COPY2_IF_LT(bcost, fencCost, listused, i + 1);
continue;
}
int numc = 0;
MV mvc[5], mvp;
// 如果使用hme搜索,则使用resmvs,即残差mv
MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
ReferencePlanes* fref = i ? fref1 : wfref0;
/* Reverse-order MV prediction */
#define MVC(mv) mvc[numc++] = mv;
// 将mv填充到mvc中
if (cuX < widthInCU - 1)
MVC(fencMV[1]); // 填充右侧块的MV
if (!lastRow)
{
MVC(fencMV[widthInCU]); // 填充下方块的MV
if (cuX > 0)
MVC(fencMV[widthInCU - 1]); // 填充左下方块的MV
if (cuX < widthInCU - 1)
MVC(fencMV[widthInCU + 1]); // 填充右下方块的MV
}
if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
{
MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
}
#undef MVC
// 如果无可用mv,则mvp设置为0
if (!numc)
mvp = 0;
else
{
ALIGN_VAR_32(pixel, subpelbuf[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
int mvpcost = MotionEstimate::COST_MAX;
/* measure SATD cost of each neighbor MV (estimating merge analysis)
* and use the lowest cost MV as MVP (estimating AMVP). Since all
* mvc[] candidates are measured here, none are passed to motionEstimate */
// 对相邻mv评估SATD的损失,最佳的mv存储到mvp中
for (int idx = 0; idx < numc; idx++)
{
intptr_t stride = X265_LOWRES_CU_SIZE;
// 低分辨率的运动补偿
pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
// 计算SATD
int cost = tld.me.bufSATD(src, stride);
COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
/* Except for mv0 case, everyting else is likely to have enough residual to not trigger the skip. */
if (!mvp.notZero() && bBidir)
skipCost = cost;
}
}
// 基于前面获取的最佳mv,进行运动估计
int searchRange = m_lookahead.m_param->bEnableHME ? (hme ? m_lookahead.m_param->hmeRange[0] : m_lookahead.m_param->hmeRange[1]) : s_merange;
/* ME will never return a cost larger than the cost @MVP, so we do not
* have to check that ME cost is more than the estimated merge cost */
if(!hme)
fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices);
else
fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
if (skipCost < 64 && skipCost < fencCost && bBidir)
{
fencCost = skipCost;
*fencMV = 0;
}
COPY2_IF_LT(bcost, fencCost, listused, i + 1);
}
if (hme)
return;
if (bBidir) /* B, also consider bidir */
{
/* NOTE: the wfref0 (weightp) is not used for BIDIR */
/* avg(l0-mv, l1-mv) candidate */
ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
/* coloc candidate */
src0 = fref0->lowresPlane[0] + pelOffset;
src1 = fref1->lowresPlane[0] + pelOffset;
primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
bcost += lowresPenalty;
}
else /* P, also consider intra */
{
bcost += lowresPenalty;
if (fenc->intraCost[cuXY] < bcost)
{
bcost = fenc->intraCost[cuXY];
listused = 0;
}
}
/* do not include edge blocks in the frame cost estimates, they are not very accurate */
// 不能将边界块纳入计算,因为不准确
const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
int bcostAq;
if (m_lookahead.m_param->rc.qgSize == 8)
bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8) : bcost;
else
bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;
if (bFrameScoreCU)
{
if (slice < 0)
{
fenc->costEst[b - p0][p1 - b] += bcost;
fenc->costEstAq[b - p0][p1 - b] += bcostAq;
if (!listused && !bBidir)
fenc->intraMbs[b - p0]++;
}
else
{
m_slice[slice].costEst += bcost;
m_slice[slice].costEstAq += bcostAq;
if (!listused && !bBidir)
m_slice[slice].intraMbs++;
}
}
fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}
2.2 计算传播损失(estimateCUPropagate)
函数的主要功能是根据前面获得的inter与intra cost来计算传播损失,传播损失计算的公式为
$$cost = \left(propagateIn + \left(\left(intraCost \times invqscale \times fpsFactor\right) \gg 8\right)\right) \times \left(1 - \frac{interCost}{intraCost}\right)$$
这个公式表示的意思是:
(1)当前CU的cost,与propagateIn呈正相关(或者说别的CU赋予当前CU的重要程度)。如果别的CU认为当前CU很重要,则当前CU应该被认为是重要的,应该以较高质量编码
(2)当前CU的cost,与intraCost呈正相关。如果intraCost比较大,说明纹理比较复杂,应该以较高质量编码。qscale表示一个与qp相关的因子,可以理解是一个调控因子
(3)当前CU的cost,与fpsFactor呈正相关。fpsFactor是标称帧时长(fpsDenom / fpsNum)与平均帧时长(averageDuration)之比(两者都先经过CLIP_DURATION限幅);该比值越大,即帧的持续时间相对越长,从人眼视觉来说,当前帧越重要
(4)当前CU的cost,与interCost和intraCost之间的关系有关,只有当interCost小于intraCost时,传播损失才大于0,并且只有传播损失大于0时才会被使用。如果interCost远小于intraCost,说明视频前后的图像很相似,使用Inter模式带来的损失很小,如果将当前CU以高质量编码,后续CU编码损失会很小。在这种情况下,传播损失会比较大,即当前CU的重要程度很高
PS:如果interCost大于intraCost,说明当前CU直接使用Intra模式效果更好,而cuTree是面向Inter模式的一种技术,这种情况下的传播损失设置为0
// Computes the CU-level propagation cost of frame b from the inter and intra costs gathered
// earlier, and adds it onto the propagateCost arrays of the reference frames p0 and p1.
// If VBV and lookahead are enabled and b is referenced, cuTreeFinish() is run on b afterwards.
void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
{
// Lookahead operates on Lowres frames. refCosts[0] / refCosts[1] are the accumulation targets
// for the list-0 reference (p0) and the list-1 reference (p1).
uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
/*
b  : index of the current frame
p0 : index of b's forward reference frame
p1 : index of b's backward reference frame
*/
// Position of b between p0 and p1 as a rounded fraction scaled by 256.
// NOTE(review): divides by (p1 - p0); callers presumably never pass p0 == p1 -- confirm.
int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
// Bi-prediction weights out of 64; an even 32/32 split unless weighted bi-prediction is enabled.
int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
// Distance from b to each reference; used to select the matching lowresMvs plane.
int listDist[2] = { b - p0, p1 - b };
// m_scratch receives one row of propagate amounts at a time.
memset(m_scratch, 0, m_8x8Width * sizeof(int));
uint16_t *propagateCost = frames[b]->propagateCost;
// presumably resets the x87/MMX state before the floating-point math below -- helper not visible here
x265_emms();
// Nominal frame duration (fpsDenom / fpsNum) relative to the average duration, both clipped.
double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);
/* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
if (!referenced) // b is not referenced: zero a single row of its incoming propagate cost
memset(frames[b]->propagateCost, 0, m_8x8Width * sizeof(uint16_t));
int32_t strideInCU = m_8x8Width;
for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++)
{
int cuIndex = blocky * strideInCU;
/*
Compute the propagate amounts for one row; the result is written to m_scratch.
Inputs, as far as their names and the original notes indicate (the primitive itself is not
visible in this excerpt):
(1) propagateCost: cost handed to the current frame by other frames, i.e. its importance
(2) frames[b]->intraCost: intra cost of the current frame
(3) frames[b]->lowresCosts[b - p0][p1 - b]: inter cost of the current frame for this (p0, p1) pair
(4) frames[b]->invQscaleFactor / invQscaleFactor8x8: inverse qscale, a QP-related weighting factor
(5) fpsFactor: the frame-duration factor computed above
*/
if (m_param->rc.qgSize == 8)
primitives.propagateCost(m_scratch, propagateCost,
frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
frames[b]->invQscaleFactor8x8 + cuIndex, &fpsFactor, m_8x8Width);
else // NOTE(review): original note says this resolves to x265_mbtree_propagate_cost_avx2; depends on build/CPU -- confirm
primitives.propagateCost(m_scratch, propagateCost,
frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);
// Advance to the next input row only for referenced frames; otherwise the zeroed row is reused.
if (referenced)
propagateCost += m_8x8Width;
for (uint16_t blockx = 0; blockx < m_8x8Width; blockx++, cuIndex++)
{
int32_t propagate_amount = m_scratch[blockx]; // amount this block passes on to its references
/* Don't propagate for an intra block. */
if (propagate_amount > 0) // nothing to pass on when the amount is not positive
{
/* Access width-2 bitfield. */
int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
/* Follow the MVs to the previous frame(s). */
for (uint16_t list = 0; list < 2; list++)
{
if ((lists_used >> list) & 1)
{
#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) // saturating add: s = min(s + x, 65535)
int32_t listamount = propagate_amount;
/* Apply bipred weighting. */
if (lists_used == 3)
listamount = (listamount * bipredWeights[list] + 32) >> 6;
MV *mvs = frames[b]->lowresMvs[list][listDist[list]];
/* Early termination for simple case of mv0. */
if (!mvs[cuIndex].word) // zero MV: the whole amount goes to the co-located block
{
CLIP_ADD(refCosts[list][cuIndex], listamount);
continue;
}
/*
A non-zero MV generally points at an area that straddles up to four blocks of the
reference frame, so the amount is split between them:
+ ---- + ---- +
|  0   |  1   |
+ ---- + ---- +
|  2   |  3   |
+ ---- + ---- +
The arithmetic below treats 32 MV units as one block step: presumably quarter-pel MVs
over 8x8 lowres blocks (>> 2 to whole pixels, then >> 3 to block units, i.e. >> 5 overall).
*/
int32_t x = mvs[cuIndex].x;
int32_t y = mvs[cuIndex].y;
int32_t cux = (x >> 5) + blockx; // column of block 0, in block units
int32_t cuy = (y >> 5) + blocky; // row of block 0, in block units
int32_t idx0 = cux + cuy * strideInCU; // index of block 0
int32_t idx1 = idx0 + 1; // index of block 1
int32_t idx2 = idx0 + strideInCU; // index of block 2
int32_t idx3 = idx0 + strideInCU + 1; // index of block 3
// Fractional position inside the block (0..31); the four weights below sum to 1024.
x &= 31;
y &= 31;
int32_t idx0weight = (32 - y) * (32 - x); // weight of block 0
int32_t idx1weight = (32 - y) * x; // weight of block 1
int32_t idx2weight = y * (32 - x); // weight of block 2
int32_t idx3weight = y * x; // weight of block 3
/* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
* be counted. */
/*
Bounds handling for the four target blocks:
(1) all four inside the frame: add each weighted share directly
(2) some outside the frame: add only the shares whose block is inside
*/
if (cux < m_8x8Width - 1 && cuy < m_8x8Height - 1 && cux >= 0 && cuy >= 0) // all four blocks are inside the frame
{
CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
}
else /* Check offsets individually */
{
// at least one of the four blocks may lie outside the frame
if (cux < m_8x8Width && cuy < m_8x8Height && cux >= 0 && cuy >= 0) // block 0 is inside
CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
if (cux + 1 < m_8x8Width && cuy < m_8x8Height && cux + 1 >= 0 && cuy >= 0) // block 1 is inside
CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
if (cux < m_8x8Width && cuy + 1 < m_8x8Height && cux >= 0 && cuy + 1 >= 0) // block 2 is inside
CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
if (cux + 1 < m_8x8Width && cuy + 1 < m_8x8Height && cux + 1 >= 0 && cuy + 1 >= 0) // block 3 is inside
CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
}
}
}
}
}
}
// With VBV and lookahead enabled, turn the accumulated costs of a referenced frame into QP offsets now.
if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
}
2.3 计算qp调整量(cuTreeFinish)
函数的主要功能是基于前面已经获取的传播损失,来计算当前CU的qp调整量
/*
 * Converts the propagate costs accumulated on `frame` into per-block cuTree QP offsets.
 *
 * frame           : lowres frame whose qpCuTreeOffset buffer is updated
 * averageDuration : average frame duration, used to build the duration factor
 * ref0Distance    : when non-zero, selects weightedCostDelta[ref0Distance - 1] as an
 *                   additional bias on the log2 ratio (only if that delta is positive)
 *
 * With hevcAq enabled the work is delegated to computeCUTreeQpOffset(). Otherwise, for every
 * block with a non-zero scaled intra cost:
 *     qpCuTreeOffset = qpAqOffset - m_cuTreeStrength * (log2(intra + propagate) - log2(intra) + weightdelta)
 * so the larger the propagate cost is relative to the intra cost, the lower the resulting offset.
 * Blocks whose scaled intra cost is zero are left untouched.
 */
void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
{
    // hevcAq has its own implementation.
    if (m_param->rc.hevcAq)
    {
        computeCUTreeQpOffset(frame, averageDuration, ref0Distance);
        return;
    }

    // Average duration relative to the nominal frame duration (both clipped), scaled by 256.
    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);

    double weightdelta = 0.0;
    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);

    if (m_param->rc.qgSize != 8)
    {
        // One offset per lowres block.
        for (int blk = 0; blk < m_cuCount; blk++)
        {
            int intracost = (frame->intraCost[blk] * frame->invQscaleFactor[blk] + 128) >> 8;
            if (!intracost)
                continue; // nothing to scale against; keep the existing offset

            int propagateCost = (frame->propagateCost[blk] * fpsFactor + 128) >> 8;
            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
            frame->qpCuTreeOffset[blk] = frame->qpAqOffset[blk] - m_cuTreeStrength * log2_ratio;
        }
        return;
    }

    // qgSize == 8: each lowres block maps onto a 2x2 group of entries in the offset buffers,
    // whose rows are maxBlocksInRowFullRes entries apart.
    for (int row = 0; row < m_8x8Height; row++)
    {
        for (int col = 0; col < m_8x8Width; col++)
        {
            const int lowresIdx = col + row * m_8x8Width;
            int intracost = ((frame->intraCost[lowresIdx]) / 4 * frame->invQscaleFactor8x8[lowresIdx] + 128) >> 8;
            if (!intracost)
                continue;

            int propagateCost = ((frame->propagateCost[lowresIdx]) / 4 * fpsFactor + 128) >> 8;
            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;

            const int upper = col * 2 + row * m_8x8Width * 4;            // top-left entry of the 2x2 group
            const int lower = upper + frame->maxBlocksInRowFullRes;      // entry directly below it

            frame->qpCuTreeOffset[upper]     = frame->qpAqOffset[upper]     - m_cuTreeStrength * log2_ratio;
            frame->qpCuTreeOffset[upper + 1] = frame->qpAqOffset[upper + 1] - m_cuTreeStrength * log2_ratio;
            frame->qpCuTreeOffset[lower]     = frame->qpAqOffset[lower]     - m_cuTreeStrength * log2_ratio;
            frame->qpCuTreeOffset[lower + 1] = frame->qpAqOffset[lower + 1] - m_cuTreeStrength * log2_ratio;
        }
    }
}
3.qpOffset的使用
通过前面的AQ和cuTree获得了qpOffset等信息,在实际编码过程中会被使用到,粗略来说,可能有几种用法:
(1)直接使用qp进行CU级调整(例如calculateQpforCuSize)
(2)调整行级平均qp(与bOptCUDeltaQP相关)
(3)调整lowres的帧级cost,影响码控(与VBV相关,例如getEstimatedPictureCost)
在这几种使用中,简单记录一下calculateQpforCuSize()
int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
{
FrameData& curEncData = *m_frame->m_encData;
double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;
if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
{
x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
&& distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
qp += distortionData->offset[ctu.m_cuAddr];
}
// analysisLoadReuseLevel默认为0
if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
{
int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
if (ctu.m_slice->m_sliceType == I_SLICE)
return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
else
return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
}
// 是否使用hevcAq
if (m_param->rc.hevcAq)
{
/* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
double dQpOffset = 0;
if (bCuTreeOffset)
{
dQpOffset = cuTreeQPOffset(ctu, cuGeom);
}
else
{
dQpOffset = aqQPOffset(ctu, cuGeom);
if (complexCheck)
{
int32_t offset = (int32_t)(dQpOffset * 100 + .5);
double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
int32_t max_threshold = (int32_t)(threshold * 100 + .5);
return (offset < max_threshold);
}
}
qp += dQpOffset;
}
else
{
int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
/* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
if (qpoffs)
{
uint32_t width = m_frame->m_fencPic->m_picWidth;
uint32_t height = m_frame->m_fencPic->m_picHeight;
uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
double dQpOffset = 0;
uint32_t cnt = 0;
// 遍历16x16小块,从中取出原先计算好的qpOffset
for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
{
for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
{
uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
dQpOffset += qpoffs[idx];
cnt++;
}
}
dQpOffset /= cnt;
qp += dQpOffset; // 进行qp的调整
// complexCheck默认为 -1
if (complexCheck)
{
int32_t offset = (int32_t)(dQpOffset * 100 + .5);
double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
int32_t max_threshold = (int32_t)(threshold * 100 + .5);
return (offset < max_threshold);
}
}
}
// 对qp进行clip,防止溢出
return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
}
4.AQ技术和cuTree技术之间的关联
AQ技术和cuTree技术都是CU级别的码率控制技术,其中AQ技术主要思想是基于帧内图像的空域相关性来调整qp,cuTree技术主要思想是基于帧间图像的时域相关性来调整qp。在实际编码过程中,通常是先计算AQ再计算cuTree,两者之间的影响关系如下所示,位于encoder\encoder.cpp中
/*
Encoder::configure(),位于encoder.cpp中
(1)如果不启用aq,但启用了cuTree,则会强制设置aqMode = X265_AQ_VARIANCE
(2)如果aqStrength为0,同时不启用cuTree,则aqMode = X265_AQ_NONE
(3)如果不启用aq和cuTree,则aqStrength = 0
*/
if (p->rc.aqMode == 0 && p->rc.cuTree)
{
p->rc.aqMode = X265_AQ_VARIANCE;
p->rc.aqStrength = 0.0;
}
if (p->rc.aqStrength == 0 && p->rc.cuTree == 0)
{
p->rc.aqMode = X265_AQ_NONE;
p->rc.hevcAq = 0;
}
if (p->rc.aqMode == X265_AQ_NONE && p->rc.cuTree == 0)
p->rc.aqStrength = 0;
5.hevcAq模式
这个模式是后续提出来的,单独面向265标准的一种qp计算模式,默认不会启用。这种模式的主要思想是考虑一帧之内的纹理复杂度(用方差描述),利用单个块和整帧平均值来计算qpOffset。在这个模式下,复杂度的计算使用xPreanalyze()实现,qpOffset的计算使用computeCUTreeQpOffset()实现
5.1 复杂度的计算(xPreanalyze)
xPreanalyze()函数被calcAdaptiveQuantFrame()调用,简单来说就是按照不同的粒度对当前帧进行分析,分析的依据是方差,根据方差来对CU进行qp的调整
// hevcAq pre-analysis of one frame: for every enabled AQ layer, measures a variance-based
// activity per block and the layer's average activity, then calls xPreanalyzeQp() to turn the
// activities into QP offsets, and finally fills invQscaleFactor from the finest layer's offsets.
void LookaheadTLD::xPreanalyze(Frame* curFrame)
{
const uint32_t width = curFrame->m_fencPic->m_picWidth;
const uint32_t height = curFrame->m_fencPic->m_picHeight;
const intptr_t stride = curFrame->m_fencPic->m_stride;
// 1. Measure the variance (texture complexity) at each enabled granularity.
for (uint32_t d = 0; d < 4; d++)
{
// Row selector into aqLayerDepth; per the original notes maxCUSize defaults to 64, giving 0.
int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
// log2 distance between the CTU size and the quantization-group size; per the original
// notes qgSize defaults to 32, giving 1 with a 64x64 CTU.
int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];
/*
How aqLayerDepth[ctuSizeIdx][aqDepth][d] is read (table reproduced from the original notes;
its definition is not part of this excerpt):
(1) The table:
static const uint32_t aqLayerDepth[3][4][4] =
{
{ // ctu size 64
{ 1, 0, 1, 0 }, aqDepth = 0
{ 1, 1, 1, 0 }, aqDepth = 1
{ 1, 1, 1, 0 }, aqDepth = 2
{ 1, 1, 1, 1 } aqDepth = 3
},
{ // ctu size 32
{ 1, 1, 0, 0 },
{ 1, 1, 0, 0 },
{ 1, 1, 1, 0 },
{ 0, 0, 0, 0 },
},
{ // ctu size 16
{ 1, 0, 0, 0 },
{ 1, 1, 0, 0 },
{ 0, 0, 0, 0 },
{ 0, 0, 0, 0 }
}
};
(2) Example: aqLayerDepth[0][1][0] = 1 means
(a) ctuSizeIdx = 0: maxCUSize is 64
(b) aqDepth = 1: qgSize is 32
(c) d = 0: the layer currently being analysed is layer 0
aqLayerDepth[0][1] = {1, 1, 1, 0} enables layers 0, 1 and 2; each layer has its own
analysis block size.
*/
if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
continue;
const pixel* src = curFrame->m_fencPic->m_picOrg[0];; // plane 0, presumably luma
PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[d]; // layer for this granularity
const uint32_t aqPartWidth = pQPLayer->aqPartWidth; // block width of this layer (original notes: 64, 32 or 16)
const uint32_t aqPartHeight = pQPLayer->aqPartHeight; // block height of this layer (original notes: 64, 32 or 16)
double* pcAQU = pQPLayer->dActivity;
double dSumAct = 0.0;
// Visit every block of the frame at this granularity; blocks on the right/bottom edge are cropped.
for (uint32_t y = 0; y < height; y += aqPartHeight)
{
const uint32_t currAQPartHeight = X265_MIN(aqPartHeight, height - y);
for (uint32_t x = 0; x < width; x += aqPartWidth, pcAQU++)
{
const uint32_t currAQPartWidth = X265_MIN(aqPartWidth, width - x);
const pixel* pBlkY = &src[x];
uint64_t sum[4] = { 0, 0, 0, 0 };
uint64_t sumSq[4] = { 0, 0, 0, 0 };
uint32_t by = 0;
/*
sum[] / sumSq[] hold the pixel sum and sum of squares of the four quadrants:
+---+---+
| 0 | 1 |
+---+---+
| 2 | 3 |
+---+---+
*/
// Upper half of the block: quadrants 0 (left) and 1 (right).
for (; by < currAQPartHeight >> 1; by++)
{
uint32_t bx = 0;
for (; bx < currAQPartWidth >> 1; bx++)
{
sum[0] += pBlkY[bx];
sumSq[0] += pBlkY[bx] * pBlkY[bx];
}
for (; bx < currAQPartWidth; bx++)
{
sum[1] += pBlkY[bx];
sumSq[1] += pBlkY[bx] * pBlkY[bx];
}
pBlkY += stride;
}
// Lower half of the block: quadrants 2 (left) and 3 (right).
for (; by < currAQPartHeight; by++)
{
uint32_t bx = 0;
for (; bx < currAQPartWidth >> 1; bx++)
{
sum[2] += pBlkY[bx];
sumSq[2] += pBlkY[bx] * pBlkY[bx];
}
for (; bx < currAQPartWidth; bx++)
{
sum[3] += pBlkY[bx];
sumSq[3] += pBlkY[bx] * pBlkY[bx];
}
pBlkY += stride;
}
// The per-quadrant pixel count below assumes both block dimensions are even.
assert((currAQPartWidth & 1) == 0);
assert((currAQPartHeight & 1) == 0);
const uint32_t pixelWidthOfQuadrants = currAQPartWidth >> 1;
const uint32_t pixelHeightOfQuadrants = currAQPartHeight >> 1;
// Number of pixels in each quadrant.
const uint32_t numPixInAQPart = pixelWidthOfQuadrants * pixelHeightOfQuadrants;
double dMinVar = MAX_DOUBLE;
// Mean and variance (E[x^2] - E[x]^2) of each quadrant; keep the smallest variance.
if (numPixInAQPart != 0)
{
for (int i = 0; i < 4; i++)
{
const double dAverage = double(sum[i]) / numPixInAQPart;
const double dVariance = double(sumSq[i]) / numPixInAQPart - dAverage * dAverage;
dMinVar = X265_MIN(dMinVar, dVariance);
}
}
else
{
dMinVar = 0.0;
}
double dActivity = 1.0 + dMinVar;
// Store the block's activity (1 + smallest quadrant variance).
*pcAQU = dActivity;
dSumAct += dActivity;
}
// Move to the first pixel row of the next block row.
src += stride * currAQPartHeight;
}
// Average activity of this layer.
const double dAvgAct = dSumAct / (pQPLayer->numAQPartInWidth * pQPLayer->numAQPartInHeight);
pQPLayer->dAvgActivity = dAvgAct;
}
// 2. Derive the QP offsets of every enabled layer from the activities.
xPreanalyzeQp(curFrame);
// Layer selected by minAQDepth (described as the finest granularity in the original notes).
int minAQDepth = curFrame->m_lowres.pAQLayer->minAQDepth;
PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[minAQDepth];
const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
double* pcQP = pQPLayer->dQpOffset;
// Use new qp offset values for qpAqOffset, qpCuTreeOffset and invQscaleFactor buffer
// The loop below converts each offset of the selected layer into invQscaleFactor via x265_exp2fix8.
// NOTE(review): despite the comment above, only invQscaleFactor is written here; qpAqOffset and
// qpCuTreeOffset are not assigned in this loop (what acEnergyCu updates is not visible here).
int blockXY = 0;
for (uint32_t y = 0; y < height; y += aqPartHeight)
{
for (uint32_t x = 0; x < width; x += aqPartWidth, pcQP++)
{
curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(*pcQP);
blockXY++;
acEnergyCu(curFrame, x, y, curFrame->m_param->internalCsp, curFrame->m_param->rc.qgSize);
}
}
}
xPreanalyzeQp()的定义如下
/*
 * Converts the per-block activities measured by xPreanalyze() into QP offsets, for every
 * AQ layer enabled in aqLayerDepth for the current CTU size / quantization-group size.
 *
 * For each block:
 *     dNormAct  = (s * act + avg) / (act + s * avg),   s = 2^(qpAdaptationRange / 6)
 *     dQpOffset = 6 * log2(dNormAct)
 * where act is the block's activity and avg the layer's average activity. A block that is
 * more active than the layer average gets dNormAct > 1 and therefore a positive offset; a
 * less active block gets a negative one. Worked example with s = 2^(1/6) ~= 1.1225:
 *     act = 10, avg = 5  -> dNormAct ~= 1.04  -> offset > 0
 *     act = 5,  avg = 10 -> dNormAct ~= 0.96  -> offset < 0
 * NOTE(review): where the offset is consumed it is added to the QP, so a positive offset
 * should mean coarser quantization for busy blocks -- confirm against aqQPOffset().
 *
 * The offset is written both to dQpOffset and, as the initial value, to dCuTreeOffset.
 */
void LookaheadTLD::xPreanalyzeQp(Frame* curFrame)
{
    const uint32_t width = curFrame->m_fencPic->m_picWidth;
    const uint32_t height = curFrame->m_fencPic->m_picHeight;

    // Loop-invariant values, computed once instead of once per layer / per block.
    const int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
    const int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];
    // Maximum qscale ratio allowed by the adaptation range (the original notes give 1.0 as
    // the default range).
    const double dMaxQScale = pow(2.0, curFrame->m_param->rc.qpAdaptationRange / 6.0);
    const double log2Of2 = X265_LOG2(2.0);

    for (uint32_t d = 0; d < 4; d++)
    {
        if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
            continue;

        PicQPAdaptationLayer* pcAQLayer = &curFrame->m_lowres.pAQLayer[d];
        const uint32_t aqPartWidth = pcAQLayer->aqPartWidth;
        const uint32_t aqPartHeight = pcAQLayer->aqPartHeight;
        double* pcAQU = pcAQLayer->dActivity;
        double* pcQP = pcAQLayer->dQpOffset;
        double* pcCuTree = pcAQLayer->dCuTreeOffset;

        // Average activity of this layer; not modified while the offsets are derived.
        const double dAvgAct = pcAQLayer->dAvgActivity;

        // One offset per block of this layer, in raster order.
        for (uint32_t y = 0; y < height; y += aqPartHeight)
        {
            for (uint32_t x = 0; x < width; x += aqPartWidth, pcAQU++, pcQP++, pcCuTree++)
            {
                // Activity of the current block.
                const double dCUAct = *pcAQU;

                const double dNormAct = (dMaxQScale*dCUAct + dAvgAct) / (dCUAct + dMaxQScale*dAvgAct);
                const double dQpOffset = (X265_LOG2(dNormAct) / log2Of2) * 6.0;

                *pcQP = dQpOffset;
                *pcCuTree = dQpOffset;
            }
        }
    }
}
5.2 qpOffset的计算(computeCUTreeQpOffset)
computeCUTreeQpOffset()被cuTreeFinish()调用,实现了hevcAq模式下的qpOffset的计算,其定义如下
/*
 * hevcAq counterpart of the offset computation in cuTreeFinish(): for every enabled AQ layer,
 * derives each block's cuTree offset from the propagate and intra costs of the
 * loopIncr x loopIncr lowres blocks it covers.
 *
 * frame           : lowres frame providing intraCost / propagateCost / invQscaleFactor and
 *                   the per-layer dQpOffset / dCuTreeOffset buffers
 * averageDuration : average frame duration, used to build the duration factor
 * ref0Distance    : when non-zero, selects weightedCostDelta[ref0Distance - 1] as an
 *                   additional bias (only if that delta is positive)
 *
 * Per AQ block:
 *     dCuTreeOffset = dQpOffset - m_cuTreeStrength * mean(log2(intra + propagate) - log2(intra) + weightdelta)
 *
 * Lowres blocks whose scaled intra cost is zero contribute nothing to the mean. This mirrors
 * the `if (intracost)` guard in cuTreeFinish() and keeps log2(0) from turning the stored
 * offset into +/-inf or NaN.
 */
void Lookahead::computeCUTreeQpOffset(Lowres *frame, double averageDuration, int ref0Distance)
{
    // Average duration relative to the nominal frame duration (both clipped), scaled by 256.
    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

    double weightdelta = 0.0;
    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);

    uint32_t widthFullRes = frame->widthFullRes;
    uint32_t heightFullRes = frame->heightFullRes;

    if (m_param->rc.qgSize == 8)
    {
        // ... (the 8x8 quantization-group path is elided in this excerpt)
    }
    else
    {
        // Layer-table indices do not change between layers, so compute them once.
        const int ctuSizeIdx = 6 - g_log2Size[m_param->maxCUSize];
        const int aqDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];

        // Visit every enabled AQ layer.
        for (uint32_t d = 0; d < 4; d++)
        {
            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
                continue;

            PicQPAdaptationLayer* pQPLayer = &frame->pAQLayer[d];
            const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
            const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
            const uint32_t numAQPartInWidth = pQPLayer->numAQPartInWidth;
            const uint32_t numAQPartInHeight = pQPLayer->numAQPartInHeight;
            double* pcQP = pQPLayer->dQpOffset;
            double* pcCuTree = pQPLayer->dCuTreeOffset;
            uint32_t maxCols = frame->maxBlocksInRow;

            for (uint32_t y = 0; y < numAQPartInHeight; y++)
            {
                for (uint32_t x = 0; x < numAQPartInWidth; x++, pcQP++, pcCuTree++)
                {
                    uint32_t block_x = x * aqPartWidth;
                    uint32_t block_y = y * aqPartHeight;
                    uint32_t blockXY = 0;   // number of lowres blocks covered by this AQ block
                    double log2_ratio = 0;  // sum of the per-block log2 ratios

                    for (uint32_t block_yy = block_y; block_yy < block_y + aqPartHeight && block_yy < heightFullRes; block_yy += loopIncr)
                    {
                        for (uint32_t block_xx = block_x; block_xx < block_x + aqPartWidth && block_xx < widthFullRes; block_xx += loopIncr)
                        {
                            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                            int intraCost = (frame->intraCost[idx] * frame->invQscaleFactor[idx] + 128) >> 8;
                            blockXY++;

                            // log2(0) is not finite; a block without intra cost adds no adjustment.
                            if (!intraCost)
                                continue;

                            int propagateCost = (frame->propagateCost[idx] * fpsFactor + 128) >> 8;
                            log2_ratio += (X265_LOG2(intraCost + propagateCost) - X265_LOG2(intraCost) + weightdelta);
                        }
                    }

                    // Nothing covered (not expected): fall back to the plain AQ offset
                    // instead of dividing by zero.
                    if (!blockXY)
                    {
                        *pcCuTree = *pcQP;
                        continue;
                    }

                    // Mean adjustment over the covered blocks, scaled by the cuTree strength.
                    double qp_offset = (m_cuTreeStrength * log2_ratio) / blockXY;
                    *pcCuTree = *pcQP - qp_offset;
                }
            }
        }
    }
}