当前位置: 首页 > article >正文

【x265】码率控制模块的简单分析—块级码控工具(AQ和cuTree)

目录

  • 1. 自适应QP技术(Adaptive QP)
    • 1.2 图像纹理的检测(edgeFilter)
    • 1.3 计算AC energy(acEnergyCu)
    • 1.4 计算梯度密度(edgeDensityCu)
  • 2.宏块树(cuTree)
    • 2.1 计算帧损失(singleCost)
    • 2.2 计算传播损失(estimateCUPropagate)
    • 2.3 计算qp调整量(cuTreeFinish)
  • 3.qpOffset的使用
  • 4.AQ技术和cuTree技术之间的关联
  • 5.hevcAq模式
    • 5.1 复杂度的计算(xPreanalyze)
    • 5.2 qpOffset的计算(computeCUTreeQpOffset)

x265相关:
【x265】x265编码器参数配置
【x265】预测模块的简单分析—帧内预测
【x265】预测模块的简单分析—帧间预测

1. 自适应QP技术(Adaptive QP)

在x265当中定义了5种AQ模式,较之于x264而言,新增了AQ_EDGE这种模式

#define X265_AQ_NONE                 0	// AQ disabled
#define X265_AQ_VARIANCE             1	// variance mode: only the current block is considered
#define X265_AQ_AUTO_VARIANCE        2	// auto-variance mode: the blocks of the whole frame are considered
#define X265_AQ_AUTO_VARIANCE_BIASED 3  // auto-variance mode over the whole frame with an additional tunable bias term
#define X265_AQ_EDGE                 4	// edge mode

AQ模式的计算位于encoder/slicetype.cpp中,由calcAdaptiveQuantFrame()实现,主要的步骤为:

  1. 如果使用hevcAq,则使用xPreanalyze()去分析当前帧
  2. 如果使用常规AQ
    (1)如果使用X265_AQ_EDGE模式,则先进行滤波,检测边缘纹理(edgeFilter)
    (2)如果使用X265_AQ_AUTO_VARIANCE、X265_AQ_AUTO_VARIANCE_BIASED或X265_AQ_EDGE模式中的一种,则根据全局范围的纹理情况来计算qp_adj
     (a)如果是X265_AQ_EDGE模式,会计算边缘密度,依据边缘密度计算块级(默认为16x16)qp_adj
     (b)如果非X265_AQ_EDGE模式,根据AC energy计算块级qp_adj
     (c)将所有块的qp_adj平均计算,得到平均qp_adj
    (3)如果是X265_AQ_VARIANCE,根据strength调整qp
/*
 * Computes the adaptive-quantization (AQ) QP offset of every block of curFrame and stores
 * the results in curFrame->m_lowres (qpAqOffset, qpCuTreeOffset, invQscaleFactor and, for
 * qgSize == 8, invQscaleFactor8x8). When weighted prediction is enabled it also finalizes
 * the per-plane wp_ssd statistics, and when bDynamicRefine or bEnableFades is set it fills
 * blockVariance / frameVariance.
 *
 * curFrame: frame whose source picture (m_fencPic) is analysed and whose m_lowres is written.
 * param:    encoder parameters (rc.aqMode, rc.aqStrength, rc.qgSize, rc.hevcAq, ...).
 */
void LookaheadTLD::calcAdaptiveQuantFrame(Frame* curFrame, x265_param* param)
{
	/* Actual adaptive quantization */
	int maxCol = curFrame->m_fencPic->m_picWidth;
	int maxRow = curFrame->m_fencPic->m_picHeight;
	int blockCount, loopIncr;
	float modeOneConst, modeTwoConst;
	/*
		qgSize is the quantization group size. In this function qgSize == 8 selects 8x8
		analysis blocks (loopIncr = 8); any other value selects 16x16 blocks (loopIncr = 16).
		Background from the article author (not verifiable from this function):
		(1) the picture is split into fixed-size square NxN quantization groups; CUs with
			non-zero coefficients inside one group share a QP, different groups may differ
		(2) tuning qgSize lets different picture regions use different QPs, saving bitrate
		(3) qgSize presumably defaults to 32 -- TODO confirm
	*/
	if (param->rc.qgSize == 8)
	{
		blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
		modeOneConst = 11.427f;
		modeTwoConst = 8.f;
		loopIncr = 8;
	}
	else
	{
		blockCount = widthInCU * heightInCU;
		modeOneConst = 14.427f;
		modeTwoConst = 11.f;
		loopIncr = 16;
	}

	float* quantOffsets = curFrame->m_quantOffsets;
	/*
		Reset the per-plane sum / sum-of-squares accumulators kept in m_lowres.
		Background from the article author (not verifiable here): m_lowres is the
		low-resolution version of the frame used by the lookahead for scene detection,
		frame-type decisions and CU-tree analysis.
	*/
	for (int y = 0; y < 3; y++)
	{
		curFrame->m_lowres.wp_ssd[y] = 0;
		curFrame->m_lowres.wp_sum[y] = 0;
	}
	// Skip the offset computation when bStatRead (presumably: stats read from a file in
	// multi-pass encoding), cuTree and IS_REFERENCED(curFrame) all hold.
	if (!(param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
	{
		/* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
		// AQ mode is NONE or the AQ strength is 0
		if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) 
		{
			// An AQ mode is selected but its strength is 0: the offsets come only from
			// the user-supplied quantOffsets, or are zero when none were supplied.
			if (param->rc.aqMode && param->rc.aqStrength == 0)
			{
				if (quantOffsets)
				{
					for (int cuxy = 0; cuxy < blockCount; cuxy++)
					{
						curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
						curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
					}
				}
				else
				{
					memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
					memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
					for (int cuxy = 0; cuxy < blockCount; cuxy++)
						curFrame->m_lowres.invQscaleFactor[cuxy] = 256; // initialised to 256
				}
			}

			/* Need variance data for weighted prediction and dynamic refinement*/
			if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) // weighted prediction enabled?
			{
				// Called only for its side effect of accumulating wp_sum / wp_ssd.
				for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
					for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
						acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
			}
		}
		else // an AQ mode other than X265_AQ_NONE with non-zero strength
		{
			// 1. hevcAq selects a separate, HEVC-oriented AQ method
			if (param->rc.hevcAq)
			{
				// New method for calculating variance and qp offset
				xPreanalyze(curFrame);
			}
			else
			{	// 2. regular AQ
				int blockXY = 0, inclinedEdge = 0;
				double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
				double bias_strength = 0.f;
				double strength = 0.f;
				// 3. In X265_AQ_EDGE mode, run edgeFilter() first to detect the texture
				//    edges of the picture
				if (param->rc.aqMode == X265_AQ_EDGE)
					edgeFilter(curFrame, param);

				/*
					Defaults according to the article author (TODO confirm):
					aqMode             -> X265_AQ_AUTO_VARIANCE
					bHistBasedSceneCut -> 0
					recursionSkipMode  -> 1
				*/
				if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
				{
					pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
					// Copy the edge picture into m_edgeBitPic; judging by the primitive's
					// name and last argument, samples are right-shifted by SHIFT_TO_BITPLANE
					primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
						curFrame->m_fencPic->m_stride, curFrame->m_fencPic->m_picWidth, curFrame->m_fencPic->m_picHeight, SHIFT_TO_BITPLANE);
				}

				if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
				{
					double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8))); // bit-depth correction
					// 4. First pass: compute a raw qp_adj per block from its AC energy (or
					//    edge density) and accumulate the frame-level sums
					for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
					{
						for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
						{
							uint32_t energy, edgeDensity, avgAngle;
							// AC energy of the block
							energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
							if (param->rc.aqMode == X265_AQ_EDGE)
							{
								// Edge mode also computes the edge density of the block;
								// avgAngle receives the average edge angle of its pixels
								edgeDensity = edgeDensityCu(curFrame, avgAngle, blockX, blockY, param->rc.qgSize);
								if (edgeDensity)
								{	// derive qp_adj from the edge density
									qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1);
									//Increasing the QP of a block if its edge orientation lies around the multiples of 45 degree
									// i.e. avgAngle within 15 of EDGE_INCLINATION or within 15 of EDGE_INCLINATION + 90
									if ((avgAngle >= EDGE_INCLINATION - 15 && avgAngle <= EDGE_INCLINATION + 15) || (avgAngle >= EDGE_INCLINATION + 75 && avgAngle <= EDGE_INCLINATION + 105))
										curFrame->m_lowres.edgeInclined[blockXY] = 1; // block has an inclined edge
									else
										curFrame->m_lowres.edgeInclined[blockXY] = 0;
								}
								else // edge density is 0: fall back to the AC energy
								{
									qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
									curFrame->m_lowres.edgeInclined[blockXY] = 0;
								}
							}
							else // non-edge modes
								qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
							// Park the raw per-block value in qpCuTreeOffset; the second pass
							// below reads it back and overwrites it with the final offset
							curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
							avg_adj += qp_adj;
							avg_adj_pow2 += qp_adj * qp_adj;
							blockXY++;
						}
					}
					avg_adj /= blockCount;
					avg_adj_pow2 /= blockCount;
					// Scale the strength by the frame's mean raw qp_adj
					// (aqStrength presumably defaults to 1.0 -- TODO confirm)
					strength = param->rc.aqStrength * avg_adj;
					// Adjust avg_adj (modeTwoConst is 8.f for qgSize == 8, otherwise 11.f)
					avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
					bias_strength = param->rc.aqStrength;
				}
				else // X265_AQ_VARIANCE: strength is a fixed multiple of aqStrength
					strength = param->rc.aqStrength * 1.0397f;

				// 5. Second pass: turn the raw values into final QP offsets per AQ mode
				blockXY = 0;
				for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
				{
					for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
					{
						if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED) // auto-variance with bias term
						{
							qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
							// modeTwoConst is 8.f for qgSize == 8, otherwise 11.f
							qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
						}
						else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE) // auto-variance
						{
							qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
							qp_adj = strength * (qp_adj - avg_adj);
						}
						else if (param->rc.aqMode == X265_AQ_EDGE) // edge mode
						{
							inclinedEdge = curFrame->m_lowres.edgeInclined[blockXY];
							qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
							if (inclinedEdge && (qp_adj - avg_adj > 0)) 
								// Inclined-edge block whose raw value is above the adjusted
								// average: strengthen the offset by AQ_EDGE_BIAS
								// (0.5 according to the article author -- TODO confirm)
								qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj)); 
							else
								qp_adj = strength * (qp_adj - avg_adj);
						}
						else
						{	// X265_AQ_VARIANCE: offset from log2 of the block's own AC energy
							uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
							qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
						}

						// HDR10 optimisation: bias the offset by the block's average luma.
						// Dark blocks get a positive offset, bright blocks a negative one;
						// averages in [434, 501) match no branch and are left unchanged.
						if (param->bHDR10Opt)
						{
							uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
							uint32_t lumaAvg = sum / (loopIncr * loopIncr);
							if (lumaAvg < 301)
								qp_adj += 3;
							else if (lumaAvg >= 301 && lumaAvg < 367)
								qp_adj += 2;
							else if (lumaAvg >= 367 && lumaAvg < 434)
								qp_adj += 1;
							else if (lumaAvg >= 501 && lumaAvg < 567)
								qp_adj -= 1;
							else if (lumaAvg >= 567 && lumaAvg < 634)
								qp_adj -= 2;
							else if (lumaAvg >= 634 && lumaAvg < 701)
								qp_adj -= 3;
							else if (lumaAvg >= 701 && lumaAvg < 767)
								qp_adj -= 4;
							else if (lumaAvg >= 767 && lumaAvg < 834)
								qp_adj -= 5;
							else if (lumaAvg >= 834)
								qp_adj -= 6;
						}
						// Add the user-supplied per-block offset, if any
						if (quantOffsets != NULL)
							qp_adj += quantOffsets[blockXY];
						// Store the final offset
						curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
						curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
						curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
						blockXY++;
					}
				}
			}
		}

		// For qgSize == 8, average four invQscaleFactor entries (presumably the 2x2 group
		// of 8x8 blocks covering one 16x16 CU) into one invQscaleFactor8x8 entry.
		if (param->rc.qgSize == 8)
		{
			for (int cuY = 0; cuY < heightInCU; cuY++)
			{
				for (int cuX = 0; cuX < widthInCU; cuX++)
				{
					const int cuXY = cuX + cuY * widthInCU;
					curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
						curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
						curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
						curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
				}
			}
		}
	}
	// Weighted prediction enabled?
	if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
	{
		// The offset computation above was skipped in this case, so the wp_sum / wp_ssd
		// accumulators still have to be filled here.
		if (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame))
		{
			for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
				for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
					acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
		}

		int hShift = CHROMA_H_SHIFT(param->internalCsp);
		int vShift = CHROMA_V_SHIFT(param->internalCsp);
		// Round the picture dimensions to the nearest multiple of 16
		maxCol = ((maxCol + 8) >> 4) << 4;
		maxRow = ((maxRow + 8) >> 4) << 4;
		int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift };
		int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };

		// Per plane: subtract the rounded sum^2 / pixelCount term from the sum of squares
		for (int i = 0; i < 3; i++)
		{
			uint64_t sum, ssd;
			sum = curFrame->m_lowres.wp_sum[i];
			ssd = curFrame->m_lowres.wp_ssd[i];
			curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
		}
	}
	// Dynamic refinement or fade detection enabled?
	if (param->bDynamicRefine || param->bEnableFades)
	{
		// NOTE(review): maxCol / maxRow may have been rounded to a multiple of 16 by the
		// weighted-prediction branch above, and rowVariance is never reset between rows,
		// so each row adds the running total -- confirm both are intended.
		uint64_t blockXY = 0, rowVariance = 0;
		curFrame->m_lowres.frameVariance = 0;
		for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
		{
			for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
			{
				curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
				rowVariance += curFrame->m_lowres.blockVariance[blockXY];
				blockXY++;
			}
			curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
		}
		curFrame->m_lowres.frameVariance /= maxRow;
	}
}

从上面的代码中看,4种模式对应的qp_adj方式为:

X265_AQ_VARIANCE
(1)strength = param->rc.aqStrength * 1.0397f
(2)qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)))

X265_AQ_AUTO_VARIANCE
(1)qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
(2)strength = param->rc.aqStrength * avg_adj;(avg_adj为qp_adj的均值)
(3)avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj
(4)qp_adj = strength * (qp_adj - avg_adj)

X265_AQ_AUTO_VARIANCE_BIASED
(1)qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
(2)strength = param->rc.aqStrength * avg_adj;(avg_adj为qp_adj的均值)
(3)avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj
(4)bias_strength = param->rc.aqStrength;
(5)qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj))

X265_AQ_EDGE
(1)计算qp_adj初始值
如果edgeDensity不为0:qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1)
如果edgeDensity为0:qp_adj = pow(energy * bit_depth_correction + 1, 0.1)
(2)调整qp_adj的值
如果当前块的边缘方向为倾斜方向(edgeInclined为1),并且qp_adj - avg_adj > 0:qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj))
其他情况:qp_adj = strength * (qp_adj - avg_adj)

总体上看,X265_AQ_VARIANCE模式为aq调整的基础,如果考虑了当前帧中其他的块,变为X265_AQ_AUTO_VARIANCE模式。如果想要更精确地调控,可以增加一些调控因子,演变为X265_AQ_AUTO_VARIANCE_BIASED。如果考虑不局限于AC energy,增加梯度的检测,演变为X265_AQ_EDGE

1.2 图像纹理的检测(edgeFilter)

函数的主要功能是对当前帧进行图像纹理的检测,具体来说,会对输入图像进行高斯滤波,随后进行sobel滤波获得图像的边界纹理

/*
 * Prepares the edge data of curFrame that the X265_AQ_EDGE mode relies on:
 *   1. clears m_edgePic, m_gaussianPic and m_thetaPic,
 *   2. copies the source luma plane into m_edgePic and m_gaussianPic,
 *   3. smooths m_gaussianPic with a 5x5 Gaussian kernel, leaving a 2-pixel picture
 *      border unfiltered (those pixels keep the copied source value),
 *   4. calls computeEdge() on the smoothed plane to fill the edge map and the
 *      per-pixel angle plane.
 * An error is logged if computeEdge() reports failure.
 *
 * curFrame: frame providing the source picture (m_fencPic) and the three work planes.
 * param:    encoder parameters; only maxCUSize is read here.
 */
void edgeFilter(Frame *curFrame, x265_param* param)
{
    int height = curFrame->m_fencPic->m_picHeight;
    int width = curFrame->m_fencPic->m_picWidth;
    intptr_t stride = curFrame->m_fencPic->m_stride;
    uint32_t numCuInHeight = (height + param->maxCUSize - 1) / param->maxCUSize;
    int maxHeight = numCuInHeight * param->maxCUSize;

    // Clear the work planes over the CU-aligned height plus both vertical margins
    memset(curFrame->m_edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
    memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
    memset(curFrame->m_thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));

    // Pointers to the first picture pixel (past the margins) of each plane
    pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
    pixel *edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel *refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;

    // Seed both work planes with the source luma, row by row
    for (int i = 0; i < height; i++)
    {
        memcpy(edgePic + i * stride, src + i * stride, width * sizeof(pixel));
        memcpy(refPic + i * stride, src + i * stride, width * sizeof(pixel));
    }

    //Applying Gaussian filter on the picture
    // The loop bounds skip a 2-pixel border on every side, so the 5x5 window never
    // reads outside the picture. (The previous test used "!= height - 2" and
    // "!= width - 2", which still filtered the last row and column.)
    for (int rowNum = 2; rowNum < height - 2; rowNum++)
    {
        for (int colNum = 2; colNum < width - 2; colNum++)
        {
            /*  5x5 Gaussian filter
                [2   4   5   4   2]
             1  [4   9   12  9   4]
            --- [5   12  15  12  5]
            159 [4   9   12  9   4]
                [2   4   5   4   2]*/

            const intptr_t rowOne = (rowNum - 2)*stride, colOne = colNum - 2;
            const intptr_t rowTwo = (rowNum - 1)*stride, colTwo = colNum - 1;
            const intptr_t rowThree = rowNum * stride, colThree = colNum;
            const intptr_t rowFour = (rowNum + 1)*stride, colFour = colNum + 1;
            const intptr_t rowFive = (rowNum + 2)*stride, colFive = colNum + 2;
            const intptr_t index = (rowNum*stride) + colNum;

            // Weighted sum of the 5x5 neighbourhood, normalised by the kernel total (159)
            const pixel pixelValue = ((2 * src[rowOne + colOne] + 4 * src[rowOne + colTwo] + 5 * src[rowOne + colThree] + 4 * src[rowOne + colFour] + 2 * src[rowOne + colFive] +
                4 * src[rowTwo + colOne] + 9 * src[rowTwo + colTwo] + 12 * src[rowTwo + colThree] + 9 * src[rowTwo + colFour] + 4 * src[rowTwo + colFive] +
                5 * src[rowThree + colOne] + 12 * src[rowThree + colTwo] + 15 * src[rowThree + colThree] + 12 * src[rowThree + colFour] + 5 * src[rowThree + colFive] +
                4 * src[rowFour + colOne] + 9 * src[rowFour + colTwo] + 12 * src[rowFour + colThree] + 9 * src[rowFour + colFour] + 4 * src[rowFour + colFive] +
                2 * src[rowFive + colOne] + 4 * src[rowFive + colTwo] + 5 * src[rowFive + colThree] + 4 * src[rowFive + colFour] + 2 * src[rowFive + colFive]) / 159);
            refPic[index] = pixelValue;
        }
    }

    // Run the gradient (Sobel-style) filter on the smoothed picture
    if(!computeEdge(edgePic, refPic, edgeTheta, stride, height, width, true))
        x265_log(NULL, X265_LOG_ERROR, "Failed edge computation!\n");
}

computeEdge()的代码为

/*
 * Computes an edge map from refPic with a 3x3 gradient operator, skipping the one-pixel
 * picture border.
 *
 * edgePic:    output; each interior pixel is set to whitePixel when the gradient magnitude
 *             is at least EDGE_THRESHOLD, otherwise to 0.
 * refPic:     input plane the gradients are taken from.
 * edgeTheta:  output; gradient angle in degrees, negative angles shifted up by 180. Only
 *             written (and only required to be non-null) when bcalcTheta is true.
 * stride:     distance between rows, shared by all three planes.
 * height/width: picture size in pixels.
 * Returns false when a required buffer is null, true otherwise.
 */
bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
{
    // The angle plane is only mandatory when angles were requested
    if (!edgePic || !refPic || (bcalcTheta && !edgeTheta))
        return false;

    const int border = 1;
    const pixel blackPixel = 0;
    const int rowEnd = height - border;
    const int colEnd = width - border;

    //Applying the gradient filter to every pixel except the border pixels
    for (int y = border; y < rowEnd; y++)
    {
        const intptr_t cur = y * stride;
        const intptr_t above = cur - stride;
        const intptr_t below = cur + stride;

        for (int x = border; x < colEnd; x++)
        {
            /*  Horizontal and vertical gradients
                [ -3   0   3 ]        [-3   -10  -3 ]
            gH =[ -10  0   10]   gV = [ 0    0    0 ]
                [ -3   0   3 ]        [ 3    10   3 ]
                (same layout as the classic Sobel operator, with 3/10/3 weights) */
            const intptr_t left = x - border;
            const intptr_t mid = x;
            const intptr_t right = x + border;
            const intptr_t center = cur + mid;

            const float gradientH = (float)(-3 * refPic[above + left] + 3 * refPic[above + right] - 10 * refPic[cur + left] + 10 * refPic[cur + right] - 3 * refPic[below + left] + 3 * refPic[below + right]);
            const float gradientV = (float)(-3 * refPic[above + left] - 10 * refPic[above + mid] - 3 * refPic[above + right] + 3 * refPic[below + left] + 10 * refPic[below + mid] + 3 * refPic[below + right]);
            const float gradientMagnitude = sqrtf(gradientH * gradientH + gradientV * gradientV);

            if (bcalcTheta)
            {
                // Angle in degrees; negative angles are shifted up by 180
                const float radians = atan2(gradientV, gradientH);
                float theta = (float)((radians * 180) / PI);
                if (theta < 0)
                    theta = 180 + theta;
                edgeTheta[center] = (pixel)theta;
            }

            // Threshold the gradient magnitude into a two-level edge map
            edgePic[center] = (pixel)(gradientMagnitude >= EDGE_THRESHOLD ? whitePixel : blackPixel);
        }
    }
    return true;
}

1.3 计算AC energy(acEnergyCu)

函数计算了一个块(16x16或8x8)在所有平面(Y/Cb/Cr)上的AC energy之和

/*
 * Returns the AC energy of the block at (blockX, blockY), summed over the planes of the
 * source picture. Luma is always measured; the two chroma planes are added unless either
 * csp or the picture's own colour space is X265_CSP_I400.
 */
uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize)
{
    const intptr_t lumaStride = curFrame->m_fencPic->m_stride;
    const intptr_t chromaStride = curFrame->m_fencPic->m_strideC;
    const int hShift = CHROMA_H_SHIFT(csp);
    const int vShift = CHROMA_V_SHIFT(csp);

    // Luma plane
    const intptr_t lumaOffset = blockX + (blockY * lumaStride);
    uint32_t energy = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + lumaOffset, lumaStride, 0, csp, qgSize);

    // Chroma planes; the block position is scaled by the chroma subsampling shifts
    const bool hasChroma = csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400;
    if (hasChroma)
    {
        const intptr_t chromaOffset = (blockX >> hShift) + ((blockY >> vShift) * chromaStride);
        for (int plane = 1; plane <= 2; plane++)
            energy += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[plane] + chromaOffset, chromaStride, plane, csp, qgSize);
    }

    x265_emms();
    return energy;
}

计算单通道的AC energy

/*
 * Returns the AC energy of one block in a single plane (0 = Y, 1 = Cb, 2 = Cr).
 * Luma, and chroma of X265_CSP_I444 pictures, are measured directly on src with an 8x8
 * (qgSize == 8) or 16x16 block. Chroma of any other colour format is first copied into an
 * aligned 4x4 (qgSize == 8) or 8x8 scratch block and measured there.
 */
inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat, uint32_t qgSize)
{
    const bool smallGroup = (qgSize == 8);

    // Plane 0, or any plane of a 4:4:4 picture: measure in place
    if (!plane || colorFormat == X265_CSP_I444)
    {
        if (smallGroup)
            return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(src, srcStride), 6, plane);
        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
    }

    // Remaining case: chroma plane of a non-4:4:4 picture
    if (smallGroup)
    {
        // Small quantization group: 4x4 chroma block
        ALIGN_VAR_4(pixel, pix[4 * 4]);
        primitives.cu[BLOCK_4x4].copy_pp(pix, 4, src, srcStride);
        // var() supplies the packed value that acEnergyVar() splits into sum and ssd
        return acEnergyVar(curFrame, primitives.cu[BLOCK_4x4].var(pix, 4), 4, plane);
    }

    // Otherwise: 8x8 chroma block
    ALIGN_VAR_8(pixel, pix[8 * 8]);
    primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
    return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
}

acEnergyVar()的定义如下,由块的像素和(sum)与平方和(ssd)得到该块的AC energy,同时把sum和ssd累加到帧级的wp_sum/wp_ssd中

/*
 * Derives the AC energy of a block from a packed sum / sum-of-squares value.
 * sum_ssd: low 32 bits hold the pixel sum, high 32 bits the sum of squares.
 * shift:   right shift applied to sum * sum before it is subtracted.
 * plane:   index of the wp_sum / wp_ssd accumulators in curFrame->m_lowres that this
 *          block's sum and ssd are added to.
 * Returns ssd - ((sum * sum) >> shift), truncated to 32 bits.
 */
inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
{
    const uint32_t pixelSum = static_cast<uint32_t>(sum_ssd);
    const uint32_t squareSum = static_cast<uint32_t>(sum_ssd >> 32);

    // Accumulate the per-plane statistics as a side effect
    curFrame->m_lowres.wp_sum[plane] += pixelSum;
    curFrame->m_lowres.wp_ssd[plane] += squareSum;

    // Square in 64 bits so the product cannot wrap before the shift
    const uint64_t meanTerm = (static_cast<uint64_t>(pixelSum) * pixelSum) >> shift;
    return static_cast<uint32_t>(squareSum - meanTerm);
}

1.4 计算梯度密度(edgeDensityCu)

如果当前的模式为X265_AQ_EDGE模式,则会计算梯度密度。具体来说,先寻找一个块的平均角度,随后在边缘图(m_edgePic)上计算该块的AC energy,作为该块的边缘密度

/*
 * Returns the edge density of the block at (blockX, blockY): the AC energy of that block
 * measured on the frame's edge map (m_edgePic). avgAngle receives the average of the
 * block's per-pixel angles taken from m_thetaPic. qgSize == 8 selects an 8x8 block, any
 * other value a 16x16 block.
 */
uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
{
    const intptr_t srcStride = curFrame->m_fencPic->m_stride;
    const intptr_t blockOffset = blockX + (blockY * srcStride);

    // Both planes carry the same margins as the source picture; skip past them first
    pixel *edgeOrigin = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
    pixel *thetaOrigin = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;

    const bool smallGroup = (qgSize == 8);
    const int lumaPlane = 0; // Sobel filter is applied only on Y component

    // Average angle of the block first, then its AC energy on the edge map
    findAvgAngle(thetaOrigin + blockOffset, srcStride, smallGroup ? qgSize : 16, avgAngle);

    uint32_t density;
    if (smallGroup)
        density = acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(edgeOrigin + blockOffset, srcStride), 6, lumaPlane);
    else
        density = acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(edgeOrigin + blockOffset, srcStride), 8, lumaPlane);

    x265_emms();
    return density;
}

findAvgAngle()的实现如下,通过求一个块中的平均像素角度来估算一个块的角度

//Find the angle of a block by averaging the pixel angles 
inline void findAvgAngle(const pixel* block, intptr_t stride, uint32_t size, uint32_t &angle)
{
    int sum = 0;
    for (uint32_t y = 0; y < size; y++)
    {
        for (uint32_t x = 0; x < size; x++)
        {
            sum += block[x];
        }
        block += stride;
    }
    angle = sum / (size*size);
}

2.宏块树(cuTree)

在x264当中,有mbtree这一项工具,用于提升宏块级编码效率。在x265当中也有类似的技术,叫做cuTree,两者差不多。具体来说,cuTree位于lookahead模块中,通过将lookahead队列中的帧按照从后向前的顺序进行分析,来获得前序帧中CU相对于后序帧中CU的重要程度,这里的后序帧CU会将前序帧CU作为参考CU。在主线程编码流程中,如果前序帧中的CU重要程度比较高,说明应该为其使用较低的QP(即高质量编码),这样后序帧中的CU就能够获得更好的编码效果

/*
	例如,队列中有3个P帧,主线程编码顺序为Pn-1,Pn,Pn+1,如下
	... Pn-1 -> Pn -> Pn+1 ...
	cuTree分析时的顺序为Pn+1,Pn,Pn-1
*/

cuTree的计算流程位于encoder\slicetype.cpp中,由cuTree()实现,主要工作流程为
(1)计算帧intra和inter cost(singleCost)
(2)计算CU的传播cost(estimateCUPropagate)
(3)根据前面计算的传播cost来评估qp调整量(cuTreeFinish)

/*
 * Runs the CU-tree analysis over frames[0..numframes]. Starting at the last non-B frame and
 * walking backwards, it estimates frame costs (singleCost), accumulates propagate costs
 * between each frame and its references (estimateCUPropagate), and finally converts the
 * accumulated cost into QP offsets (cuTreeFinish).
 *
 * frames:    lookahead frame list.
 * numframes: index of the last frame to consider.
 * bIntra:    when true the walk goes down to index 0, otherwise it stops at index 1.
 */
void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
{
    int idx = !bIntra;
    int lastnonb, curnonb = 1;
    int bframes = 0;

    x265_emms();
    double totalDuration = 0.0;
    for (int j = 0; j <= numframes; j++)
        totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;
    // Average frame duration over the numframes + 1 frames
    double averageDuration = totalDuration / (numframes + 1);

    int i = numframes;
    // Walk backwards from the end to the first non-B frame
    while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
        i--;

    lastnonb = i;

    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
     * lookahead=0, so that's what's currently implemented. */
    // lookaheadDepth == 0: for an intra call, zero frames[0]->propagateCost, copy the AQ
    // offsets into qpCuTreeOffset unchanged and return. Otherwise move frames[0]'s
    // propagateCost buffer to frames[lastnonb] and clear frames[0]'s.
    if (!m_param->lookaheadDepth)
    {
        if (bIntra)
        {
            memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
            if (m_param->rc.qgSize == 8)
                memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * 4 * sizeof(double));
            else
                memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
            return;
        }
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
        memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
    }
    else
    {
        // Nothing to analyse when the last non-B frame lies before the start index
        if (lastnonb < idx)
            return;
        memset(frames[lastnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
    }

    CostEstimateGroup estGroup(*this, frames);
    // Walk backwards, propagating cost from each frame into its references
    while (i-- > idx)
    {
        curnonb = i;
        // Find the previous non-B frame; it becomes the current anchor
        while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
            curnonb--;

        if (curnonb < idx)
            break;
        // 1. Estimate the cost of lastnonb when predicted from curnonb
        estGroup.singleCost(curnonb, lastnonb, lastnonb);

        memset(frames[curnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
        bframes = lastnonb - curnonb - 1;
        // B-pyramid with more than one B frame: the middle B frame also acts as a reference
        if (m_param->bBPyramid && bframes > 1) 
        {
            int middle = (bframes + 1) / 2 + curnonb;
            estGroup.singleCost(curnonb, lastnonb, middle);
            memset(frames[middle]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
            while (i > curnonb)
            {
                // Frames after the middle reference (middle, lastnonb); frames before it
                // reference (curnonb, middle)
                int p0 = i > middle ? middle : curnonb;
                int p1 = i < middle ? middle : lastnonb;
                if (i != middle)
                {
                    estGroup.singleCost(p0, p1, i);
                    estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
                }
                i--;
            }

            estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
        }
        else
        {
            // Every frame i strictly between curnonb and lastnonb is a B frame
            while (i > curnonb)
            {
                estGroup.singleCost(curnonb, lastnonb, i);
                estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
                i--;
            }
        }
        // 2. Propagate the cost of lastnonb into its reference curnonb
        estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
        lastnonb = curnonb;
    }

    // lookaheadDepth == 0: propagate into frames[0] as well, then undo the buffer swap
    if (!m_param->lookaheadDepth)
    {
        estGroup.singleCost(0, lastnonb, lastnonb);
        estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
    }
    // 3. Turn the accumulated propagate cost into QP adjustments
    cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
    // With B-pyramid and no VBV buffer, also finish the middle B frame of the last group
    if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
        cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
}

2.1 计算帧损失(singleCost)

singleCost()中调用了estimateFrameCost()计算帧的cost

/*
 * Estimates the cost of frame b with p0 / p1 as its references by forwarding to
 * estimateFrameCost(). The thread-local data slot used is m_tld[m_numWorkers] when the
 * lookahead has a thread pool, otherwise m_tld[0].
 */
int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty)
{
    int tldIndex = 0;
    if (m_lookahead.m_pool)
        tldIndex = m_lookahead.m_pool->m_numWorkers;

    LookaheadTLD& tld = m_lookahead.m_tld[tldIndex];
    return estimateFrameCost(tld, p0, p1, b, intraPenalty);
}

estimateFrameCost()的定义如下

/*
 * Estimate the lookahead cost of one frame.
 *
 *   p0 - index of the forward (list 0) reference frame
 *   p1 - index of the backward (list 1) reference frame
 *   b  - index of the frame being costed
 *
 *   p0 == p1 == b : no reference frames, i.e. an I-frame
 *   p1 == b       : forward reference only, i.e. a P-frame
 *
 * Per the original notes: as an I-frame every block costs its intra cost, as a
 * P-frame min(intra cost, inter cost), as a B-frame its inter cost.
 *
 * Every frame carries a cost matrix costEst[b - p0][p1 - b] holding the cost of
 * frame b when p0 is the forward and p1 the backward reference. A cached entry
 * is reused when present; otherwise the cost is computed and stored there.
 *
 * bIntraPenalty adds a penalty proportional to the counted intra blocks
 * (intraMbs[b - p0]) to the returned score only; the cached value is untouched.
 */
int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty)
{
    Lowres*     fenc  = m_frames[b];
    x265_param* param = m_lookahead.m_param;
    int64_t     score = 0;

    // Reuse the cached cost if both the frame cost and the first row SATD are already populated
    if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
        score = fenc->costEst[b - p0][p1 - b];
    else
    {
        // An x of 0x7FFF in the first stored MV is used here as the "search still needed" marker
        bool bDoSearch[2];
        bDoSearch[0] = fenc->lowresMvs[0][b - p0][0].x == 0x7FFF;			// forward (list 0) search needed?
        bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFF; // backward (list 1) search needed?

#if CHECKED_BUILD
        // Checked builds mark a claimed search with 0x7FFE so a second claim on the same list is detected
        X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0][0].x == 0x7FFE), "motion search batch duplication L0\n");
        X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFE), "motion search batch duplication L1\n");
        if (bDoSearch[0]) fenc->lowresMvs[0][b - p0][0].x = 0x7FFE;
        if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b][0].x = 0x7FFE;
#endif

        fenc->weightedRef[b - p0].isWeighted = false;
        // Run weighted-prediction analysis only when enabled and a forward search will be done
        if (param->bEnableWeightedPred && bDoSearch[0])
            tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);

        // Reset the accumulators that estimateCUCost() / the slice merge below add into
        fenc->costEst[b - p0][p1 - b] = 0;
        fenc->costEstAq[b - p0][p1 - b] = 0;
        // Cooperative path: taken only when this group is not in batch mode, more than one
        // cooperative slice is configured, and the estimate needs bidir work or a motion search
        if (!m_batchMode && m_lookahead.m_numCoopSlices > 1 && ((p1 > b) || bDoSearch[0] || bDoSearch[1]))
        {
            /* Use cooperative mode if a thread pool is available and the cost estimate is
             * going to need motion searches or bidir measurements */
            memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);

            // Publish the job description under the lock before peers are bonded
            m_lock.acquire();
            X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
            m_coop.p0 = p0;
            m_coop.p1 = p1;
            m_coop.b = b;
            m_coop.bDoSearch[0] = bDoSearch[0];
            m_coop.bDoSearch[1] = bDoSearch[1];
            m_jobTotal = m_lookahead.m_numCoopSlices;
            m_jobAcquired = 0;
            m_lock.release();

            tryBondPeers(*m_lookahead.m_pool, m_jobTotal);

            processTasks(-1);

            waitForExit();

            // Merge the per-slice partial results into the frame-level accumulators
            for (int i = 0; i < m_lookahead.m_numCoopSlices; i++)
            {
                fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst;
                fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq;
                if (p1 == b) // no backward reference: also accumulate the intra block count
                    fenc->intraMbs[b - p0] += m_slice[i].intraMbs;
            }
        }
        else
        {
            /* Calculate MVs for 1/16th resolution*/
            bool lastRow;
            if (param->bEnableHME)
            {
                // HME pre-pass over the m_4x4Width x m_4x4Height grid (hme flag = 1), bottom-right to top-left
                lastRow = true;
                for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)
                {
                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
                        estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);
                    lastRow = false;
                }
            }
			/*
				Compute the cost of every CU in reverse raster order, starting at the
				bottom-right corner of the frame. Per the original note, x264's
				slicetype_slice_cost explains this as: MVs are used as predictors in the
				main encode, and the reverse order improves overall MV prediction.
				NOTE(review): the original note also listed further guesses (content tends
				to move right/down, top-left CUs get more neighbour information, empirical
				tuning); those are speculation by the article author and are unverified.
			*/
            lastRow = true;
            for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
            {
                // Row accumulator is cleared here; estimateCUCost() adds each CU's AQ-weighted cost to it
                fenc->rowSatds[b - p0][p1 - b][cuY] = 0;

                for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);

                lastRow = false;
            }
        }

        score = fenc->costEst[b - p0][p1 - b];

        // When a backward reference exists (b != p1) the score is scaled down by 100 / (130 + bFrameBias)
        if (b != p1)
            score = score * 100 / (130 + param->bFrameBias);

        // Cache the (possibly scaled) score
        fenc->costEst[b - p0][p1 - b] = score;
    }

    if (bIntraPenalty)
        // arbitrary penalty for I-blocks after B-frames
        score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8);

    return score;
}

estimateCUCost()的定义如下

/*
 * Estimate the lookahead cost of a single low-resolution CU at (cuX, cuY) of frame b,
 * using p0 as the list 0 reference and p1 as the list 1 reference.
 *
 *   bDoSearch[i] - run motion estimation for list i; otherwise the stored per-CU cost is reused
 *   lastRow      - true while processing the first row visited (the bottom row), so no
 *                  MV candidates are taken from the row below
 *   slice        - < 0: accumulate directly into the frame's costEst/costEstAq/intraMbs;
 *                  otherwise accumulate into m_slice[slice]
 *   hme          - non-zero: operate on the lower-resolution plane and its MV/cost arrays,
 *                  and return right after motion estimation without touching frame costs
 */
void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
{
    Lowres *fref0 = m_frames[p0];
    Lowres *fref1 = m_frames[p1];
    Lowres *fenc  = m_frames[b];

    // The weighted reference is only used for list 0 and never on the hme path
    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;
	// Grid dimensions in CUs: the m_4x4* grid on the hme path, the m_8x8* grid otherwise.
	// The CU size in pixels (cuSize below) is X265_LOWRES_CU_SIZE on both paths.
    const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
    const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
    const int bBidir = (b < p1);
    const int cuXY = cuX + cuY * widthInCU;
    // Index of the block at half the coordinates on a grid of half the width
    // (presumably the lower-resolution grid used by HME - TODO confirm)
    const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
    const int cuSize = X265_LOWRES_CU_SIZE;
    // Pixel offset of this CU; the hme path uses half the luma stride
    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);

    // Set the source block for ME/SATD only when some inter measurement will actually happen
    if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
        tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
    else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);


    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
    int lowresPenalty = 4;
    int listDist[2] = { b - p0, p1 - b};

    MV mvmin, mvmax;
    int bcost = tld.me.COST_MAX;
    // 0 = no list (intra), 1 = list 0, 2 = list 1, 3 = bidir
    int listused = 0;

    // TODO: restrict to slices boundaries
    // establish search bounds that don't cross extended frame boundaries
    mvmin.x = (int32_t)(-cuX * cuSize - 8);
    mvmin.y = (int32_t)(-cuY * cuSize - 8);
    mvmax.x = (int32_t)((widthInCU - cuX - 1) * cuSize + 8);
    mvmax.y = (int32_t)((heightInCU - cuY - 1) * cuSize + 8);

    // List 0 always, list 1 only for bidirectional frames
    for (int i = 0; i < 1 + bBidir; i++)
    {
        int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
        int skipCost = INT_MAX;
		// No search for this list: compare against the stored cost and move on
        if (!bDoSearch[i])
        {
            COPY2_IF_LT(bcost, fencCost, listused, i + 1);
            continue;
        }

        int numc = 0;
        MV mvc[5], mvp;
		// On the hme path use the lower-resolution MV array (lowerResMvs), otherwise lowresMvs
        MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
        ReferencePlanes* fref = i ? fref1 : wfref0;

        /* Reverse-order MV prediction */
#define MVC(mv) mvc[numc++] = mv;
		// Collect MV candidates from already-processed neighbours (right and row below)
        if (cuX < widthInCU - 1)
            MVC(fencMV[1]);	// MV of the block to the right
        if (!lastRow)
        {
            MVC(fencMV[widthInCU]);	// MV of the block below
            if (cuX > 0)
                MVC(fencMV[widthInCU - 1]);	// MV of the block below-left
            if (cuX < widthInCU - 1)
                MVC(fencMV[widthInCU + 1]);	// MV of the block below-right
        }
        // On the non-hme path, also add the doubled lower-resolution MV when that data exists and its cost is positive
        if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
        {
            MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
        }
#undef MVC
		// With no candidate available the predictor is the zero MV
        if (!numc)
            mvp = 0;
        else
        {
            ALIGN_VAR_32(pixel, subpelbuf[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
            int mvpcost = MotionEstimate::COST_MAX;

            /* measure SATD cost of each neighbor MV (estimating merge analysis)
             * and use the lowest cost MV as MVP (estimating AMVP). Since all
             * mvc[] candidates are measured here, none are passed to motionEstimate */
            for (int idx = 0; idx < numc; idx++)
            {
                intptr_t stride = X265_LOWRES_CU_SIZE;
				// Motion-compensate the candidate on the low-resolution reference
                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
				// SATD of the prediction against the source block
                int cost = tld.me.bufSATD(src, stride);
                COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
                /* Except for mv0 case, everyting else is likely to have enough residual to not trigger the skip. */
                if (!mvp.notZero() && bBidir)
                    skipCost = cost;
            }
        }
		// Run motion estimation around the chosen predictor; the range depends on whether HME is enabled
        int searchRange = m_lookahead.m_param->bEnableHME ? (hme ? m_lookahead.m_param->hmeRange[0] : m_lookahead.m_param->hmeRange[1]) : s_merange;
        /* ME will never return a cost larger than the cost @MVP, so we do not
         * have to check that ME cost is more than the estimated merge cost */
        if(!hme)
            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices);
        else
            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, searchRange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
        // For bidir frames, prefer a cheap zero-MV "skip" when it beats the ME result
        if (skipCost < 64 && skipCost < fencCost && bBidir)
        {
            fencCost = skipCost;
            *fencMV = 0;
        }
        COPY2_IF_LT(bcost, fencCost, listused, i + 1);
    }
    // The hme pass only fills the lower-resolution MVs/costs; nothing below applies to it
    if (hme)
        return;

    if (bBidir) /* B, also consider bidir */
    {
        /* NOTE: the wfref0 (weightp) is not used for BIDIR */

        /* avg(l0-mv, l1-mv) candidate */
        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
        int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);
        /* coloc candidate */
        src0 = fref0->lowresPlane[0] + pelOffset;
        src1 = fref1->lowresPlane[0] + pelOffset;
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
        bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);
        bcost += lowresPenalty;
    }
    else /* P, also consider intra */
    {
        bcost += lowresPenalty;

        // Intra wins when strictly cheaper; listused 0 then marks the CU as intra
        if (fenc->intraCost[cuXY] < bcost)
        {
            bcost = fenc->intraCost[cuXY];
            listused = 0;
        }
    }

    /* do not include edge blocks in the frame cost estimates, they are not very accurate */
	// Interior CUs count toward the frame score; very small frames (<= 2 CUs in a dimension) count every CU
    const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
    // AQ-weighted cost: bcost scaled by the 8.8 fixed-point inverse qscale factor, with rounding
    int bcostAq;
    if (m_lookahead.m_param->rc.qgSize == 8)
        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8) : bcost;
    else
        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;

    if (bFrameScoreCU)
    {
        if (slice < 0)
        {
            // Non-cooperative call: accumulate straight into the frame
            fenc->costEst[b - p0][p1 - b] += bcost;
            fenc->costEstAq[b - p0][p1 - b] += bcostAq;
            if (!listused && !bBidir)
                fenc->intraMbs[b - p0]++;
        }
        else
        {
            // Cooperative call: accumulate into this slice's partial sums
            m_slice[slice].costEst += bcost;
            m_slice[slice].costEstAq += bcostAq;
            if (!listused && !bBidir)
                m_slice[slice].intraMbs++;
        }
    }

    // Row SATD includes every CU of the row (edge CUs too)
    fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
    // Pack the cost (clamped to LOWRES_COST_MASK) with the list-used bits above LOWRES_COST_SHIFT
    fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}

2.2 计算传播损失(estimateCUPropagate)

函数的主要功能是根据前面获得的inter与intra cost来计算传播损失,传播损失计算的公式为
$$cost = \left(propagateIn + \left(intraCost \times invQscale \times fpsFactor \gg 8\right)\right) \times \left(1 - \frac{interCost}{intraCost}\right)$$
这个公式表示的意思是:
(1)当前CU的cost,与propagateIn呈正相关(或者说别的CU赋予当前CU的重要程度)。如果别的CU认为当前CU很重要,则当前CU应该被认为是重要的,应该以较高质量编码

(2)当前CU的cost,与intraCost呈正相关。如果intraCost比较大,说明纹理比较复杂,应该以较高质量编码。invqscale(即代码中的invQscaleFactor)表示一个与qp相关的因子,可以理解是一个调控因子

(3)当前CU的cost,与fpsFactor呈正相关。fpsFactor是当前帧持续时长(fpsDenom / fpsNum)与平均帧持续时长(averageDuration)的比值(两者均经过CLIP_DURATION限幅),而不是fps本身。如果当前帧的持续时间相对较长,从人眼视觉来说,当前帧比较重要

(4)当前CU的cost,与interCost和intraCost之间的关系有关,只有当interCost小于intraCost时,传播损失才大于0,并且只有传播损失大于0时才会被使用。如果interCost远小于intraCost,说明视频前后的图像很相似,使用Inter模式带来的损失很小,如果将当前CU以高质量编码,后续CU编码损失会很小。在这种情况下,传播损失会比较大,即当前CU的重要程度很高

PS:如果interCost大于intraCost,说明当前CU直接使用Intra模式效果更好,而cuTree是面向Inter模式的一种技术,这种情况下的传播损失设置为0

// Compute CU-level propagation costs for frame b from the inter and intra costs obtained
// earlier, and add them into the propagateCost arrays of its references p0 and p1.
//   frames          - lowres frame array indexed by p0 / p1 / b
//   averageDuration - average frame duration, used for the fps factor
//   referenced      - non-zero when frame b is itself referenced; when zero its incoming
//                     propagate cost is treated as a single all-zero row
void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
{
	// The lookahead operates on the downsampled low-resolution frames (Lowres)
    uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
	/*
		b  is the current frame
		p0 is the forward reference of b
		p1 is the backward reference of b
	*/
    // Relative temporal position of b between p0 and p1 in 8-bit fixed point (rounded)
    int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
    // Bipred weights sum to 64; equal weights (32/32) unless weighted bipred is enabled
    int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
    int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
    int listDist[2] = { b - p0, p1 - b };

    memset(m_scratch, 0, m_8x8Width * sizeof(int));

    uint16_t *propagateCost = frames[b]->propagateCost;

    x265_emms();
    // Ratio of the nominal frame duration to the average duration, each limited by CLIP_DURATION
    double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);

    /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
    if (!referenced) // unreferenced frame: clear one row; the row pointer is not advanced below, so it is reused for every row
        memset(frames[b]->propagateCost, 0, m_8x8Width * sizeof(uint16_t));

    int32_t strideInCU = m_8x8Width;
    for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++)
    {
        int cuIndex = blocky * strideInCU;
		/*
			Compute the propagate amount for one whole row; the results land in m_scratch.
			Inputs passed to the primitive:
			(1) propagateCost: the cost other frames have propagated into the current frame
			(2) frames[b]->intraCost: intra cost of the current frame
			(3) frames[b]->lowresCosts[b - p0][p1 - b]: packed per-CU cost of b for this
			    reference pair (cost bits plus list-used bits, see lists_used below)
			(4) frames[b]->invQscaleFactor (or invQscaleFactor8x8 when qgSize == 8): inverse
			    qscale factor, a QP-related weight
			(5) fpsFactor: duration-based weight computed above
		*/
        if (m_param->rc.qgSize == 8)
            primitives.propagateCost(m_scratch, propagateCost,
                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                       frames[b]->invQscaleFactor8x8 + cuIndex, &fpsFactor, m_8x8Width);
        else // NOTE(review): the original note says this resolves to x265_mbtree_propagate_cost_avx2 - depends on the build/CPU, confirm
            primitives.propagateCost(m_scratch, propagateCost,
                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                       frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);

        // Advance to the next input row only for referenced frames
        if (referenced)
            propagateCost += m_8x8Width;

        for (uint16_t blockx = 0; blockx < m_8x8Width; blockx++, cuIndex++)
        {
            int32_t propagate_amount = m_scratch[blockx]; // amount this CU passes on to its reference(s)
            /* Don't propagate for an intra block. */
            if (propagate_amount > 0) // only positive amounts are propagated
            {
                /* Access width-2 bitfield. */
                int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
                /* Follow the MVs to the previous frame(s). */
                for (uint16_t list = 0; list < 2; list++)
                {
                    if ((lists_used >> list) & 1)
                    {
#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) // s += x, saturating at 65535
                        int32_t listamount = propagate_amount;
                        /* Apply bipred weighting. */
                        if (lists_used == 3)
                            listamount = (listamount * bipredWeights[list] + 32) >> 6;

                        MV *mvs = frames[b]->lowresMvs[list][listDist[list]];

                        /* Early termination for simple case of mv0. */
                        if (!mvs[cuIndex].word) // zero MV: the whole amount goes to the co-located CU
                        {
                            CLIP_ADD(refCosts[list][cuIndex], listamount);
                            continue;
                        }
						/*
							With a non-zero MV the referenced area generally straddles up to
							four CUs of the reference frame, so the amount is split between them:

							+ ---- + ---- +
							|   0  |   1  |
							+ ---- + ---- +
							|   2  |   3  |
							+ ---- + ---- +

							Per the original note, MVs are stored in quarter-pel units, so the
							shift by 5 below is
							(1) >> 2 to convert quarter-pel to full-pel, then
							(2) >> 3 to convert full-pel to 8x8-block units.
							The low 5 bits are the fractional position inside the block and
							drive the bilinear weights (which sum to 32 * 32 = 1024).
						*/
                        int32_t x = mvs[cuIndex].x;
                        int32_t y = mvs[cuIndex].y;
                        int32_t cux = (x >> 5) + blockx;		// target column in 8x8-block units
                        int32_t cuy = (y >> 5) + blocky;		// target row in 8x8-block units
                        int32_t idx0 = cux + cuy * strideInCU;	// index of block 0
                        int32_t idx1 = idx0 + 1;				// index of block 1
                        int32_t idx2 = idx0 + strideInCU;		// index of block 2
                        int32_t idx3 = idx0 + strideInCU + 1;	// index of block 3
                        x &= 31;
                        y &= 31;
                        int32_t idx0weight = (32 - y) * (32 - x);	// weight of block 0
                        int32_t idx1weight = (32 - y) * x;			// weight of block 1
                        int32_t idx2weight = y * (32 - x);			// weight of block 2
                        int32_t idx3weight = y * x;					// weight of block 3

                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
                         * be counted. */
						/*
							Bounds handling for the four target blocks:
							(1) all four inside the frame: add every weighted share
							(2) otherwise: add only the shares whose block lies inside the frame
						*/
                        if (cux < m_8x8Width - 1 && cuy < m_8x8Height - 1 && cux >= 0 && cuy >= 0) // all four blocks are inside the frame
                        {
                            CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
                        }
                        else /* Check offsets individually */
                        {
							// Some of the four blocks may fall outside the frame
                            if (cux < m_8x8Width && cuy < m_8x8Height && cux >= 0 && cuy >= 0)					// block 0 is inside
                                CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy < m_8x8Height && cux + 1 >= 0 && cuy >= 0)			// block 1 is inside
                                CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            if (cux < m_8x8Width && cuy + 1 < m_8x8Height && cux >= 0 && cuy + 1 >= 0)			// block 2 is inside
                                CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy + 1 < m_8x8Height && cux + 1 >= 0 && cuy + 1 >= 0)	// block 3 is inside
                                CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);	
                        }
                    }
                }
            }
        }
    }

    // With VBV and lookahead enabled, finalize the QP offsets of referenced frames right away
    if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
        cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
}

2.3 计算qp调整量(cuTreeFinish)

函数的主要功能是基于前面已经获取的传播损失,来计算当前CU的qp调整量

/*
 * Turn the accumulated propagate costs of `frame` into per-block cuTree QP offsets.
 *
 * For every block with a non-zero (AQ-weighted) intra cost:
 *     log2_ratio     = log2(intracost + propagateCost) - log2(intracost) + weightdelta
 *     qpCuTreeOffset = qpAqOffset - m_cuTreeStrength * log2_ratio
 * so the larger the propagate cost is relative to the intra cost, the lower the
 * resulting offset. Blocks whose intra cost is zero keep their previous offset.
 *
 * When hevcAq is enabled the work is delegated to computeCUTreeQpOffset().
 * NOTE(review): per the article, m_cuTreeStrength is set elsewhere as
 * (hevcAq ? 6.0 : 5.0) * (1.0 - qCompress) - not visible here, confirm.
 */
void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
{
    if (m_param->rc.hevcAq)
    {
        computeCUTreeQpOffset(frame, averageDuration, ref0Distance);
        return;
    }

    /* Average duration relative to the nominal frame duration (both clipped), scaled by 256 */
    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);

    /* Extra term applied only when a positive weighted cost delta exists for this reference distance */
    double weightdelta = 0.0;
    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);

    if (m_param->rc.qgSize != 8)
    {
        /* One offset per lookahead CU */
        for (int cu = 0; cu < m_cuCount; cu++)
        {
            int intracost = (frame->intraCost[cu] * frame->invQscaleFactor[cu] + 128) >> 8;
            if (!intracost)
                continue;

            int propagateCost = (frame->propagateCost[cu] * fpsFactor + 128) >> 8;
            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
            frame->qpCuTreeOffset[cu] = frame->qpAqOffset[cu] - m_cuTreeStrength * log2_ratio;
        }
        return;
    }

    /* qgSize == 8: each lookahead CU maps onto a 2x2 group of offset entries */
    for (int row = 0; row < m_8x8Height; row++)
    {
        for (int col = 0; col < m_8x8Width; col++)
        {
            const int cuXY = col + row * m_8x8Width;
            int intracost = ((frame->intraCost[cuXY]) / 4 * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
            if (!intracost)
                continue;

            int propagateCost = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;

            /* Top-left entry of the 2x2 group, and the entry directly below it */
            const int upper = col * 2 + row * m_8x8Width * 4;
            const int lower = col * 2 + row * m_8x8Width * 4 + frame->maxBlocksInRowFullRes;

            frame->qpCuTreeOffset[upper] = frame->qpAqOffset[upper] - m_cuTreeStrength * (log2_ratio);
            frame->qpCuTreeOffset[upper + 1] = frame->qpAqOffset[upper + 1] - m_cuTreeStrength * (log2_ratio);
            frame->qpCuTreeOffset[lower] = frame->qpAqOffset[lower] - m_cuTreeStrength * (log2_ratio);
            frame->qpCuTreeOffset[lower + 1] = frame->qpAqOffset[lower + 1] - m_cuTreeStrength * (log2_ratio);
        }
    }
}

3.qpOffset的使用

通过前面的AQ和cuTree获得了qpOffset等信息,在实际编码过程中会被使用到,粗略来说,可能有几种用法:
(1)直接使用qp进行CU级调整(例如calculateQpforCuSize)
(2)调整行级平均qp(与bOptCUDeltaQP相关)
(3)调整lowres的帧级cost,影响码控(与VBV相关,例如getEstimatedPictureCost)

在这几种使用中,简单记录一下calculateQpforCuSize()

int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
{
    FrameData& curEncData = *m_frame->m_encData;
    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;

    if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
    {
        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
            && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
            qp += distortionData->offset[ctu.m_cuAddr];
    }
	// analysisLoadReuseLevel默认为0
    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
    {
        int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
        if (ctu.m_slice->m_sliceType == I_SLICE)
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
        else
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
    }
	// 是否使用hevcAq
    if (m_param->rc.hevcAq)
    {
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double dQpOffset = 0;
        if (bCuTreeOffset)
        {
            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
        }
        else
        {
            dQpOffset = aqQPOffset(ctu, cuGeom);
            if (complexCheck)
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
        qp += dQpOffset;
    }
    else
    {
        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
        if (qpoffs)
        {
            uint32_t width = m_frame->m_fencPic->m_picWidth;
            uint32_t height = m_frame->m_fencPic->m_picHeight;
            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
            double dQpOffset = 0;
            uint32_t cnt = 0;
            // 遍历16x16小块,从中取出原先计算好的qpOffset
            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
            {
                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
                {
                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                    dQpOffset += qpoffs[idx];
                    cnt++;
                }
            }
            dQpOffset /= cnt;
            qp += dQpOffset; // 进行qp的调整
			// complexCheck默认为 -1
            if (complexCheck)	
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
    }
	// 对qp进行clip,防止溢出
    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
}

4.AQ技术和cuTree技术之间的关联

AQ技术和cuTree技术都是CU级别的码率控制技术,其中AQ技术主要思想是基于帧内图像的空域相关性来调整qp,cuTree技术主要思想是基于帧间图像的时域相关性来调整qp。在实际编码过程中,通常是先计算AQ再计算cuTree,两者之间的影响关系如下所示,位于encoder\encoder.cpp中

/*
	Excerpt from Encoder::configure() (encoder.cpp): reconciling AQ and cuTree settings.
	(1) cuTree on but AQ off: force aqMode to X265_AQ_VARIANCE with aqStrength 0
	(2) aqStrength is 0 and cuTree off: disable AQ entirely (aqMode = X265_AQ_NONE, hevcAq = 0)
	(3) both AQ and cuTree off: reset aqStrength to 0
	The checks run in this order, so a later check sees the values set by an earlier one.
*/
if (p->rc.cuTree && p->rc.aqMode == X265_AQ_NONE)
{
    p->rc.aqStrength = 0.0;
    p->rc.aqMode = X265_AQ_VARIANCE;
}

if (!p->rc.cuTree && p->rc.aqStrength == 0)
{
    p->rc.hevcAq = 0;
    p->rc.aqMode = X265_AQ_NONE;
}

if (!p->rc.cuTree && p->rc.aqMode == X265_AQ_NONE)
{
    p->rc.aqStrength = 0;
}

5.hevcAq模式

这个模式是后续提出来的,单独面向265标准的一种qp计算模式,默认不会启用。这种模式的主要思想是考虑一帧之内的纹理复杂度(用方差描述),利用单个块和整帧平均值来计算qpOffset。在这个模式下,复杂度的计算使用xPreanalyze()实现,qpOffset的计算使用computeCUTreeQpOffset()实现

5.1 复杂度的计算(xPreanalyze)

xPreanalyze()函数被calcAdaptiveQuantFrame()调用,简单来说就是按照不同的粒度对当前帧进行分析,分析的依据是方差,根据方差来对CU进行qp的调整

/*
 * hevcAq pre-analysis of one frame.
 * Step 1: for every enabled AQ layer, measure the luma activity of each AQ partition
 *         (1 + the minimum variance of its four quadrants) and the layer's average activity.
 * Step 2: call xPreanalyzeQp() to turn activities into QP offsets, then fill
 *         invQscaleFactor from the offsets of the layer selected by minAQDepth.
 */
void LookaheadTLD::xPreanalyze(Frame* curFrame)
{
    const uint32_t width = curFrame->m_fencPic->m_picWidth;
    const uint32_t height = curFrame->m_fencPic->m_picHeight;
    const intptr_t stride = curFrame->m_fencPic->m_stride;
	// 1. Compute the variance (texture complexity) at each granularity
    for (uint32_t d = 0; d < 4; d++)
    {
		// e.g. maxCUSize 64 gives ctuSizeIdx 0 (assuming g_log2Size[64] == 6 - TODO confirm)
        int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
		// e.g. maxCUSize 64 with qgSize 32 gives aqDepth 1 (same assumption about g_log2Size)
        int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];
		/*
			How aqLayerDepth[ctuSizeIdx][aqDepth][d] is read (table quoted from the
			original note; it is defined outside this function):
			(1) definition
			static const uint32_t aqLayerDepth[3][4][4] = 
			{
				{  // ctu size 64
					{ 1, 0, 1, 0 },	aqDepth = 0
					{ 1, 1, 1, 0 }, aqDepth = 1
					{ 1, 1, 1, 0 }, aqDepth = 2
					{ 1, 1, 1, 1 }	aqDepth = 3
				},
				{  // ctu size 32
					{ 1, 1, 0, 0 },
					{ 1, 1, 0, 0 },
					{ 1, 1, 1, 0 },
					{ 0, 0, 0, 0 },
				},
				{  // ctu size 16
					{ 1, 0, 0, 0 },
					{ 1, 1, 0, 0 },
					{ 0, 0, 0, 0 },
					{ 0, 0, 0, 0 }
				}
			};
			(2) example: aqLayerDepth[0][1][0] = 1 means
				(a) ctuSizeIdx = 0, i.e. maxCUSize is 64
				(b) aqDepth = 1, i.e. qgSize is 32
				(c) d = 0, i.e. the analysis at granularity level 0 is enabled

				aqLayerDepth[0][1] = {1, 1, 1, 0} enables levels 0, 1 and 2; each level
					corresponds to a different analysis block size
		*/
        if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
            continue;
		
        const pixel* src = curFrame->m_fencPic->m_picOrg[0];; // plane 0 is luma
        PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[d]; // layer for the current granularity
        const uint32_t aqPartWidth = pQPLayer->aqPartWidth;		// partition width of this layer (per the original note: 64, 32 or 16)
        const uint32_t aqPartHeight = pQPLayer->aqPartHeight;	// partition height of this layer (per the original note: 64, 32 or 16)
        double* pcAQU = pQPLayer->dActivity;

        double dSumAct = 0.0;
		// Visit every partition of the frame at this granularity
        for (uint32_t y = 0; y < height; y += aqPartHeight)
        {
            // Partitions on the bottom/right picture edge are cropped to the picture size
            const uint32_t currAQPartHeight = X265_MIN(aqPartHeight, height - y);
            for (uint32_t x = 0; x < width; x += aqPartWidth, pcAQU++)
            {
                const uint32_t currAQPartWidth = X265_MIN(aqPartWidth, width - x);
                const pixel* pBlkY = &src[x];
                uint64_t sum[4] = { 0, 0, 0, 0 };
                uint64_t sumSq[4] = { 0, 0, 0, 0 };
                uint32_t by = 0;
				/*
					The four indices of sum[] / sumSq[] are the four quadrants:
					+---+---+
					| 0 | 1 |
					+---+---+
					| 2 | 3 |
					+---+---+
				*/
                // Upper half of the partition: quadrants 0 and 1
                for (; by < currAQPartHeight >> 1; by++)
                {
                    uint32_t bx = 0;
                    for (; bx < currAQPartWidth >> 1; bx++)
                    {
                        sum[0] += pBlkY[bx];
                        sumSq[0] += pBlkY[bx] * pBlkY[bx];
                    }
                    for (; bx < currAQPartWidth; bx++)
                    {
                        sum[1] += pBlkY[bx];
                        sumSq[1] += pBlkY[bx] * pBlkY[bx];
                    }
                    pBlkY += stride;
                }
                // Lower half of the partition: quadrants 2 and 3
                for (; by < currAQPartHeight; by++)
                {
                    uint32_t bx = 0;
                    for (; bx < currAQPartWidth >> 1; bx++)
                    {
                        sum[2] += pBlkY[bx];
                        sumSq[2] += pBlkY[bx] * pBlkY[bx];
                    }
                    for (; bx < currAQPartWidth; bx++)
                    {
                        sum[3] += pBlkY[bx];
                        sumSq[3] += pBlkY[bx] * pBlkY[bx];
                    }
                    pBlkY += stride;
                }

                // The quadrant split assumes even partition dimensions
                assert((currAQPartWidth & 1) == 0);
                assert((currAQPartHeight & 1) == 0);
                const uint32_t pixelWidthOfQuadrants = currAQPartWidth >> 1;
                const uint32_t pixelHeightOfQuadrants = currAQPartHeight >> 1;
				// Number of pixels in each quadrant
                const uint32_t numPixInAQPart = pixelWidthOfQuadrants * pixelHeightOfQuadrants;

                double dMinVar = MAX_DOUBLE;
				// Mean and variance of each quadrant; keep the smallest variance
                if (numPixInAQPart != 0)
                {
                    for (int i = 0; i < 4; i++)
                    {
                        const double dAverage = double(sum[i]) / numPixInAQPart;
                        const double dVariance = double(sumSq[i]) / numPixInAQPart - dAverage * dAverage;
                        dMinVar = X265_MIN(dMinVar, dVariance);
                    }
                }
                else
                {
                    dMinVar = 0.0;
                }
                // Activity is offset by 1 so it is never zero
                double dActivity = 1.0 + dMinVar;
				// Store the partition's activity
                *pcAQU = dActivity;	// per-partition activity
                dSumAct += dActivity;
            }
            src += stride * currAQPartHeight;
        }
		// Average activity of this layer
        const double dAvgAct = dSumAct / (pQPLayer->numAQPartInWidth * pQPLayer->numAQPartInHeight);
        pQPLayer->dAvgActivity = dAvgAct;
    }
	// 2. Derive the QP offsets for each granularity
    xPreanalyzeQp(curFrame);
	// Layer index read from the first layer's minAQDepth (per the original note: the finest granularity)
    int minAQDepth = curFrame->m_lowres.pAQLayer->minAQDepth;

    PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[minAQDepth];
    const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
    const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
    double* pcQP = pQPLayer->dQpOffset;

    // Use new qp offset values for qpAqOffset, qpCuTreeOffset and invQscaleFactor buffer
	// invQscaleFactor is filled from the selected layer's QP offsets via x265_exp2fix8().
	// NOTE(review): this loop does not write qpAqOffset or qpCuTreeOffset itself, despite
	// the comment above; confirm where they are populated.
    int blockXY = 0;
    for (uint32_t y = 0; y < height; y += aqPartHeight)
    {
        for (uint32_t x = 0; x < width; x += aqPartWidth, pcQP++)
        {
            curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(*pcQP);
            blockXY++;

            // Return value is discarded; presumably called for its side effects - TODO confirm
            acEnergyCu(curFrame, x, y, curFrame->m_param->internalCsp, curFrame->m_param->rc.qgSize);
        }
    }
}

xPreanalyzeQp()的定义如下

/*
 * Derive a QP offset for every AQ partition of every enabled AQ depth layer.
 *
 * For each of the (up to four) depth layers that aqLayerDepth enables for the
 * configured maxCUSize / qgSize, the activity of each partition (dActivity) is
 * compared with the layer-wide average (dAvgActivity) and converted into an
 * offset expressed in QP units. The same value is stored in both dQpOffset
 * and dCuTreeOffset of that layer.
 *
 * curFrame: frame whose m_lowres.pAQLayer[] entries are read (dActivity,
 *           dAvgActivity) and written (dQpOffset, dCuTreeOffset). Its picture
 *           dimensions and encoder parameters drive the iteration. The
 *           activity fields must already be filled in before this is called.
 */
void LookaheadTLD::xPreanalyzeQp(Frame* curFrame)
{
    const uint32_t width = curFrame->m_fencPic->m_picWidth;
    const uint32_t height = curFrame->m_fencPic->m_picHeight;

    /* These depend only on encoder parameters, so compute them once instead
     * of once per layer / once per partition. */
    const int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
    const int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];

    /* 2^(range / 6). Because the offset below is 6 * log2(dNormAct), a
     * normalized activity of dMaxQScale maps to an offset of exactly
     * +qpAdaptationRange, and 1 / dMaxQScale maps to -qpAdaptationRange. */
    const double dMaxQScale = pow(2.0, curFrame->m_param->rc.qpAdaptationRange / 6.0);

    for (uint32_t d = 0; d < 4; d++)
    {
        /* Skip depth layers that are not enabled for this CTU / QG size combination */
        if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
            continue;

        PicQPAdaptationLayer* pcAQLayer = &curFrame->m_lowres.pAQLayer[d];
        const uint32_t aqPartWidth = pcAQLayer->aqPartWidth;
        const uint32_t aqPartHeight = pcAQLayer->aqPartHeight;
        double* pcAQU = pcAQLayer->dActivity;
        double* pcQP = pcAQLayer->dQpOffset;
        double* pcCuTree = pcAQLayer->dCuTreeOffset;

        /* Average activity of the whole layer; constant across its partitions */
        const double dAvgAct = pcAQLayer->dAvgActivity;

        /* Walk the partitions of this layer in raster order; the three
         * pointers advance together, one element per partition. */
        for (uint32_t y = 0; y < height; y += aqPartHeight)
        {
            for (uint32_t x = 0; x < width; x += aqPartWidth, pcAQU++, pcQP++, pcCuTree++)
            {
                /* Activity of the current partition */
                const double dCUAct = *pcAQU;

                /*
                 * Normalized activity. For positive activities it lies strictly
                 * between 1 / dMaxQScale and dMaxQScale, and (when dMaxQScale > 1)
                 * exceeds 1 exactly when the partition is more active than the
                 * layer average.
                 *
                 * Example with qpAdaptationRange == 1.0, dMaxQScale = 2^(1/6) ~= 1.1225:
                 *   dCUAct = 10, dAvgAct = 5  -> dNormAct ~= 1.039, offset ~= +0.33
                 *   dCUAct = 5,  dAvgAct = 10 -> dNormAct ~= 0.962, offset ~= -0.33
                 *
                 * NOTE(review): whether a positive offset ultimately raises or
                 * lowers the block's QP is decided by the consumers of
                 * dQpOffset / dCuTreeOffset, not visible here - confirm there.
                 */
                const double dNormAct = (dMaxQScale * dCUAct + dAvgAct) / (dCUAct + dMaxQScale * dAvgAct);
                const double dQpOffset = (X265_LOG2(dNormAct) / X265_LOG2(2.0)) * 6.0;

                /* The CU-tree offset starts out equal to the AQ offset */
                *pcQP = dQpOffset;
                *pcCuTree = dQpOffset;
            }
        }
    }
}

5.2 qpOffset的计算(computeCUTreeQpOffset)

computeCUTreeQpOffset()被cuTreeFinish()调用,实现了hevcAq模式下的qpOffset的计算,其定义如下

/*
 * Refine the per-partition CU-tree QP offsets of every enabled AQ depth layer.
 * (Per the accompanying article text this is the hevcAq-mode path, invoked
 * from cuTreeFinish().)
 *
 * For each AQ partition, the lookahead blocks it covers contribute
 *   log2(intraCost + propagateCost) - log2(intraCost) + weightdelta
 * and the mean of those terms, scaled by m_cuTreeStrength, is subtracted from
 * the partition's dQpOffset; the result is stored in dCuTreeOffset.
 *
 * frame:           supplies the cost arrays, invQscaleFactor, the full-resolution
 *                  size and the AQ layers that are updated.
 * averageDuration: clipped and divided by the clipped nominal frame duration
 *                  (fpsDenom / fpsNum) to build an 8-bit fixed-point factor
 *                  that scales propagateCost.
 * ref0Distance:    when non-zero, selects weightedCostDelta[ref0Distance - 1];
 *                  a positive entry adds (1 - entry) to every block's term.
 */
void Lookahead::computeCUTreeQpOffset(Lowres *frame, double averageDuration, int ref0Distance)
{
    const int fpsFactor = static_cast<int>(CLIP_DURATION(averageDuration) / CLIP_DURATION(static_cast<double>(m_param->fpsDenom) / m_param->fpsNum) * 256);

    /* Step, in full-resolution pixels, between consecutive cost-array blocks */
    const uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

    const double weightdelta =
        (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
            ? (1.0 - frame->weightedCostDelta[ref0Distance - 1])
            : 0.0;

    const uint32_t widthFullRes = frame->widthFullRes;
    const uint32_t heightFullRes = frame->heightFullRes;

    if (m_param->rc.qgSize == 8)
    {
        // ... (branch elided in this listing)
    }
    else
    {
        /* Indices into the layer-enable table; identical for every layer */
        const int ctuSizeIdx = 6 - g_log2Size[m_param->maxCUSize];
        const int aqDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
        const uint32_t maxCols = frame->maxBlocksInRow;

        /* Visit every AQ depth layer */
        for (uint32_t d = 0; d < 4; d++)
        {
            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
                continue;

            PicQPAdaptationLayer* pQPLayer = &frame->pAQLayer[d];
            const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
            const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
            const uint32_t numAQPartInWidth = pQPLayer->numAQPartInWidth;
            const uint32_t numAQPartInHeight = pQPLayer->numAQPartInHeight;

            const double* aqOffsets = pQPLayer->dQpOffset;
            double* cuTreeOffsets = pQPLayer->dCuTreeOffset;

            /* Raster index of the current partition within the layer */
            uint32_t partIdx = 0;

            for (uint32_t partRow = 0; partRow < numAQPartInHeight; partRow++)
            {
                /* Vertical pixel span of this partition row, clipped to the picture */
                const uint32_t top = partRow * aqPartHeight;
                const uint32_t bottom = (top + aqPartHeight < heightFullRes) ? top + aqPartHeight : heightFullRes;

                for (uint32_t partCol = 0; partCol < numAQPartInWidth; partCol++, partIdx++)
                {
                    /* Horizontal pixel span of this partition, clipped to the picture */
                    const uint32_t left = partCol * aqPartWidth;
                    const uint32_t right = (left + aqPartWidth < widthFullRes) ? left + aqPartWidth : widthFullRes;

                    uint32_t blockXY = 0;   /* number of cost blocks accumulated */
                    double log2_ratio = 0;  /* sum of the per-block log2 terms */

                    for (uint32_t py = top; py < bottom; py += loopIncr)
                    {
                        const uint32_t rowBase = (py / loopIncr) * maxCols;

                        for (uint32_t px = left; px < right; px += loopIncr)
                        {
                            const uint32_t idx = rowBase + (px / loopIncr);

                            /* Both costs are scaled by 8-bit fixed-point factors with rounding.
                             * NOTE(review): intraCost == 0 is not guarded before the log2 -
                             * confirm the cost arrays guarantee a positive value. */
                            int intraCost = (frame->intraCost[idx] * frame->invQscaleFactor[idx] + 128) >> 8;
                            int propagateCost = (frame->propagateCost[idx] * fpsFactor + 128) >> 8;

                            log2_ratio += (X265_LOG2(intraCost + propagateCost) - X265_LOG2(intraCost) + weightdelta);

                            blockXY++;
                        }
                    }

                    /* Mean log2 ratio over the covered blocks, scaled by the CU-tree strength.
                     * NOTE(review): blockXY is assumed to be non-zero, i.e. every partition
                     * overlaps the picture - confirm against the layer setup. */
                    const double qp_offset = (m_cuTreeStrength * log2_ratio) / blockXY;

                    cuTreeOffsets[partIdx] = aqOffsets[partIdx] - qp_offset;
                }
            }
        }
    }
}

http://www.kler.cn/a/302472.html

相关文章:

  • 深入解析人工智能中的协同过滤算法及其在推荐系统中的应用与优化
  • day 21
  • 京华春梦,守岁这方烟火人间
  • 2025寒假备战蓝桥杯01---朴素二分查找的学习
  • PyTorch使用教程(9)-使用profiler进行模型性能分析
  • 在C#中添加I/O延时和持续时间
  • k8s环境搭建(续)
  • HarmonyOS开发实战( Beta5.0)使用ArkUI的FrameNode扩展实现动态布局类框架详解
  • 【VuePress 个人博客搭建】
  • 一文彻底搞懂数字孪生、仿真与虚拟调试
  • 基于SpringBoot的古城墙景区管理系统
  • OGG几何内核算法研究-大型装配模型快速载入研究
  • LVS--负载均衡调度器
  • Vue Router 中,meta 对象。
  • QGIS 如何连接空间库,并实时编辑空间表?编辑后库表如何刷新,保证是最新数据?
  • 【动态规划】(一)动态规划理论及基础题目
  • JavaScript web API part2
  • 网站如何防范BOT流量?
  • Python计算机视觉 第8章-图像内容分类
  • Vue3中Pinia存储和修改数据应用实践
  • oracle事务隔离级别
  • 大三大四
  • 浅谈电动汽车充电桩绝缘智能化自检装置的设计与应用
  • Qt QSerialPort数据发送和接收DataComm
  • GIS开发从0到1|MapboxGL可视化项目实战教程(含步骤说明和代码展示)2
  • Spring Boot整合Velocity 模板引擎