
【WebRTC】A Brief Analysis of the Video Capture Module Flow

Contents

  • 1. Getting video frames from the camera (CaptureInputPin::Receive())
  • 2. Processing frames from the camera (CaptureSinkFilter::ProcessCapturedFrame())
  • 3. Processing Windows-layer frame info and routing it to the frame handlers (VideoCaptureImpl::IncomingFrame())
  • 4. The frame handlers (VideoCaptureImpl::DeliverCapturedFrame && DeliverRawFrame)
  • Example: the callback used by the encoder

This article walks through the main flow of capturing video from a camera on the Windows platform, where WebRTC uses the DirectShow framework.

1. Getting video frames from the camera (CaptureInputPin::Receive())

Video frames from the underlying camera arrive in CaptureInputPin::Receive(), defined in modules/video_capture/windows/sink_filter_ds.cc. It hands each captured frame to the layer above by calling ProcessCapturedFrame().

COM_DECLSPEC_NOTHROW STDMETHODIMP
CaptureInputPin::Receive(IMediaSample* media_sample) {
  // Check that this code is running on the capture thread.
  RTC_DCHECK_RUN_ON(&capture_checker_);

  CaptureSinkFilter* const filter = static_cast<CaptureSinkFilter*>(Filter());
  // Check whether a flush is in progress.
  if (flushing_.load(std::memory_order_relaxed))
    return S_FALSE;
  // Check whether a runtime error has occurred.
  if (runtime_error_.load(std::memory_order_relaxed))
    return VFW_E_RUNTIME_ERROR;
  // No capture thread ID yet; fetch it now.
  if (!capture_thread_id_) {
    // Make sure we set the thread name only once.
    // Get the current thread ID.
    capture_thread_id_ = GetCurrentThreadId();
    // Set the current thread name.
    rtc::SetCurrentThreadName("webrtc_video_capture");
  }

  AM_SAMPLE2_PROPERTIES sample_props = {};
  // Fetch the sample properties.
  GetSampleProperties(media_sample, &sample_props);
  // Has the format changed in this sample?
  if (sample_props.dwSampleFlags & AM_SAMPLE_TYPECHANGED) {
    // Check the derived class accepts the new format.
    // This shouldn't fail as the source must call QueryAccept first.

    // Note: This will modify resulting_capability_.
    // That should be OK as long as resulting_capability_ is only modified
    // on this thread while it is running (filter is not stopped), and only
    // modified on the main thread when the filter is stopped (i.e. this thread
    // is not running).

    // Check whether translating the media type into a VideoCaptureCapability
    // fails.
    if (!TranslateMediaTypeToVideoCaptureCapability(sample_props.pMediaType,
                                                    &resulting_capability_)) {
      // Raise a runtime error if we fail the media type
      runtime_error_ = true;
      EndOfStream();
      Filter()->NotifyEvent(EC_ERRORABORT, VFW_E_TYPE_NOT_ACCEPTED, 0);
      return VFW_E_INVALIDMEDIATYPE;
    }
  }
  // Process the captured frame.
  filter->ProcessCapturedFrame(sample_props.pbBuffer, sample_props.lActual,
                               resulting_capability_);

  return S_OK;
}

GetSampleProperties() is implemented as follows:

void GetSampleProperties(IMediaSample* sample, AM_SAMPLE2_PROPERTIES* props) {
  rtc::scoped_refptr<IMediaSample2> sample2;
  // If the sample exposes the IMediaSample2 interface, fetch all the
  // properties in one call.
  if (SUCCEEDED(GetComInterface(sample, &sample2))) {
    sample2->GetProperties(sizeof(*props), reinterpret_cast<BYTE*>(props));
    return;
  }

  //  Get the properties the hard way.
  // Record the size of props.
  props->cbData = sizeof(*props);
  // Type-specific flags or options associated with the media type.
  props->dwTypeSpecificFlags = 0;
  // Identifies the stream. AM_STREAM_MEDIA is a predefined value meaning
  // this is a media stream.
  props->dwStreamId = AM_STREAM_MEDIA;
  // Flags associated with the media sample. Set to 0 here, meaning no sample
  // flags (e.g. keyframe, sync point) are set yet.
  props->dwSampleFlags = 0;
  // Check for a discontinuity in the stream (e.g. a scene cut).
  if (sample->IsDiscontinuity() == S_OK)
    props->dwSampleFlags |= AM_SAMPLE_DATADISCONTINUITY;
  // Check whether this sample is a preroll sample, i.e. one used to
  // synchronize audio/video before real playback. S_OK means it is.
  if (sample->IsPreroll() == S_OK)
    props->dwSampleFlags |= AM_SAMPLE_PREROLL;
  // Check whether this sample is a sync point, i.e. a point playback can use
  // to synchronize audio/video. S_OK means it is.
  if (sample->IsSyncPoint() == S_OK)
    props->dwSampleFlags |= AM_SAMPLE_SPLICEPOINT;
  // Try to fetch the sample's start and stop times.
  if (SUCCEEDED(sample->GetTime(&props->tStart, &props->tStop)))
    props->dwSampleFlags |= AM_SAMPLE_TIMEVALID | AM_SAMPLE_STOPVALID;
  // Fetch the media type.
  if (sample->GetMediaType(&props->pMediaType) == S_OK)
    props->dwSampleFlags |= AM_SAMPLE_TYPECHANGED;
  // Fetch a pointer to the sample's data buffer.
  sample->GetPointer(&props->pbBuffer);
  // Fetch the actual length of the sample data.
  props->lActual = sample->GetActualDataLength();
  // Fetch the size of the sample buffer.
  props->cbBuffer = sample->GetSize();
}

TranslateMediaTypeToVideoCaptureCapability() is implemented as follows:

// Returns true if the media type is supported, false otherwise.
// For supported types, the `capability` will be populated accordingly.
bool TranslateMediaTypeToVideoCaptureCapability(
    const AM_MEDIA_TYPE* media_type,
    VideoCaptureCapability* capability) {
  // capability must not be null.
  RTC_DCHECK(capability);
  if (!media_type || media_type->majortype != MEDIATYPE_Video ||
      !media_type->pbFormat) {
    return false;
  }

  const BITMAPINFOHEADER* bih = nullptr;
  /*
    1. FORMAT_VideoInfo corresponds to the VIDEOINFOHEADER structure.
       It targets non-interlaced video streams and lacks advanced features
       such as interlacing support or picture aspect-ratio information.

    2. FORMAT_VideoInfo2 corresponds to the VIDEOINFOHEADER2 structure, an
       extension of VIDEOINFOHEADER. It supports interlaced streams and
       picture aspect ratio, allowing finer control over how the stream is
       played back and processed.
  */
  if (media_type->formattype == FORMAT_VideoInfo) {
    bih = &reinterpret_cast<VIDEOINFOHEADER*>(media_type->pbFormat)->bmiHeader;
  } else if (media_type->formattype == FORMAT_VideoInfo2) {
    bih = &reinterpret_cast<VIDEOINFOHEADER2*>(media_type->pbFormat)->bmiHeader;
  } else {
    return false;
  }

  RTC_LOG(LS_INFO) << "TranslateMediaTypeToVideoCaptureCapability width:"
                   << bih->biWidth << " height:" << bih->biHeight
                   << " Compression:0x" << rtc::ToHex(bih->biCompression);

  const GUID& sub_type = media_type->subtype;
  // Check the specific sub_type format.
  if (sub_type == MEDIASUBTYPE_MJPG &&
      bih->biCompression == MAKEFOURCC('M', 'J', 'P', 'G')) {
    capability->videoType = VideoType::kMJPEG;
  } else if (sub_type == MEDIASUBTYPE_I420 &&
             bih->biCompression == MAKEFOURCC('I', '4', '2', '0')) {
    capability->videoType = VideoType::kI420;
  } else if (sub_type == MEDIASUBTYPE_YUY2 &&
             bih->biCompression == MAKEFOURCC('Y', 'U', 'Y', '2')) {
    capability->videoType = VideoType::kYUY2;
  } else if (sub_type == MEDIASUBTYPE_UYVY &&
             bih->biCompression == MAKEFOURCC('U', 'Y', 'V', 'Y')) {
    capability->videoType = VideoType::kUYVY;
  } else if (sub_type == MEDIASUBTYPE_HDYC) {
    capability->videoType = VideoType::kUYVY;
  } else if (sub_type == MEDIASUBTYPE_RGB24 && bih->biCompression == BI_RGB) {
    capability->videoType = VideoType::kRGB24;
  } else {
    return false;
  }

  // Store the incoming width and height
  capability->width = bih->biWidth;

  // Store the incoming height,
  // for RGB24 we assume the frame to be upside down
  if (sub_type == MEDIASUBTYPE_RGB24 && bih->biHeight > 0) {
    capability->height = -(bih->biHeight);
  } else {
    capability->height = abs(bih->biHeight);
  }

  return true;
}
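The sub_type branches above compare bih->biCompression against FourCC codes built with MAKEFOURCC. As a quick illustration of what that macro produces, here is a minimal stand-in (MakeFourCc is just an illustrative name, not a WebRTC or Windows API):

#include <cstdint>
#include <cstdio>

// Minimal stand-in for the Windows MAKEFOURCC macro: the four characters
// are packed little-endian into one 32-bit value.
constexpr uint32_t MakeFourCc(char a, char b, char c, char d) {
  return static_cast<uint32_t>(static_cast<uint8_t>(a)) |
         (static_cast<uint32_t>(static_cast<uint8_t>(b)) << 8) |
         (static_cast<uint32_t>(static_cast<uint8_t>(c)) << 16) |
         (static_cast<uint32_t>(static_cast<uint8_t>(d)) << 24);
}

int main() {
  // 'M','J','P','G' -> 0x47504A4D; this is the value an MJPEG capture
  // device reports in bih->biCompression.
  std::printf("MJPG = 0x%08X\n",
              static_cast<unsigned>(MakeFourCc('M', 'J', 'P', 'G')));
  return 0;
}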

2. Processing frames from the camera (CaptureSinkFilter::ProcessCapturedFrame())

The previous function talks directly to the Windows platform; this one is a thin bridge between the Windows layer and the layers above, and still belongs to the Windows layer. As the code shows, it simply calls IncomingFrame() to pass the captured frame upward. ProcessCapturedFrame() is defined in modules/video_capture/windows/sink_filter_ds.cc.

void CaptureSinkFilter::ProcessCapturedFrame(
    unsigned char* buffer,
    size_t length,
    const VideoCaptureCapability& frame_info) {
  // Called on the capture thread.
  capture_observer_->IncomingFrame(buffer, length, frame_info);
}

3. Processing Windows-layer frame info and routing it to the frame handlers (VideoCaptureImpl::IncomingFrame())

After the Windows layer has produced a frame, VideoCaptureImpl processes it here: it parses some metadata and routes the frame to the appropriate handler. Concretely there are two paths, RawFrame and Frame: a RawFrame is the original, unconverted frame, while a Frame has been converted (normally to I420).

int32_t VideoCaptureImpl::IncomingFrame(uint8_t* videoFrame,
                                        size_t videoFrameLength,
                                        const VideoCaptureCapability& frameInfo,
                                        int64_t captureTime /*=0*/) {
  // Check that this code runs on the expected sequence.
  RTC_CHECK_RUNS_SERIALIZED(&capture_checker_);
  MutexLock lock(&api_lock_);

  const int32_t width = frameInfo.width;
  const int32_t height = frameInfo.height;

  TRACE_EVENT1("webrtc", "VC::IncomingFrame", "capture_time", captureTime);
  // If a raw-data callback is registered, deliver the frame unconverted via
  // DeliverRawFrame().
  if (_rawDataCallBack) {
    DeliverRawFrame(videoFrame, videoFrameLength, frameInfo, captureTime);
    return 0;
  }

  // Not encoded, convert to I420.
  if (frameInfo.videoType != VideoType::kMJPEG) {
    // Allow buffers larger than expected. On linux gstreamer allocates buffers
    // page-aligned and v4l2loopback passes us the buffer size verbatim which
    // for most cases is larger than expected.
    // See https://github.com/umlaeute/v4l2loopback/issues/190.
    // Check that the received data length is plausible.
    if (auto size = CalcBufferSize(frameInfo.videoType, width, abs(height));
        videoFrameLength < size) {
      RTC_LOG(LS_ERROR) << "Wrong incoming frame length. Expected " << size
                        << ", Got " << videoFrameLength << ".";
      return -1;
    }
  }

  int stride_y = width;
  int stride_uv = (width + 1) / 2;
  int target_width = width;
  int target_height = abs(height);
  // Adjust the target resolution if rotation is applied.
  if (apply_rotation_) {
    // Swap width and height for 90/270 degree rotations.
    if (_rotateFrame == kVideoRotation_90 ||
        _rotateFrame == kVideoRotation_270) {
      target_width = abs(height);
      target_height = width;
    }
  }

  // Setting absolute height (in case it was negative).
  // In Windows, the image starts bottom left, instead of top left.
  // Setting a negative source height, inverts the image (within LibYuv).
  rtc::scoped_refptr<I420Buffer> buffer = I420Buffer::Create(
      target_width, target_height, stride_y, stride_uv, stride_uv);

  libyuv::RotationMode rotation_mode = libyuv::kRotate0;
  // Determine the rotation mode.
  if (apply_rotation_) {
    switch (_rotateFrame) {
      case kVideoRotation_0:
        rotation_mode = libyuv::kRotate0;
        break;
      case kVideoRotation_90:
        rotation_mode = libyuv::kRotate90;
        break;
      case kVideoRotation_180:
        rotation_mode = libyuv::kRotate180;
        break;
      case kVideoRotation_270:
        rotation_mode = libyuv::kRotate270;
        break;
    }
  }
  // Convert the image to I420 using libyuv's conversion routine.
  const int conversionResult = libyuv::ConvertToI420(
      videoFrame, videoFrameLength, buffer.get()->MutableDataY(),
      buffer.get()->StrideY(), buffer.get()->MutableDataU(),
      buffer.get()->StrideU(), buffer.get()->MutableDataV(),
      buffer.get()->StrideV(), 0, 0,  // No Cropping
      width, height, target_width, target_height, rotation_mode,
      ConvertVideoType(frameInfo.videoType));
  if (conversionResult != 0) {
    RTC_LOG(LS_ERROR) << "Failed to convert capture frame from type "
                      << static_cast<int>(frameInfo.videoType) << " to I420.";
    return -1;
  }
  // Build the VideoFrame.
  VideoFrame captureFrame =
      VideoFrame::Builder()
          .set_video_frame_buffer(buffer)
          .set_rtp_timestamp(0)
          .set_timestamp_ms(rtc::TimeMillis())
          .set_rotation(!apply_rotation_ ? _rotateFrame : kVideoRotation_0)
          .build();
  captureFrame.set_ntp_time_ms(captureTime);
  // Hand the converted I420 frame to the layer above.
  DeliverCapturedFrame(captureFrame);

  return 0;
}
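The length check near the top relies on CalcBufferSize() to compute the minimum plausible size for a frame of the given type and resolution. As a rough illustration of that arithmetic (this sketch is not WebRTC's CalcBufferSize; the enum and helper name are made up for the example):

#include <cstddef>

// Hypothetical format enum for this sketch (WebRTC uses VideoType).
enum class PixFormat { kI420, kYUY2, kUYVY, kRGB24 };

// Rough sketch of the minimum-size arithmetic; not WebRTC's CalcBufferSize.
size_t ExpectedBufferSize(PixFormat fmt, int width, int height) {
  const size_t w = static_cast<size_t>(width);
  const size_t h = static_cast<size_t>(height);
  const size_t half_w = (w + 1) / 2;
  const size_t half_h = (h + 1) / 2;
  switch (fmt) {
    case PixFormat::kI420:  // planar 4:2:0 -> 12 bits per pixel
      return w * h + 2 * half_w * half_h;
    case PixFormat::kYUY2:
    case PixFormat::kUYVY:  // packed 4:2:2 -> 16 bits per pixel
      return w * h * 2;
    case PixFormat::kRGB24:  // packed RGB -> 24 bits per pixel
      return w * h * 3;
  }
  return 0;
}

// E.g. a 640x480 I420 frame needs 460800 bytes; any shorter incoming buffer
// is rejected by IncomingFrame() with "Wrong incoming frame length".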

4. The frame handlers (VideoCaptureImpl::DeliverCapturedFrame && DeliverRawFrame)

Depending on the frame, there are two cases: RawFrame, the original frame (not I420), and Frame, the converted I420 frame. Raw frames are delivered through DeliverRawFrame(), converted frames through DeliverCapturedFrame().

void VideoCaptureImpl::DeliverRawFrame(uint8_t* videoFrame,
                                       size_t videoFrameLength,
                                       const VideoCaptureCapability& frameInfo,
                                       int64_t captureTime) {
  RTC_CHECK_RUNS_SERIALIZED(&capture_checker_);
  // Update the frame count.
  UpdateFrameCount();
  // Deliver the raw frame.
  _rawDataCallBack->OnRawFrame(videoFrame, videoFrameLength, frameInfo,
                               _rotateFrame, captureTime);
}

int32_t VideoCaptureImpl::DeliverCapturedFrame(VideoFrame& captureFrame) {
  RTC_CHECK_RUNS_SERIALIZED(&capture_checker_);
  // Update the frame count.
  UpdateFrameCount();  // frame count used for local frame rate callback.
  // Deliver the converted frame.
  if (_dataCallBack) {
    _dataCallBack->OnFrame(captureFrame);
  }

  return 0;
}

Both functions call UpdateFrameCount(), which maintains a history of incoming-frame timestamps:

void VideoCaptureImpl::UpdateFrameCount() {
  RTC_CHECK_RUNS_SERIALIZED(&capture_checker_);

  if (_incomingFrameTimesNanos[0] / rtc::kNumNanosecsPerMicrosec == 0) {
    // First frame: no shift needed.
  } else {
    // Shift the history entries down by one slot.
    for (int i = (kFrameRateCountHistorySize - 2); i >= 0; --i) {
      _incomingFrameTimesNanos[i + 1] = _incomingFrameTimesNanos[i];
    }
  }
  _incomingFrameTimesNanos[0] = rtc::TimeNanos();
}
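The newest-first timestamp history filled in here is what the local frame-rate statistics are computed from (WebRTC does this in VideoCaptureImpl::CalculateFrameRate). Below is a simplified sketch of the idea, not the exact WebRTC algorithm: count how many history entries fall within a recent window.

#include <cstdint>

constexpr int kFrameRateCountHistorySize = 90;
constexpr int64_t kNumNanosecsPerSec = 1000000000;
// Assumption for this sketch: frames within the last second count.
constexpr int64_t kRateWindowNanos = kNumNanosecsPerSec;

// The history is newest-first (slot 0 is the latest frame). Counting the
// entries inside the window ending at `now_ns` gives a frames-per-window
// estimate, i.e. FPS for a one-second window.
int EstimateFrameRate(const int64_t (&times_ns)[kFrameRateCountHistorySize],
                      int64_t now_ns) {
  int frames = 0;
  for (int i = 0; i < kFrameRateCountHistorySize; ++i) {
    if (times_ns[i] <= 0 || now_ns - times_ns[i] > kRateWindowNanos)
      break;  // newest-first: everything after this is older still
    ++frames;
  }
  return frames;
}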

With the flow above, a video frame has been captured; it can now be rendered, encoded, and so on. OnFrame() and OnRawFrame() are callbacks, so what happens next depends on who registered them. On VideoCaptureImpl, callbacks are registered through RegisterCaptureDataCallback(), an overloaded function that can set either _dataCallBack or _rawDataCallBack (the RTC_DCHECKs below show the two are mutually exclusive). A minimal usage sketch follows the two overloads.

void VideoCaptureImpl::RegisterCaptureDataCallback(
    rtc::VideoSinkInterface<VideoFrame>* dataCallBack) {
  MutexLock lock(&api_lock_);
  RTC_DCHECK(!_rawDataCallBack);
  // Register the callback for converted frames.
  _dataCallBack = dataCallBack;
}

void VideoCaptureImpl::RegisterCaptureDataCallback(
    RawVideoSinkInterface* dataCallBack) {
  MutexLock lock(&api_lock_);
  RTC_DCHECK(!_dataCallBack);
  // Register the callback for raw frames.
  _rawDataCallBack = dataCallBack;
}
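For the converted-frame path, anything implementing rtc::VideoSinkInterface<VideoFrame> can be registered. A minimal sketch (the LoggingSink class is hypothetical, written only to show the shape of the callback):

#include "api/video/video_frame.h"
#include "api/video/video_sink_interface.h"
#include "rtc_base/logging.h"

// Hypothetical sink: receives every converted I420 frame via OnFrame().
class LoggingSink : public rtc::VideoSinkInterface<webrtc::VideoFrame> {
 public:
  void OnFrame(const webrtc::VideoFrame& frame) override {
    RTC_LOG(LS_INFO) << "Captured frame " << frame.width() << "x"
                     << frame.height() << " at " << frame.timestamp_us();
  }
};

// Usage, assuming `module` is a rtc::scoped_refptr<webrtc::VideoCaptureModule>:
//   LoggingSink sink;
//   module->RegisterCaptureDataCallback(&sink);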

Example: the callback used by the encoder

If the captured frame is destined for the video encoder, the OnFrame() below is invoked. It fixes up timestamp information and checks the congestion window and whether the encoder is blocked; if everything is in order it calls MaybeEncodeVideoFrame() to run the frame through the encoding pipeline.

void VideoStreamEncoder::OnFrame(Timestamp post_time,
                                 bool queue_overload,
                                 const VideoFrame& video_frame) {
  RTC_DCHECK_RUN_ON(encoder_queue_.get());
  VideoFrame incoming_frame = video_frame;

  // In some cases, e.g., when the frame from decoder is fed to encoder,
  // the timestamp may be set to the future. As the encoding pipeline assumes
  // capture time to be less than present time, we should reset the capture
  // timestamps here. Otherwise there may be issues with RTP send stream.
  if (incoming_frame.timestamp_us() > post_time.us())
    incoming_frame.set_timestamp_us(post_time.us());

  // Capture time may come from clock with an offset and drift from clock_.
  // NTP: Network Time Protocol
  // RTP: Real-time Transport Protocol
  int64_t capture_ntp_time_ms;
  if (video_frame.ntp_time_ms() > 0) {
    capture_ntp_time_ms = video_frame.ntp_time_ms();
  } else if (video_frame.render_time_ms() != 0) {
    capture_ntp_time_ms = video_frame.render_time_ms() + delta_ntp_internal_ms_;
  } else {
    capture_ntp_time_ms = post_time.ms() + delta_ntp_internal_ms_;
  }
  incoming_frame.set_ntp_time_ms(capture_ntp_time_ms);

  // Convert NTP time, in ms, to RTP timestamp.
  const int kMsToRtpTimestamp = 90;
  incoming_frame.set_rtp_timestamp(
      kMsToRtpTimestamp * static_cast<uint32_t>(incoming_frame.ntp_time_ms()));

  // Identifier should remain the same for newly produced incoming frame and
  // the received |video_frame|.
  incoming_frame.set_presentation_timestamp(
      video_frame.presentation_timestamp());
  // If this frame's NTP timestamp is not newer than the previous frame's,
  // drop it.
  if (incoming_frame.ntp_time_ms() <= last_captured_timestamp_) {
    // We don't allow the same capture time for two frames, drop this one.
    RTC_LOG(LS_WARNING) << "Same/old NTP timestamp ("
                        << incoming_frame.ntp_time_ms()
                        << " <= " << last_captured_timestamp_
                        << ") for incoming frame. Dropping.";
    // Dropping the frame keeps NTP timestamps strictly increasing.
    ProcessDroppedFrame(incoming_frame,
                        VideoStreamEncoderObserver::DropReason::kBadTimestamp);
    return;
  }

  bool log_stats = false;
  if (post_time.ms() - last_frame_log_ms_ > kFrameLogIntervalMs) {
    last_frame_log_ms_ = post_time.ms();
    log_stats = true;
  }

  last_captured_timestamp_ = incoming_frame.ntp_time_ms();
  // Notify the stats observer that a new frame has arrived.
  encoder_stats_observer_->OnIncomingFrame(incoming_frame.width(),
                                           incoming_frame.height());
  // Optionally instrument the frame.
  if (frame_instrumentation_generator_) {
    frame_instrumentation_generator_->OnCapturedFrame(incoming_frame);
  }
  // Count captured frames.
  ++captured_frame_count_;
  // Should this frame be dropped due to congestion window pushback?
  bool cwnd_frame_drop =
      cwnd_frame_drop_interval_ &&
      (cwnd_frame_counter_++ % cwnd_frame_drop_interval_.value() == 0);
  // If the queue is not overloaded and the congestion window does not force
  // a drop, the frame may be encoded.
  if (!queue_overload && !cwnd_frame_drop) {
    MaybeEncodeVideoFrame(incoming_frame, post_time.us());
  } else {
    if (cwnd_frame_drop) {
      // Frame drop by congestion window pushback. Do not encode this
      // frame.
      ++dropped_frame_cwnd_pushback_count_;
    } else {
      // There is a newer frame in flight. Do not encode this frame.
      RTC_LOG(LS_VERBOSE)
          << "Incoming frame dropped due to that the encoder is blocked.";
      ++dropped_frame_encoder_block_count_;
    }
    // Drop the current frame.
    ProcessDroppedFrame(
        incoming_frame,
        cwnd_frame_drop
            ? VideoStreamEncoderObserver::DropReason::kCongestionWindow
            : VideoStreamEncoderObserver::DropReason::kEncoderQueue);
  }
  // Log stats.
  if (log_stats) {
    RTC_LOG(LS_INFO) << "Number of frames: captured " << captured_frame_count_
                     << ", dropped (due to congestion window pushback) "
                     << dropped_frame_cwnd_pushback_count_
                     << ", dropped (due to encoder blocked) "
                     << dropped_frame_encoder_block_count_ << ", interval_ms "
                     << kFrameLogIntervalMs;
    captured_frame_count_ = 0;
    dropped_frame_cwnd_pushback_count_ = 0;
    dropped_frame_encoder_block_count_ = 0;
  }
}
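The constant 90 in kMsToRtpTimestamp reflects the 90 kHz RTP clock used for video: one millisecond corresponds to 90 RTP ticks, and the deliberate narrowing to uint32_t makes the timestamp wrap as RTP requires (at 90 kHz, roughly every 13.25 hours). A small worked sketch of the same conversion:

#include <cstdint>
#include <cstdio>

// Video RTP timestamps run on a 90 kHz clock -> 90 ticks per millisecond.
constexpr uint32_t kMsToRtpTimestamp = 90;

uint32_t NtpMsToRtpTimestamp(int64_t ntp_time_ms) {
  // Same truncation as in OnFrame(): the 64-bit millisecond value is
  // narrowed to 32 bits before scaling, so the product wraps modulo 2^32.
  return kMsToRtpTimestamp * static_cast<uint32_t>(ntp_time_ms);
}

int main() {
  // One second of NTP time advances the RTP timestamp by 90000 ticks.
  std::printf("%u\n", static_cast<unsigned>(NtpMsToRtpTimestamp(1000)));
  return 0;
}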
