当前位置: 首页 > article >正文

DPDK系列之十六虚拟化virtio源码分析之virtio-user

一、virtio-user说明

在网络IO的半虚拟化中,vhost-user是目前最优的解决方案,DPDK同样采用了这种方式。vhost-user是为了解决在内核态处理数据过于复杂的问题而提出的一种方案:用用户态进程代替内核来完成数据处理,从而把数据交互的开销降到最低。在vhost-user的应用场景中,对容器的虚拟化支持是一个重点方向。virtio-user起初就是为了支持容器内部与DPDK通信而提出的,后来也发展到用于虚拟设备间的通信。
DPDK与Kernel的通信也叫做“exception path”,通常来说,这种通信方式主要有几种:
1、KNI,是目前DPDK用户使用的主要方案,即通过虚拟网络接口,利用队列与DPDK应用交换数据;但KNI的内核模块无法进入Linux内核主线(upstream,即合入上游内核代码),需要在内核树外单独维护
2、Tun/Tap或者pcap PMD.需要内核切换,效率差
3、Flow Bifurcation,虚拟多张网卡,依赖硬件,不灵活
4、virtio-user和vhost-net,这是比较好的一种实现机制。
virtio-user在DPDK和虚拟场景下的应用还是非常多的。virtio-user虚拟出的设备和真实的设备在上层看没有区别,这个非常重要。

二、数据结构

下面看一下在DPDK中相关的数据结构定义:

/*
 * Small per-queue bookkeeping record kept by virtio-user.
 *
 * struct virtio_user_dev holds one of these per virtqueue in its
 * packed_queues[] array.
 * NOTE(review): the array name and the wrap counters suggest this state is
 * only meaningful for the packed ring layout - confirm against the users.
 */
struct virtio_user_queue {
	/* NOTE(review): presumably the next used-ring position to handle - confirm */
	uint16_t used_idx;
	/* NOTE(review): presumably the current wrap state of the avail side - confirm */
	bool avail_wrap_counter;
	/* NOTE(review): presumably the current wrap state of the used side - confirm */
	bool used_wrap_counter;
};

/*
 * State of one virtio-user device.
 *
 * Holds the backend file descriptors, the per-queue call/kick descriptors
 * and vrings, the feature masks computed by virtio_user_dev_init(), and the
 * backend operation table selected by virtio_user_dev_setup().
 */
struct virtio_user_dev {
	/* for vhost_user backend */
	/* fd vhost_user_sock() sends requests on; set to -1 by virtio_user_dev_setup() */
	int		vhostfd;
	int		listenfd;   /* listening fd */
	bool		is_server;  /* server or client mode */

	/* for vhost_kernel backend */
	/* taken over from the caller's *ifname in virtio_user_dev_init() */
	char		*ifname;
	/*
	 * Arrays of max_queue_pairs entries, allocated (and filled with -1) by
	 * virtio_user_dev_setup() only when the kernel backend is selected;
	 * NULL otherwise.
	 */
	int		*vhostfds;
	int		*tapfds;

	/* for both vhost_user and vhost_kernel */
	/* NOTE(review): presumably one call/kick notification fd per virtqueue - confirm */
	int		callfds[VIRTIO_MAX_VIRTQUEUES];
	int		kickfds[VIRTIO_MAX_VIRTQUEUES];
	/* cleared before parse_mac(); when non-zero, VIRTIO_NET_F_MAC is offered */
	int		mac_specified;
	/* taken from the "queues" argument of virtio_user_dev_init() */
	uint32_t	max_queue_pairs;
	/* starts at 1 (multiqueue disabled by default) */
	uint32_t	queue_pairs;
	uint32_t	queue_size;
	uint64_t	features; /* the negotiated features with driver,
				   * and will be sync with device
				   */
	uint64_t	device_features; /* supported features by device */
	uint64_t	frontend_features; /* enabled frontend features */
	uint64_t	unsupported_features; /* unsupported features mask */
	/* last value written by virtio_user_set_status() */
	uint8_t		status;
	uint16_t	net_status;
	uint16_t	port_id;
	uint8_t		mac_addr[RTE_ETHER_ADDR_LEN];
	/* backend path; copied in virtio_user_dev_init(), selects the backend type */
	char		path[PATH_MAX];
	/* split and packed ring descriptions share storage; only one layout is in use */
	union {
		struct vring		vrings[VIRTIO_MAX_VIRTQUEUES];
		struct vring_packed	packed_vrings[VIRTIO_MAX_VIRTQUEUES];
	};
	struct virtio_user_queue packed_queues[VIRTIO_MAX_VIRTQUEUES];
	bool		qp_enabled[VIRTIO_MAX_VIRTQUEUE_PAIRS];

	/* backend operations (vhost-user or vhost-kernel), chosen in virtio_user_dev_setup() */
	struct virtio_user_backend_ops *ops;
	/* held by virtio_user_start_device() for the whole start sequence */
	pthread_mutex_t	mutex;
	/* set to true once virtio_user_start_device() has completed successfully */
	bool		started;
};

除了虚拟设备本身之外,这个结构体主要涉及与vhost后端以及相关数据队列的交互操作,而这些数据结构在前面的文章中已经基本都介绍过了。

三、基本流程

前面已经说过,virtio-user在虚拟环境中应用非常广泛。在virtio-user的源码目录(drivers/net/virtio/virtio_user)下可以看到,最主要的就是那几个文件。先看设备的初始化函数:

int
virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
		     int cq, int queue_size, const char *mac, char **ifname,
		     int server, int mrg_rxbuf, int in_order, int packed_vq)
{
	pthread_mutex_init(&dev->mutex, NULL);
	strlcpy(dev->path, path, PATH_MAX);
	dev->started = 0;
	dev->max_queue_pairs = queues;
	dev->queue_pairs = 1; /* mq disabled by default */
	dev->queue_size = queue_size;
	dev->is_server = server;
	dev->mac_specified = 0;
	dev->frontend_features = 0;
	dev->unsupported_features = ~VIRTIO_USER_SUPPORTED_FEATURES;
	parse_mac(dev, mac);

	if (*ifname) {
		dev->ifname = *ifname;
		*ifname = NULL;
	}

	if (virtio_user_dev_setup(dev) < 0) {
		PMD_INIT_LOG(ERR, "backend set up fails");
		return -1;
	}

	if (!dev->is_server) {
		if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER,
					   NULL) < 0) {
			PMD_INIT_LOG(ERR, "set_owner fails: %s",
				     strerror(errno));
			return -1;
		}

		if (dev->ops->send_request(dev, VHOST_USER_GET_FEATURES,
					   &dev->device_features) < 0) {
			PMD_INIT_LOG(ERR, "get_features failed: %s",
				     strerror(errno));
			return -1;
		}
	} else {
		/* We just pretend vhost-user can support all these features.
		 * Note that this could be problematic that if some feature is
		 * negotiated but not supported by the vhost-user which comes
		 * later.
		 */
		dev->device_features = VIRTIO_USER_SUPPORTED_FEATURES;
	}

	if (!mrg_rxbuf)
		dev->unsupported_features |= (1ull << VIRTIO_NET_F_MRG_RXBUF);

	if (!in_order)
		dev->unsupported_features |= (1ull << VIRTIO_F_IN_ORDER);

	if (!packed_vq)
		dev->unsupported_features |= (1ull << VIRTIO_F_RING_PACKED);

	if (dev->mac_specified)
		dev->frontend_features |= (1ull << VIRTIO_NET_F_MAC);
	else
		dev->unsupported_features |= (1ull << VIRTIO_NET_F_MAC);

	if (cq) {
		/* device does not really need to know anything about CQ,
		 * so if necessary, we just claim to support CQ
		 */
		dev->frontend_features |= (1ull << VIRTIO_NET_F_CTRL_VQ);
	} else {
		dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VQ);
		/* Also disable features that depend on VIRTIO_NET_F_CTRL_VQ */
		dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_RX);
		dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VLAN);
		dev->unsupported_features |=
			(1ull << VIRTIO_NET_F_GUEST_ANNOUNCE);
		dev->unsupported_features |= (1ull << VIRTIO_NET_F_MQ);
		dev->unsupported_features |=
			(1ull << VIRTIO_NET_F_CTRL_MAC_ADDR);
	}

	/* The backend will not report this feature, we add it explicitly */
	if (is_vhost_user_by_type(dev->path))
		dev->frontend_features |= (1ull << VIRTIO_NET_F_STATUS);

	/*
	 * Device features =
	 *     (frontend_features | backend_features) & ~unsupported_features;
	 */
	dev->device_features |= dev->frontend_features;
	dev->device_features &= ~dev->unsupported_features;

	if (rte_mem_event_callback_register(VIRTIO_USER_MEM_EVENT_CLB_NAME,
				virtio_user_mem_event_cb, dev)) {
		if (rte_errno != ENOTSUP) {
			PMD_INIT_LOG(ERR, "Failed to register mem event"
					" callback\n");
			return -1;
		}
	}

	return 0;
}

先是对设备的初始化,然后进行Setup:

/*
 * Select the backend for @dev and perform its basic setup.
 *
 * Backend selection:
 *   - server mode always uses virtio_ops_user; an existing path that is not
 *     a vhost-user path is rejected;
 *   - client mode uses virtio_ops_user when the path is a vhost-user path,
 *     otherwise virtio_ops_kernel. For the kernel backend two arrays of
 *     max_queue_pairs descriptors (vhostfds, tapfds) are allocated and
 *     filled with -1.
 *
 * After that the backend's setup hook runs, followed by
 * virtio_user_dev_init_notify() and virtio_user_fill_intr_handle().
 *
 * Returns 0 on success, -1 on failure. If allocating the descriptor arrays
 * fails, both are released and reset to NULL so that no partial allocation
 * is left behind in @dev.
 */
static int
virtio_user_dev_setup(struct virtio_user_dev *dev)
{
	uint32_t q;

	dev->vhostfd = -1;
	dev->vhostfds = NULL;
	dev->tapfds = NULL;

	if (dev->is_server) {
		if (access(dev->path, F_OK) == 0 &&
		    !is_vhost_user_by_type(dev->path)) {
			PMD_DRV_LOG(ERR, "Server mode doesn't support vhost-kernel!");
			return -1;
		}
		dev->ops = &virtio_ops_user;
	} else if (is_vhost_user_by_type(dev->path)) {
		dev->ops = &virtio_ops_user;
	} else {
		dev->ops = &virtio_ops_kernel;

		dev->vhostfds = malloc(dev->max_queue_pairs *
				       sizeof(*dev->vhostfds));
		dev->tapfds = malloc(dev->max_queue_pairs *
				     sizeof(*dev->tapfds));
		if (!dev->vhostfds || !dev->tapfds) {
			PMD_INIT_LOG(ERR, "Failed to malloc");
			/*
			 * One of the two allocations may have succeeded;
			 * free(NULL) is a no-op, so release both and leave
			 * no dangling pointer in the device.
			 */
			free(dev->vhostfds);
			free(dev->tapfds);
			dev->vhostfds = NULL;
			dev->tapfds = NULL;
			return -1;
		}

		/* -1 marks a descriptor slot that has not been opened yet */
		for (q = 0; q < dev->max_queue_pairs; ++q) {
			dev->vhostfds[q] = -1;
			dev->tapfds[q] = -1;
		}
	}

	if (dev->ops->setup(dev) < 0)
		return -1;

	if (virtio_user_dev_init_notify(dev) < 0)
		return -1;

	if (virtio_user_fill_intr_handle(dev) < 0)
		return -1;

	return 0;
}

然后在处理用户状态时可以启动:

//drivers/net/virtio/virtio_user_ethdev.c
/*
 * Apply a new device status byte to the virtio-user device behind @hw.
 *
 * When DRIVER_OK is set in @status the device is started; otherwise, when
 * @status is exactly the RESET value, the device is reset. The status is
 * recorded in the device in every case. The result of
 * virtio_user_start_device() is not checked here.
 */
static void
virtio_user_set_status(struct virtio_hw *hw, uint8_t status)
{
	struct virtio_user_dev *udev = virtio_user_get_dev(hw);

	if ((status & VIRTIO_CONFIG_STATUS_DRIVER_OK) != 0) {
		virtio_user_start_device(udev);
	} else if (status == VIRTIO_CONFIG_STATUS_RESET) {
		virtio_user_reset(hw);
	}

	udev->status = status;
}

/*
 * Bring the backend of @dev into the running state.
 *
 * Sequence (all under the memory read lock and dev->mutex):
 *   0. ask the backend to create every queue;
 *   1. send the negotiated features, minus the bits handled locally;
 *   2. share the memory regions (SET_MEM_TABLE);
 *   3. kick every queue;
 *   4. enable the first queue pair and mark the device as started.
 *
 * Returns 0 on success, -1 if any step before queue-pair enabling fails.
 * Nothing is rolled back on failure.
 */
int
virtio_user_start_device(struct virtio_user_dev *dev)
{
	/*
	 * Bits that are not forwarded to the backend:
	 *  - VIRTIO_NET_F_MAC: the MAC address is handled in vdev init;
	 *  - VIRTIO_NET_F_CTRL_VQ: devices do not really need to know;
	 *  - VIRTIO_NET_F_STATUS: also stripped before sending.
	 */
	const uint64_t local_only = (1ull << VIRTIO_NET_F_MAC) |
				    (1ull << VIRTIO_NET_F_CTRL_VQ) |
				    (1ull << VIRTIO_NET_F_STATUS);
	uint64_t backend_features;
	int rc = -1;

	/*
	 * XXX workaround!
	 *
	 * The two locks must always be taken in this order to avoid
	 * deadlocks, and this thread must not trigger any memory hotplug
	 * event until they are released. This is temporary and should be
	 * replaced once the memory subsystem offers proper support.
	 */
	rte_mcfg_mem_read_lock();
	pthread_mutex_lock(&dev->mutex);

	/* a vhost-user device without a connection cannot be started */
	if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0)
		goto unlock;

	/* Step 0: tell vhost to create queues */
	if (virtio_user_queue_setup(dev, virtio_user_create_queue) < 0)
		goto unlock;

	/* Step 1: set features */
	backend_features = dev->features & ~local_only;
	if (dev->ops->send_request(dev, VHOST_USER_SET_FEATURES,
				   &backend_features) < 0)
		goto unlock;
	PMD_DRV_LOG(INFO, "set features: %" PRIx64, backend_features);

	/* Step 2: share memory regions */
	if (dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL) < 0)
		goto unlock;

	/* Step 3: kick queues */
	if (virtio_user_queue_setup(dev, virtio_user_kick_queue) < 0)
		goto unlock;

	/* Step 4: enable queues; only the 1st queue pair is on by default */
	dev->ops->enable_qp(dev, 0, 1);

	dev->started = true;
	rc = 0;

unlock:
	/* TODO: on failure, free resources here or let the caller check */
	pthread_mutex_unlock(&dev->mutex);
	rte_mcfg_mem_read_unlock();

	return rc;
}

这里其实会调用send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL)来传递内存数据(ops中设置),如果后端为vhost-user时,即为vhost_user_sock。

//drivers/net/virtio/virtio_user/vhost_user.c
/*
 * Send one vhost-user request on dev->vhostfd and, for requests that expect
 * an answer, read and validate the reply.
 *
 * @dev: device whose vhostfd is used for the exchange.
 * @req: request type; unhandled types are rejected with -1.
 * @arg: request-specific argument:
 *       - GET_FEATURES:            out, __u64 receiving the feature bits;
 *       - SET_FEATURES/SET_LOG_BASE: in, __u64 value;
 *       - SET_LOG_FD:              in, int file descriptor to pass;
 *       - SET_VRING_NUM/BASE/ENABLE: in, vring state copied into the message;
 *       - GET_VRING_BASE:          in/out, vring state;
 *       - SET_VRING_ADDR:          in, vring address description;
 *       - SET_VRING_KICK/CALL/ERR: in, struct vhost_vring_file (index + fd);
 *       - SET_OWNER/RESET_OWNER/SET_MEM_TABLE: unused.
 *
 * Returns 0 on success, -1 on any failure (no connection in server mode,
 * unhandled request, write/read error, unexpected or badly sized reply).
 */
static int
vhost_user_sock(struct virtio_user_dev *dev,
		enum vhost_user_request req,
		void *arg)
{
	struct vhost_user_msg msg;
	struct vhost_vring_file *file = NULL;
	int need_reply = 0;
	int fds[VHOST_MEMORY_MAX_NREGIONS];
	int fd_num = 0;
	int len;
	int vhostfd = dev->vhostfd;

	/*
	 * NOTE(review): `m` is not declared in this function; presumably a
	 * file-scope struct vhost_user_msg. Sizes below are taken from the
	 * local `msg`, so this reference is kept only so `m` stays "used".
	 */
	RTE_SET_USED(m);

	PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]);

	/* in server mode there may be no peer connected yet */
	if (dev->is_server && vhostfd < 0)
		return -1;

	msg.request = req;
	msg.flags = VHOST_USER_VERSION;
	msg.size = 0;

	switch (req) {
	case VHOST_USER_GET_FEATURES:
		need_reply = 1;
		break;

	case VHOST_USER_SET_FEATURES:
	case VHOST_USER_SET_LOG_BASE:
		msg.payload.u64 = *((__u64 *)arg);
		msg.size = sizeof(msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
	case VHOST_USER_RESET_OWNER:
		break;

	case VHOST_USER_SET_MEM_TABLE:
		/* fills the memory payload and one fd per region */
		if (prepare_vhost_memory_user(&msg, fds) < 0)
			return -1;
		fd_num = msg.payload.memory.nregions;
		msg.size = sizeof(msg.payload.memory.nregions);
		msg.size += sizeof(msg.payload.memory.padding);
		msg.size += fd_num * sizeof(struct vhost_memory_region);
		break;

	case VHOST_USER_SET_LOG_FD:
		fds[fd_num++] = *((int *)arg);
		break;

	case VHOST_USER_GET_VRING_BASE:
		need_reply = 1;
		/* fallthrough */
	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		break;

	case VHOST_USER_SET_VRING_ADDR:
		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
		msg.size = sizeof(msg.payload.addr);
		break;

	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		file = arg;
		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
		msg.size = sizeof(msg.payload.u64);
		/*
		 * 0 is a valid file descriptor, so only a negative value
		 * means "no fd"; in that case flag it in the payload instead
		 * of passing a descriptor.
		 */
		if (file->fd >= 0)
			fds[fd_num++] = file->fd;
		else
			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
		break;

	default:
		PMD_DRV_LOG(ERR, "trying to send unhandled msg type");
		return -1;
	}

	len = VHOST_USER_HDR_SIZE + msg.size;
	if (vhost_user_write(vhostfd, &msg, len, fds, fd_num) < 0) {
		PMD_DRV_LOG(ERR, "%s failed: %s",
			    vhost_msg_strings[req], strerror(errno));
		return -1;
	}

	if (need_reply) {
		if (vhost_user_read(vhostfd, &msg) < 0) {
			PMD_DRV_LOG(ERR, "Received msg failed: %s",
				    strerror(errno));
			return -1;
		}

		/* the reply must echo the request type */
		if (req != msg.request) {
			PMD_DRV_LOG(ERR, "Received unexpected msg type");
			return -1;
		}

		switch (req) {
		case VHOST_USER_GET_FEATURES:
			if (msg.size != sizeof(msg.payload.u64)) {
				PMD_DRV_LOG(ERR, "Received bad msg size");
				return -1;
			}
			*((__u64 *)arg) = msg.payload.u64;
			break;
		case VHOST_USER_GET_VRING_BASE:
			if (msg.size != sizeof(msg.payload.state)) {
				PMD_DRV_LOG(ERR, "Received bad msg size");
				return -1;
			}
			memcpy(arg, &msg.payload.state,
			       sizeof(struct vhost_vring_state));
			break;
		default:
			PMD_DRV_LOG(ERR, "Received unexpected msg type");
			return -1;
		}
	}

	return 0;
}

找到VHOST_USER_SET_MEM_TABLE对应的分支,就可以看到内存数据的准备过程;从调用的函数一路深入进去,就能明白整个过程,这里就不再做介绍。

四、总结

通过上面的分析可以看出,virtio-user既可以实现虚拟机前后端的通信,也可以实现不同设备间的通信,还可以实现与内核间的通信。所以一种新的技术被提出后,会不断地推动应用向前发展;反过来,应用的发展又不断要求前者提供更好的支持。二者互相促进,就会形成新的应用场景,并有可能爆发。


http://www.kler.cn/a/15748.html

相关文章:

  • 中仕公考怎么样?事业编面试不去有影响吗?
  • 《C语言程序设计现代方法》note-5 数组
  • 【Docker容器】一、一文了解docker
  • ubuntu22 安装 minikube
  • 速通前端篇 —— HTML
  • ThriveX 博客管理系统前后端项目部署教程
  • JS手撕代码系列【手写实现Promise】
  • 【Redis16】Redis进阶:内存优化
  • wifi芯片行业信息汇总
  • AcWing55. 连续子数组的最大和
  • 【柒志科技】面经 base上海
  • 了解hiberfil.sys文件:计算机休眠模式的背后
  • 【数据治理】数据治理的定义和价值
  • 标准错误重定向
  • 2023-04-30:用go语言重写ffmpeg的resampling_audio.c示例,它实现了音频重采样的功能。
  • 在全志V851S开发板上使用SSH配置步骤分析
  • 前端小白是如何利用chatgt用一周时间从做一款微信小程序的
  • 【MATLAB数据处理实用案例详解(15)】——利用BP神经网络实现个人信贷信用评估
  • 零基础想成为黑客,只需要四步
  • CF662C Binary Table
  • nvm安装使用详解,附gnvm介绍
  • 史上最全的接口测试,吐血整理从零到接口自动化实战...
  • 1992-2022年31省人均gdp/各省人均地区生产总值
  • @PostConstruct注解和@PreDestroy注解
  • 【AI生产力工具】Upscale.media:用AI技术提升照片质量,让你的作品更出色
  • 【LeetCode股票买卖系列:121. 买卖股票的最佳时机 | 一次遍历 | 暴力递归=>记忆化搜索=>动态规划】