DPDK系列之十六虚拟化virtio源码分析之virtio-user
一、virtio-user说明
在网络IO的半虚拟化中,vhost-user是目前最优的解决方案,DPDK同样采用了这种方式。vhost-user是为了解决在内核态处理数据过于复杂的问题而提出的一种方案:用用户态进程代替内核来充当后端,从而把数据交互的开销降到最低。在vhost-user的应用场景中,对容器的支持是一个重点方向。virtio-user最初就是为了支持容器内部与DPDK通信而提出的,后来也发展到用于虚拟设备间的通信。
DPDK与Kernel的通信也叫做“exception path”,通常来说,这种通信方式主要有几种:
1、KNI,是目前DPDK用户使用的主要方案。即通过虚拟网络接口,利用队列和DPDK应用交换数据,但KNI的内核模块无法合入内核主线(upstream),需要单独编译和维护
2、Tun/Tap或者pcap PMD.需要内核切换,效率差
3、Flow Bifurcation,虚拟多张网卡,依赖硬件,不灵活
4、virtio-user和vhost-net,这是比较好的一种实现机制。
virtio-user在DPDK和虚拟场景下的应用还是非常多的。virtio-user虚拟出的设备和真实的设备在上层看没有区别,这个非常重要。
二、数据结构
下面看一下在DPDK中相关的数据结构定义:
/*
 * Per-queue bookkeeping kept by virtio-user. struct virtio_user_dev holds one
 * entry per virtqueue in its packed_queues[] array.
 */
struct virtio_user_queue {
	uint16_t used_idx;        /* presumably the next used-ring index to process — TODO confirm against users */
	bool avail_wrap_counter;  /* wrap counter for the avail side (packed ring, judging by packed_queues[]) */
	bool used_wrap_counter;   /* wrap counter for the used side (packed ring, judging by packed_queues[]) */
};
/*
 * State of one virtio-user device: backend file descriptors, negotiated
 * feature masks, vring storage and the backend operation table.
 */
struct virtio_user_dev {
	/* for vhost_user backend */
	int vhostfd;    /* fd used to talk to the backend; reset to -1 by virtio_user_dev_setup() */
	int listenfd; /* listening fd */
	bool is_server; /* server or client mode */
	/* for vhost_kernel backend */
	char *ifname;   /* taken over from the caller's *ifname in virtio_user_dev_init() */
	int *vhostfds;  /* max_queue_pairs entries, malloc'ed in virtio_user_dev_setup(), each initialised to -1 */
	int *tapfds;    /* max_queue_pairs entries, malloc'ed in virtio_user_dev_setup(), each initialised to -1 */
	/* for both vhost_user and vhost_kernel */
	int callfds[VIRTIO_MAX_VIRTQUEUES];
	int kickfds[VIRTIO_MAX_VIRTQUEUES];
	int mac_specified;         /* non-zero makes init advertise VIRTIO_NET_F_MAC as a frontend feature */
	uint32_t max_queue_pairs;  /* set from the "queues" argument of virtio_user_dev_init() */
	uint32_t queue_pairs;      /* starts at 1: multi-queue is disabled by default */
	uint32_t queue_size;
	/*
	 * The features negotiated with the driver; they are synced to the
	 * device (sent with VHOST_USER_SET_FEATURES in
	 * virtio_user_start_device()).
	 */
	uint64_t features;
	uint64_t device_features; /* supported features by device */
	uint64_t frontend_features; /* enabled frontend features */
	uint64_t unsupported_features; /* unsupported features mask */
	uint8_t status;            /* last value written by virtio_user_set_status() */
	uint16_t net_status;
	uint16_t port_id;
	uint8_t mac_addr[RTE_ETHER_ADDR_LEN];
	char path[PATH_MAX];       /* backend path, copied in virtio_user_dev_init() */
	/* split and packed ring descriptions share the same storage */
	union {
		struct vring vrings[VIRTIO_MAX_VIRTQUEUES];
		struct vring_packed packed_vrings[VIRTIO_MAX_VIRTQUEUES];
	};
	struct virtio_user_queue packed_queues[VIRTIO_MAX_VIRTQUEUES];
	bool qp_enabled[VIRTIO_MAX_VIRTQUEUE_PAIRS];
	struct virtio_user_backend_ops *ops; /* backend operations, selected in virtio_user_dev_setup() */
	pthread_mutex_t mutex;     /* held by virtio_user_start_device() while it configures the backend */
	bool started;              /* set to true once virtio_user_start_device() succeeds */
};
除了虚拟设备自身的属性之外,这个结构体主要保存的是与vhost后端以及相关数据队列(vring)操作有关的信息,而那些数据结构在前面已经基本都介绍过了。
三、基本流程
前面说过,virtio-user在虚拟化环境中应用非常广泛。相关代码位于drivers/net/virtio/virtio_user目录下,最主要的就是其中的那几个文件。先看设备的初始化函数:
int
virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
int cq, int queue_size, const char *mac, char **ifname,
int server, int mrg_rxbuf, int in_order, int packed_vq)
{
pthread_mutex_init(&dev->mutex, NULL);
strlcpy(dev->path, path, PATH_MAX);
dev->started = 0;
dev->max_queue_pairs = queues;
dev->queue_pairs = 1; /* mq disabled by default */
dev->queue_size = queue_size;
dev->is_server = server;
dev->mac_specified = 0;
dev->frontend_features = 0;
dev->unsupported_features = ~VIRTIO_USER_SUPPORTED_FEATURES;
parse_mac(dev, mac);
if (*ifname) {
dev->ifname = *ifname;
*ifname = NULL;
}
if (virtio_user_dev_setup(dev) < 0) {
PMD_INIT_LOG(ERR, "backend set up fails");
return -1;
}
if (!dev->is_server) {
if (dev->ops->send_request(dev, VHOST_USER_SET_OWNER,
NULL) < 0) {
PMD_INIT_LOG(ERR, "set_owner fails: %s",
strerror(errno));
return -1;
}
if (dev->ops->send_request(dev, VHOST_USER_GET_FEATURES,
&dev->device_features) < 0) {
PMD_INIT_LOG(ERR, "get_features failed: %s",
strerror(errno));
return -1;
}
} else {
/* We just pretend vhost-user can support all these features.
* Note that this could be problematic that if some feature is
* negotiated but not supported by the vhost-user which comes
* later.
*/
dev->device_features = VIRTIO_USER_SUPPORTED_FEATURES;
}
if (!mrg_rxbuf)
dev->unsupported_features |= (1ull << VIRTIO_NET_F_MRG_RXBUF);
if (!in_order)
dev->unsupported_features |= (1ull << VIRTIO_F_IN_ORDER);
if (!packed_vq)
dev->unsupported_features |= (1ull << VIRTIO_F_RING_PACKED);
if (dev->mac_specified)
dev->frontend_features |= (1ull << VIRTIO_NET_F_MAC);
else
dev->unsupported_features |= (1ull << VIRTIO_NET_F_MAC);
if (cq) {
/* device does not really need to know anything about CQ,
* so if necessary, we just claim to support CQ
*/
dev->frontend_features |= (1ull << VIRTIO_NET_F_CTRL_VQ);
} else {
dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VQ);
/* Also disable features that depend on VIRTIO_NET_F_CTRL_VQ */
dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_RX);
dev->unsupported_features |= (1ull << VIRTIO_NET_F_CTRL_VLAN);
dev->unsupported_features |=
(1ull << VIRTIO_NET_F_GUEST_ANNOUNCE);
dev->unsupported_features |= (1ull << VIRTIO_NET_F_MQ);
dev->unsupported_features |=
(1ull << VIRTIO_NET_F_CTRL_MAC_ADDR);
}
/* The backend will not report this feature, we add it explicitly */
if (is_vhost_user_by_type(dev->path))
dev->frontend_features |= (1ull << VIRTIO_NET_F_STATUS);
/*
* Device features =
* (frontend_features | backend_features) & ~unsupported_features;
*/
dev->device_features |= dev->frontend_features;
dev->device_features &= ~dev->unsupported_features;
if (rte_mem_event_callback_register(VIRTIO_USER_MEM_EVENT_CLB_NAME,
virtio_user_mem_event_cb, dev)) {
if (rte_errno != ENOTSUP) {
PMD_INIT_LOG(ERR, "Failed to register mem event"
" callback\n");
return -1;
}
}
return 0;
}
先是对设备进行初始化,初始化过程中会调用virtio_user_dev_setup来完成后端的Setup:
/*
 * Pick the backend operation table for the device and run the backend setup.
 *
 * Server mode always uses virtio_ops_user and rejects an existing path that
 * is_vhost_user_by_type() does not accept. In client mode the same check
 * chooses between virtio_ops_user and virtio_ops_kernel; for the kernel
 * backend the per-queue-pair vhostfds/tapfds arrays are allocated and every
 * entry is initialised to -1.
 *
 * Returns 0 on success, -1 on failure. If allocating the fd arrays fails,
 * both pointers are released and left NULL so nothing leaks or dangles.
 */
static int
virtio_user_dev_setup(struct virtio_user_dev *dev)
{
	uint32_t q;

	dev->vhostfd = -1;
	dev->vhostfds = NULL;
	dev->tapfds = NULL;

	if (dev->is_server) {
		if (access(dev->path, F_OK) == 0 &&
		    !is_vhost_user_by_type(dev->path)) {
			PMD_DRV_LOG(ERR, "Server mode doesn't support vhost-kernel!");
			return -1;
		}
		dev->ops = &virtio_ops_user;
	} else if (is_vhost_user_by_type(dev->path)) {
		dev->ops = &virtio_ops_user;
	} else {
		dev->ops = &virtio_ops_kernel;

		dev->vhostfds = malloc(dev->max_queue_pairs *
				       sizeof(*dev->vhostfds));
		dev->tapfds = malloc(dev->max_queue_pairs *
				     sizeof(*dev->tapfds));
		if (!dev->vhostfds || !dev->tapfds) {
			PMD_INIT_LOG(ERR, "Failed to malloc");
			/*
			 * Only one of the two allocations may have failed;
			 * release the other (free(NULL) is a no-op) and
			 * reset both pointers.
			 */
			free(dev->vhostfds);
			free(dev->tapfds);
			dev->vhostfds = NULL;
			dev->tapfds = NULL;
			return -1;
		}

		/* -1 marks a slot that holds no descriptor yet. */
		for (q = 0; q < dev->max_queue_pairs; ++q) {
			dev->vhostfds[q] = -1;
			dev->tapfds[q] = -1;
		}
	}

	if (dev->ops->setup(dev) < 0)
		return -1;

	if (virtio_user_dev_init_notify(dev) < 0)
		return -1;

	if (virtio_user_fill_intr_handle(dev) < 0)
		return -1;

	return 0;
}
然后在设置设备状态时,如果状态中带有VIRTIO_CONFIG_STATUS_DRIVER_OK,就会启动设备:
//drivers/net/virtio/virtio_user_ethdev.c
/*
 * Apply a new device status: start the device when DRIVER_OK is set, reset
 * it when the status equals RESET, then record the status in the device.
 */
static void
virtio_user_set_status(struct virtio_hw *hw, uint8_t status)
{
	struct virtio_user_dev *dev = virtio_user_get_dev(hw);
	const bool driver_ok = (status & VIRTIO_CONFIG_STATUS_DRIVER_OK) != 0;

	if (driver_ok) {
		/* the result of starting the device is not checked here */
		virtio_user_start_device(dev);
	} else if (status == VIRTIO_CONFIG_STATUS_RESET) {
		virtio_user_reset(hw);
	}

	dev->status = status;
}
/*
 * Bring the device up on the backend: create the queues, push the negotiated
 * features, share the memory table, kick the queues and enable the first
 * queue pair.
 *
 * Returns 0 on success, -1 on failure. Both locks are released on every path.
 */
int
virtio_user_start_device(struct virtio_user_dev *dev)
{
	uint64_t negotiated;
	int rc = -1;

	/*
	 * XXX workaround!
	 *
	 * The memory read lock is taken before the device mutex so that the
	 * locks are always acquired in the same order, which avoids
	 * deadlocks. Until it is released, this thread should not trigger
	 * any memory hotplug events.
	 *
	 * This is temporary and should be replaced once the memory
	 * subsystem offers proper support.
	 */
	rte_mcfg_mem_read_lock();
	pthread_mutex_lock(&dev->mutex);

	/* A vhost-user device cannot be started without a valid vhostfd. */
	if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0)
		goto unlock;

	/* Step 0: tell vhost to create queues */
	if (virtio_user_queue_setup(dev, virtio_user_create_queue) < 0)
		goto unlock;

	/*
	 * Step 1: set features.
	 * VIRTIO_NET_F_MAC is stripped because the MAC address is handled in
	 * vdev init, VIRTIO_NET_F_CTRL_VQ because devices do not really need
	 * to know about it; VIRTIO_NET_F_STATUS is stripped as well.
	 */
	negotiated = dev->features &
		     ~((1ull << VIRTIO_NET_F_MAC) |
		       (1ull << VIRTIO_NET_F_CTRL_VQ) |
		       (1ull << VIRTIO_NET_F_STATUS));
	if (dev->ops->send_request(dev, VHOST_USER_SET_FEATURES,
				   &negotiated) < 0)
		goto unlock;
	PMD_DRV_LOG(INFO, "set features: %" PRIx64, negotiated);

	/* Step 2: share memory regions */
	if (dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL) < 0)
		goto unlock;

	/* Step 3: kick queues */
	if (virtio_user_queue_setup(dev, virtio_user_kick_queue) < 0)
		goto unlock;

	/* Step 4: enable queues; only the 1st queue pair by default. */
	dev->ops->enable_qp(dev, 0, 1);

	dev->started = true;
	rc = 0;

unlock:
	pthread_mutex_unlock(&dev->mutex);
	rte_mcfg_mem_read_unlock();
	/* TODO: on failure, free resource here or caller to check */
	return rc;
}
这里会调用send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL)来把内存区域信息传递给后端(send_request在ops中设置);当后端为vhost-user时,send_request对应的就是vhost_user_sock。
//drivers/net/virtio/virtio_user/vhost_user.c
/*
 * Build one vhost-user request, send it on dev->vhostfd and, for the GET_*
 * requests, read back and validate the reply.
 *
 * dev: device whose vhostfd carries the message.
 * req: request type; it decides how arg is interpreted.
 * arg: request-specific argument:
 *      - GET_FEATURES:                    out, pointer to a 64-bit mask
 *      - SET_FEATURES / SET_LOG_BASE:     in, pointer to a 64-bit value
 *      - SET_LOG_FD:                      in, pointer to an int fd
 *      - SET_VRING_NUM / BASE / ENABLE:   in, vring state
 *      - GET_VRING_BASE:                  in/out, vring state
 *      - SET_VRING_ADDR:                  in, vring address
 *      - SET_VRING_KICK / CALL / ERR:     in, struct vhost_vring_file
 *      - SET_OWNER / RESET_OWNER / SET_MEM_TABLE: unused
 *
 * Returns 0 on success, -1 on failure.
 */
static int
vhost_user_sock(struct virtio_user_dev *dev,
		enum vhost_user_request req,
		void *arg)
{
	struct vhost_user_msg msg;
	struct vhost_vring_file *file = NULL;
	int need_reply = 0;
	int fds[VHOST_MEMORY_MAX_NREGIONS];
	int fd_num = 0;
	int len;
	int vhostfd = dev->vhostfd;

	/*
	 * "m" is declared outside this block (presumably a file-scope
	 * struct vhost_user_msg); keep it marked as used now that the sizes
	 * below are taken from the local msg.
	 */
	RTE_SET_USED(m);

	PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]);

	/* In server mode there may be no connected peer yet. */
	if (dev->is_server && vhostfd < 0)
		return -1;

	msg.request = req;
	msg.flags = VHOST_USER_VERSION;
	msg.size = 0;

	switch (req) {
	case VHOST_USER_GET_FEATURES:
		need_reply = 1;
		break;

	case VHOST_USER_SET_FEATURES:
	case VHOST_USER_SET_LOG_BASE:
		msg.payload.u64 = *((__u64 *)arg);
		msg.size = sizeof(msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
	case VHOST_USER_RESET_OWNER:
		break;

	case VHOST_USER_SET_MEM_TABLE:
		if (prepare_vhost_memory_user(&msg, fds) < 0)
			return -1;
		/* one fd is passed per memory region */
		fd_num = msg.payload.memory.nregions;
		msg.size = sizeof(msg.payload.memory.nregions);
		msg.size += sizeof(msg.payload.memory.padding);
		msg.size += fd_num * sizeof(struct vhost_memory_region);
		break;

	case VHOST_USER_SET_LOG_FD:
		fds[fd_num++] = *((int *)arg);
		break;

	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		break;

	case VHOST_USER_GET_VRING_BASE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		need_reply = 1;
		break;

	case VHOST_USER_SET_VRING_ADDR:
		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
		msg.size = sizeof(msg.payload.addr);
		break;

	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		file = arg;
		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
		msg.size = sizeof(msg.payload.u64);
		/*
		 * 0 is a valid file descriptor; only a negative value means
		 * "no fd", which is signalled to the peer with the NOFD
		 * flag.
		 */
		if (file->fd >= 0)
			fds[fd_num++] = file->fd;
		else
			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
		break;

	default:
		PMD_DRV_LOG(ERR, "trying to send unhandled msg type");
		return -1;
	}

	len = VHOST_USER_HDR_SIZE + msg.size;
	if (vhost_user_write(vhostfd, &msg, len, fds, fd_num) < 0) {
		PMD_DRV_LOG(ERR, "%s failed: %s",
			    vhost_msg_strings[req], strerror(errno));
		return -1;
	}

	if (need_reply) {
		if (vhost_user_read(vhostfd, &msg) < 0) {
			PMD_DRV_LOG(ERR, "Received msg failed: %s",
				    strerror(errno));
			return -1;
		}

		/* The reply must echo the request type that was sent. */
		if (req != msg.request) {
			PMD_DRV_LOG(ERR, "Received unexpected msg type");
			return -1;
		}

		switch (req) {
		case VHOST_USER_GET_FEATURES:
			if (msg.size != sizeof(msg.payload.u64)) {
				PMD_DRV_LOG(ERR, "Received bad msg size");
				return -1;
			}
			*((__u64 *)arg) = msg.payload.u64;
			break;

		case VHOST_USER_GET_VRING_BASE:
			if (msg.size != sizeof(msg.payload.state)) {
				PMD_DRV_LOG(ERR, "Received bad msg size");
				return -1;
			}
			memcpy(arg, &msg.payload.state,
			       sizeof(struct vhost_vring_state));
			break;

		default:
			PMD_DRV_LOG(ERR, "Received unexpected msg type");
			return -1;
		}
	}

	return 0;
}
在其中找到VHOST_USER_SET_MEM_TABLE分支,就可以看到数据的准备过程(prepare_vhost_memory_user);从这个调用函数一路深入进去,就能明白整个过程,这里就不再做介绍。
四、总结
通过上面的分析可以看出,virtio-user既可以实现虚拟机前后端的通信,也可以实现不同设备间的通信,还可以实现与内核间的通信。所以一种新的技术被提出后,会不断地推动应用向前发展;反过来,应用的发展又不断要求前者提供更好的支持。互相促进,就会形成一个新的应用场景并有可能爆发。