VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_UFO,
+ VIRTIO_NET_F_GUEST_USO4,
+ VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_HOST_ECN,
VIRTIO_NET_F_HOST_TSO4,
VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_UFO,
+ VIRTIO_NET_F_HOST_USO,
VIRTIO_NET_F_MQ,
VIRTIO_NET_F_MRG_RXBUF,
VIRTIO_NET_F_MTU,
BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
BIT_ULL(VIRTIO_NET_F_STATUS) |
BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
+ BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
+ BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
+ BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) |
BIT_ULL(VIRTIO_NET_F_MQ) |
BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
/* VHOST_F_LOG_ALL is exposed by SVQ */
BIT_ULL(VHOST_F_LOG_ALL) |
+ BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |
BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
BIT_ULL(VIRTIO_NET_F_STANDBY) |
BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX);
}
}
+/** Dummy SetSteeringEBPF to support RSS for vhost-vdpa backend */
+static bool vhost_vdpa_set_steering_ebpf(NetClientState *nc, int prog_fd)
+{
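+ /* Nothing to do here: the vdpa device performs the RSS steering itself */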
+ return true;
+}
+
static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
{
struct vhost_vdpa *v = &s->vhost_vdpa;
- add_migration_state_change_notifier(&s->migration_state);
+ migration_add_notifier(&s->migration_state,
+ vdpa_net_migration_state_notifier);
if (v->shadow_vqs_enabled) {
v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
v->iova_range.last);
return 0;
}
+static int vhost_vdpa_net_data_load(NetClientState *nc)
+{
+ VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+ struct vhost_vdpa *v = &s->vhost_vdpa;
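+ /* data virtqueues come in pairs, so an odd vq_index_end implies a CVQ */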
+ bool has_cvq = v->dev->vq_index_end % 2;
+
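+ /*
+ * If the device has a CVQ, the vrings are enabled later by
+ * vhost_vdpa_net_cvq_load() once the device state has been restored.
+ */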
+ if (has_cvq) {
+ return 0;
+ }
+
+ for (int i = 0; i < v->dev->nvqs; ++i) {
+ vhost_vdpa_set_vring_ready(v, i + v->dev->vq_index);
+ }
+ return 0;
+}
+
static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
if (s->vhost_vdpa.index == 0) {
- remove_migration_state_change_notifier(&s->migration_state);
+ migration_remove_notifier(&s->migration_state);
}
dev = s->vhost_vdpa.dev;
if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
+ } else {
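+ /* only the NetClient owning the last vq range frees the shared tree */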
+ s->vhost_vdpa.iova_tree = NULL;
}
}
.size = sizeof(VhostVDPAState),
.receive = vhost_vdpa_receive,
.start = vhost_vdpa_net_data_start,
+ .load = vhost_vdpa_net_data_load,
.stop = vhost_vdpa_net_client_stop,
.cleanup = vhost_vdpa_cleanup,
.has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
.has_ufo = vhost_vdpa_has_ufo,
.check_peer_type = vhost_vdpa_check_peer_type,
+ .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
};
static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index,
s0 = vhost_vdpa_net_first_nc_vdpa(s);
v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
- v->shadow_vqs_enabled = s->always_svq;
+ v->shadow_vqs_enabled = s0->vhost_vdpa.shadow_vqs_enabled;
s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;
if (s->vhost_vdpa.shadow_data) {
vhost_vdpa_net_client_stop(nc);
}
-static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
- size_t in_len)
+static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
+ const struct iovec *out_sg, size_t out_num,
+ const struct iovec *in_sg, size_t in_num)
{
- /* Buffers for the device */
- const struct iovec out = {
- .iov_base = s->cvq_cmd_out_buffer,
- .iov_len = out_len,
- };
- const struct iovec in = {
- .iov_base = s->status,
- .iov_len = sizeof(virtio_net_ctrl_ack),
- };
VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
int r;
- r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
+ r = vhost_svq_add(svq, out_sg, out_num, in_sg, in_num, NULL);
if (unlikely(r != 0)) {
if (unlikely(r == -ENOSPC)) {
qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
__func__);
}
- return r;
}
- /*
- * We can poll here since we've had BQL from the time we sent the
- * descriptor. Also, we need to take the answer before SVQ pulls by itself,
- * when BQL is released
- */
- return vhost_svq_poll(svq);
+ return r;
+}
+
+/*
+ * Convenience wrapper to poll SVQ for multiple control commands.
+ *
+ * The caller must hold the BQL when invoking this function, and must
+ * consume the answers before the BQL is released, since SVQ pulls the
+ * used buffers by itself once the BQL is dropped.
+ */
+static ssize_t vhost_vdpa_net_svq_poll(VhostVDPAState *s, size_t cmds_in_flight)
+{
+ VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
+ return vhost_svq_poll(svq, cmds_in_flight);
+}
+
+static void vhost_vdpa_net_load_cursor_reset(VhostVDPAState *s,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
+{
+ /* reset the cursor of the output buffer for the device */
+ out_cursor->iov_base = s->cvq_cmd_out_buffer;
+ out_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
+
+ /* reset the cursor of the in buffer for the device */
+ in_cursor->iov_base = s->status;
+ in_cursor->iov_len = vhost_vdpa_net_cvq_cmd_page_len();
+}
+
+/*
+ * Poll SVQ for multiple pending control commands and check each ack.
+ *
+ * Caller should hold the BQL when invoking this function.
+ *
+ * @s: The VhostVDPAState
+ * @len: The length of the pending status shadow buffer
+ */
+static ssize_t vhost_vdpa_net_svq_flush(VhostVDPAState *s, size_t len)
+{
+ /* the device writes a one-byte ack for each control command */
+ ssize_t dev_written = vhost_vdpa_net_svq_poll(s, len);
+ if (unlikely(dev_written != len)) {
+ return -EIO;
+ }
+
+ /* check the device's ack */
+ for (int i = 0; i < len; ++i) {
+ if (s->status[i] != VIRTIO_NET_OK) {
+ return -EIO;
+ }
+ }
+ return 0;
}
-static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
- uint8_t cmd, const void *data,
- size_t data_size)
+static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor, uint8_t class,
+ uint8_t cmd, const struct iovec *data_sg,
+ size_t data_num)
{
const struct virtio_net_ctrl_hdr ctrl = {
.class = class,
.cmd = cmd,
};
+ size_t data_size = iov_size(data_sg, data_num), cmd_size;
+ struct iovec out, in;
+ ssize_t r;
+ unsigned dummy_cursor_iov_cnt;
+ VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
+ cmd_size = sizeof(ctrl) + data_size;
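+ /* one CVQ command takes one out descriptor and one in descriptor */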
+ if (vhost_svq_available_slots(svq) < 2 ||
+ iov_size(out_cursor, 1) < cmd_size) {
+ /*
+ * It is time to flush all pending control commands if SVQ is full
+ * or control commands shadow buffers are full.
+ *
+ * We can poll here since we've had BQL from the time
+ * we sent the descriptor.
+ */
+ r = vhost_vdpa_net_svq_flush(s, in_cursor->iov_base -
+ (void *)s->status);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+
+ vhost_vdpa_net_load_cursor_reset(s, out_cursor, in_cursor);
+ }
+
+ /* pack the CVQ command header */
+ iov_from_buf(out_cursor, 1, 0, &ctrl, sizeof(ctrl));
+ /* pack the CVQ command-specific data */
+ iov_to_buf(data_sg, data_num, 0,
+ out_cursor->iov_base + sizeof(ctrl), data_size);
+
+ /* extract the required buffer from the cursor for output */
+ iov_copy(&out, 1, out_cursor, 1, 0, cmd_size);
+ /* extract the required buffer from the cursor for input */
+ iov_copy(&in, 1, in_cursor, 1, 0, sizeof(*s->status));
+
+ r = vhost_vdpa_net_cvq_add(s, &out, 1, &in, 1);
+ if (unlikely(r < 0)) {
+ return r;
+ }
- memcpy(s->cvq_cmd_out_buffer, &ctrl, sizeof(ctrl));
- memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);
+ /* iterate the cursors */
+ dummy_cursor_iov_cnt = 1;
+ iov_discard_front(&out_cursor, &dummy_cursor_iov_cnt, cmd_size);
+ dummy_cursor_iov_cnt = 1;
+ iov_discard_front(&in_cursor, &dummy_cursor_iov_cnt, sizeof(*s->status));
- return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
- sizeof(virtio_net_ctrl_ack));
+ return 0;
}
-static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
+static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
{
if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
- ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
- VIRTIO_NET_CTRL_MAC_ADDR_SET,
- n->mac, sizeof(n->mac));
- if (unlikely(dev_written < 0)) {
- return dev_written;
+ const struct iovec data = {
+ .iov_base = (void *)n->mac,
+ .iov_len = sizeof(n->mac),
+ };
+ ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_MAC,
+ VIRTIO_NET_CTRL_MAC_ADDR_SET,
+ &data, 1);
+ if (unlikely(r < 0)) {
+ return r;
}
- if (*s->status != VIRTIO_NET_OK) {
- return -EIO;
+ }
+
+ /*
+ * According to VirtIO standard, "The device MUST have an
+ * empty MAC filtering table on reset.".
+ *
+ * Therefore, there is no need to send this CVQ command if the
+ * driver also sets an empty MAC filter table, which aligns with
+ * the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
+ n->mac_table.in_use == 0) {
+ return 0;
+ }
+
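+ /*
+ * The VIRTIO_NET_CTRL_MAC_TABLE_SET command carries two tables:
+ * the unicast entries first, followed by the multicast entries.
+ */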
+ uint32_t uni_entries = n->mac_table.first_multi,
+ uni_macs_size = uni_entries * ETH_ALEN,
+ mul_entries = n->mac_table.in_use - uni_entries,
+ mul_macs_size = mul_entries * ETH_ALEN;
+ struct virtio_net_ctrl_mac uni = {
+ .entries = cpu_to_le32(uni_entries),
+ };
+ struct virtio_net_ctrl_mac mul = {
+ .entries = cpu_to_le32(mul_entries),
+ };
+ const struct iovec data[] = {
+ {
+ .iov_base = &uni,
+ .iov_len = sizeof(uni),
+ }, {
+ .iov_base = n->mac_table.macs,
+ .iov_len = uni_macs_size,
+ }, {
+ .iov_base = &mul,
+ .iov_len = sizeof(mul),
+ }, {
+ .iov_base = &n->mac_table.macs[uni_macs_size],
+ .iov_len = mul_macs_size,
+ },
+ };
+ ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_MAC,
+ VIRTIO_NET_CTRL_MAC_TABLE_SET,
+ data, ARRAY_SIZE(data));
+ if (unlikely(r < 0)) {
+ return r;
+ }
+
+ return 0;
+}
+
+static int vhost_vdpa_net_load_rss(VhostVDPAState *s, const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
+{
+ struct virtio_net_rss_config cfg = {};
+ ssize_t r;
+ g_autofree uint16_t *table = NULL;
+
+ /*
+ * According to VirtIO standard, "Initially the device has all hash
+ * types disabled and reports only VIRTIO_NET_HASH_REPORT_NONE.".
+ *
+ * Therefore, there is no need to send this CVQ command if the
+ * driver disables all hash types, which aligns with
+ * the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (!n->rss_data.enabled ||
+ n->rss_data.hash_types == VIRTIO_NET_HASH_REPORT_NONE) {
+ return 0;
+ }
+
+ table = g_malloc_n(n->rss_data.indirections_len,
+ sizeof(n->rss_data.indirections_table[0]));
+ cfg.hash_types = cpu_to_le32(n->rss_data.hash_types);
+
+ /*
+ * According to VirtIO standard, "Field reserved MUST contain zeroes.
+ * It is defined to make the structure to match the layout of
+ * virtio_net_rss_config structure, defined in 5.1.6.5.7.".
+ *
+ * Therefore, we need to zero the fields in
+ * struct virtio_net_rss_config that correspond to the
+ * `reserved` field in struct virtio_net_hash_config.
+ *
+ * Note that all other fields are zeroed at their definitions,
+ * except for the `indirection_table` field, where the actual data
+ * is stored in the `table` variable to ensure compatibility
+ * with the RSS case. Therefore, we need to zero `table` here.
+ */
+ table[0] = 0;
+
+ /*
+ * virtio_net_handle_rss() currently does not restore the hash key
+ * length parsed from the guest's CVQ command into n->rss_data, and
+ * the rest of the code uses the maximum key length, so we also
+ * employ the maximum key length here.
+ */
+ cfg.hash_key_length = sizeof(n->rss_data.key);
+
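+ /*
+ * Split the command-specific data into iovecs that mirror the layout
+ * of struct virtio_net_rss_config: the fields before the indirection
+ * table, the table itself, max_tx_vq up to the key, and the hash key.
+ */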
+ const struct iovec data[] = {
+ {
+ .iov_base = &cfg,
+ .iov_len = offsetof(struct virtio_net_rss_config,
+ indirection_table),
+ }, {
+ .iov_base = table,
+ .iov_len = n->rss_data.indirections_len *
+ sizeof(n->rss_data.indirections_table[0]),
+ }, {
+ .iov_base = &cfg.max_tx_vq,
+ .iov_len = offsetof(struct virtio_net_rss_config, hash_key_data) -
+ offsetof(struct virtio_net_rss_config, max_tx_vq),
+ }, {
+ .iov_base = (void *)n->rss_data.key,
+ .iov_len = sizeof(n->rss_data.key),
}
+ };
+
+ r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_MQ,
+ VIRTIO_NET_CTRL_MQ_HASH_CONFIG,
+ data, ARRAY_SIZE(data));
+ if (unlikely(r < 0)) {
+ return r;
}
return 0;
}
static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
- const VirtIONet *n)
+ const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
{
struct virtio_net_ctrl_mq mq;
- ssize_t dev_written;
+ ssize_t r;
if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_MQ)) {
return 0;
}
mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
- dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
- VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &mq,
- sizeof(mq));
- if (unlikely(dev_written < 0)) {
- return dev_written;
+ const struct iovec data = {
+ .iov_base = &mq,
+ .iov_len = sizeof(mq),
+ };
+ r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_MQ,
+ VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
+ &data, 1);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+
+ if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_HASH_REPORT)) {
+ return 0;
}
- return *s->status != VIRTIO_NET_OK;
+ r = vhost_vdpa_net_load_rss(s, n, out_cursor, in_cursor);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+
+ return 0;
}
static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
- const VirtIONet *n)
+ const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
{
uint64_t offloads;
- ssize_t dev_written;
+ ssize_t r;
if (!virtio_vdev_has_feature(&n->parent_obj,
VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
}
offloads = cpu_to_le64(n->curr_guest_offloads);
- dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
- VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
- &offloads, sizeof(offloads));
- if (unlikely(dev_written < 0)) {
- return dev_written;
+ const struct iovec data = {
+ .iov_base = &offloads,
+ .iov_len = sizeof(offloads),
+ };
+ r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_GUEST_OFFLOADS,
+ VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
+ &data, 1);
+ if (unlikely(r < 0)) {
+ return r;
}
- return *s->status != VIRTIO_NET_OK;
+ return 0;
}
-static int vhost_vdpa_net_load(NetClientState *nc)
+static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor,
+ uint8_t cmd,
+ uint8_t on)
{
- VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
- struct vhost_vdpa *v = &s->vhost_vdpa;
- const VirtIONet *n;
- int r;
+ const struct iovec data = {
+ .iov_base = &on,
+ .iov_len = sizeof(on),
+ };
+ ssize_t r;
- assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+ r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX, cmd, &data, 1);
+ if (unlikely(r < 0)) {
+ return r;
+ }
- if (!v->shadow_vqs_enabled) {
+ return 0;
+}
+
+static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
+ const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
+{
+ ssize_t r;
+
+ if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
return 0;
}
- n = VIRTIO_NET(v->dev->vdev);
- r = vhost_vdpa_net_load_mac(s, n);
+ /*
+ * According to virtio_net_reset(), device turns promiscuous mode
+ * on by default.
+ *
+ * Additionally, according to VirtIO standard, "Since there are
+ * no guarantees, it can use a hash filter or silently switch to
+ * allmulti or promiscuous mode if it is given too many addresses.".
+ * QEMU marks `n->mac_table.uni_overflow` if guest sets too many
+ * non-multicast MAC addresses, indicating that promiscuous mode
+ * should be enabled.
+ *
+ * Therefore, QEMU should only send this CVQ command if the
+ * `n->mac_table.uni_overflow` is not marked and `n->promisc` is off,
+ * which sets promiscuous mode off, different from the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (!n->mac_table.uni_overflow && !n->promisc) {
+ r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX_PROMISC, 0);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+ }
+
+ /*
+ * According to virtio_net_reset(), device turns all-multicast mode
+ * off by default.
+ *
+ * According to VirtIO standard, "Since there are no guarantees,
+ * it can use a hash filter or silently switch to allmulti or
+ * promiscuous mode if it is given too many addresses.". QEMU marks
+ * `n->mac_table.multi_overflow` if guest sets too many
+ * multicast MAC addresses.
+ *
+ * Therefore, QEMU should only send this CVQ command if the
+ * `n->mac_table.multi_overflow` is marked or `n->allmulti` is on,
+ * which sets all-multicast mode on, different from the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (n->mac_table.multi_overflow || n->allmulti) {
+ r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX_ALLMULTI, 1);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+ }
+
+ if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX_EXTRA)) {
+ return 0;
+ }
+
+ /*
+ * According to virtio_net_reset(), device turns all-unicast mode
+ * off by default.
+ *
+ * Therefore, QEMU should only send this CVQ command if the driver
+ * sets all-unicast mode on, different from the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (n->alluni) {
+ r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX_ALLUNI, 1);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ /*
+ * According to virtio_net_reset(), device turns non-multicast mode
+ * off by default.
+ *
+ * Therefore, QEMU should only send this CVQ command if the driver
+ * sets non-multicast mode on, different from the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (n->nomulti) {
+ r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX_NOMULTI, 1);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ /*
+ * According to virtio_net_reset(), device turns non-unicast mode
+ * off by default.
+ *
+ * Therefore, QEMU should only send this CVQ command if the driver
+ * sets non-unicast mode on, different from the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (n->nouni) {
+ r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX_NOUNI, 1);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ /*
+ * According to virtio_net_reset(), device turns non-broadcast mode
+ * off by default.
+ *
+ * Therefore, QEMU should only send this CVQ command if the driver
+ * sets non-broadcast mode on, different from the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+ if (n->nobcast) {
+ r = vhost_vdpa_net_load_rx_mode(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_RX_NOBCAST, 1);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
+ const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor,
+ uint16_t vid)
+{
+ const struct iovec data = {
+ .iov_base = &vid,
+ .iov_len = sizeof(vid),
+ };
+ ssize_t r = vhost_vdpa_net_load_cmd(s, out_cursor, in_cursor,
+ VIRTIO_NET_CTRL_VLAN,
+ VIRTIO_NET_CTRL_VLAN_ADD,
+ &data, 1);
if (unlikely(r < 0)) {
return r;
}
- r = vhost_vdpa_net_load_mq(s, n);
- if (unlikely(r)) {
- return r;
+
+ return 0;
+}
+
+static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
+ const VirtIONet *n,
+ struct iovec *out_cursor,
+ struct iovec *in_cursor)
+{
+ int r;
+
+ if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
+ return 0;
}
- r = vhost_vdpa_net_load_offloads(s, n);
- if (unlikely(r)) {
- return r;
+
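+ /* n->vlans is a bitmap of 32-bit words: vid = (word << 5) + bit */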
+ for (int i = 0; i < MAX_VLAN >> 5; i++) {
+ for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
+ if (n->vlans[i] & (1U << j)) {
+ r = vhost_vdpa_net_load_single_vlan(s, n, out_cursor,
+ in_cursor, (i << 5) + j);
+ if (unlikely(r != 0)) {
+ return r;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int vhost_vdpa_net_cvq_load(NetClientState *nc)
+{
+ VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+ struct vhost_vdpa *v = &s->vhost_vdpa;
+ const VirtIONet *n;
+ int r;
+ struct iovec out_cursor, in_cursor;
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+
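+ /* for the CVQ NetClient, vq_index points at the control virtqueue */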
+ vhost_vdpa_set_vring_ready(v, v->dev->vq_index);
+
+ if (v->shadow_vqs_enabled) {
+ n = VIRTIO_NET(v->dev->vdev);
+ vhost_vdpa_net_load_cursor_reset(s, &out_cursor, &in_cursor);
+ r = vhost_vdpa_net_load_mac(s, n, &out_cursor, &in_cursor);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+ r = vhost_vdpa_net_load_mq(s, n, &out_cursor, &in_cursor);
+ if (unlikely(r)) {
+ return r;
+ }
+ r = vhost_vdpa_net_load_offloads(s, n, &out_cursor, &in_cursor);
+ if (unlikely(r)) {
+ return r;
+ }
+ r = vhost_vdpa_net_load_rx(s, n, &out_cursor, &in_cursor);
+ if (unlikely(r)) {
+ return r;
+ }
+ r = vhost_vdpa_net_load_vlan(s, n, &out_cursor, &in_cursor);
+ if (unlikely(r)) {
+ return r;
+ }
+
+ /*
+ * We need to poll and check all pending device's used buffers.
+ *
+ * We can poll here since we've had BQL from the time
+ * we sent the descriptor.
+ */
+ r = vhost_vdpa_net_svq_flush(s, in_cursor.iov_base - (void *)s->status);
+ if (unlikely(r)) {
+ return r;
+ }
+ }
+
+ for (int i = 0; i < v->dev->vq_index; ++i) {
+ vhost_vdpa_set_vring_ready(v, i);
}
return 0;
.size = sizeof(VhostVDPAState),
.receive = vhost_vdpa_receive,
.start = vhost_vdpa_net_cvq_start,
- .load = vhost_vdpa_net_load,
+ .load = vhost_vdpa_net_cvq_load,
.stop = vhost_vdpa_net_cvq_stop,
.cleanup = vhost_vdpa_cleanup,
.has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
.has_ufo = vhost_vdpa_has_ufo,
.check_peer_type = vhost_vdpa_check_peer_type,
+ .set_steering_ebpf = vhost_vdpa_set_steering_ebpf,
};
+/*
+ * Forward the excessive VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command to
+ * vdpa device.
+ *
+ * Considering that QEMU cannot send the entire filter table to the
+ * vdpa device, it should send the VIRTIO_NET_CTRL_RX_PROMISC CVQ
+ * command to enable promiscuous mode to receive all packets,
+ * according to VirtIO standard, "Since there are no guarantees,
+ * it can use a hash filter or silently switch to allmulti or
+ * promiscuous mode if it is given too many addresses.".
+ *
+ * Since QEMU ignores MAC addresses beyond `MAC_TABLE_ENTRIES` and
+ * marks `n->mac_table.x_overflow` accordingly, receiving
+ * (`MAC_TABLE_ENTRIES` + 1) or more non-multicast MAC addresses has
+ * the same effect on the device model.
+ * The same applies to multicast MAC addresses.
+ *
+ * Therefore, QEMU can provide the device model with a fake
+ * VIRTIO_NET_CTRL_MAC_TABLE_SET command with (`MAC_TABLE_ENTRIES` + 1)
+ * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1) multicast
+ * MAC addresses. This ensures that the device model marks
+ * `n->mac_table.uni_overflow` and `n->mac_table.multi_overflow`,
+ * allowing all packets to be received, which aligns with the
+ * state of the vdpa device.
+ */
+static int vhost_vdpa_net_excessive_mac_filter_cvq_add(VhostVDPAState *s,
+ VirtQueueElement *elem,
+ struct iovec *out,
+ const struct iovec *in)
+{
+ struct virtio_net_ctrl_mac mac_data, *mac_ptr;
+ struct virtio_net_ctrl_hdr *hdr_ptr;
+ uint32_t cursor;
+ ssize_t r;
+ uint8_t on = 1;
+
+ /* parse the non-multicast MAC address entries from CVQ command */
+ cursor = sizeof(*hdr_ptr);
+ r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
+ &mac_data, sizeof(mac_data));
+ if (unlikely(r != sizeof(mac_data))) {
+ /*
+ * If the CVQ command is invalid, simulate the vdpa device
+ * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
+ */
+ *s->status = VIRTIO_NET_ERR;
+ return sizeof(*s->status);
+ }
+ cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
+
+ /* parse the multicast MAC address entries from CVQ command */
+ r = iov_to_buf(elem->out_sg, elem->out_num, cursor,
+ &mac_data, sizeof(mac_data));
+ if (unlikely(r != sizeof(mac_data))) {
+ /*
+ * If the CVQ command is invalid, simulate the vdpa device
+ * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
+ */
+ *s->status = VIRTIO_NET_ERR;
+ return sizeof(*s->status);
+ }
+ cursor += sizeof(mac_data) + le32_to_cpu(mac_data.entries) * ETH_ALEN;
+
+ /* validate the CVQ command */
+ if (iov_size(elem->out_sg, elem->out_num) != cursor) {
+ /*
+ * If the CVQ command is invalid, simulate the vdpa device
+ * rejecting the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
+ */
+ *s->status = VIRTIO_NET_ERR;
+ return sizeof(*s->status);
+ }
+
+ /*
+ * According to VirtIO standard, "Since there are no guarantees,
+ * it can use a hash filter or silently switch to allmulti or
+ * promiscuous mode if it is given too many addresses.".
+ *
+ * Therefore, considering that QEMU is unable to send the entire
+ * filter table to the vdpa device, it should send the
+ * VIRTIO_NET_CTRL_RX_PROMISC CVQ command to enable promiscuous mode
+ */
+ hdr_ptr = out->iov_base;
+ out->iov_len = sizeof(*hdr_ptr) + sizeof(on);
+
+ hdr_ptr->class = VIRTIO_NET_CTRL_RX;
+ hdr_ptr->cmd = VIRTIO_NET_CTRL_RX_PROMISC;
+ iov_from_buf(out, 1, sizeof(*hdr_ptr), &on, sizeof(on));
+ r = vhost_vdpa_net_cvq_add(s, out, 1, in, 1);
+ if (unlikely(r < 0)) {
+ return r;
+ }
+
+ /*
+ * We can poll here since we've had BQL from the time
+ * we sent the descriptor.
+ */
+ r = vhost_vdpa_net_svq_poll(s, 1);
+ if (unlikely(r < sizeof(*s->status))) {
+ return r;
+ }
+ if (*s->status != VIRTIO_NET_OK) {
+ return sizeof(*s->status);
+ }
+
+ /*
+ * QEMU should also send a fake VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ
+ * command to the device model, including (`MAC_TABLE_ENTRIES` + 1)
+ * non-multicast MAC addresses and (`MAC_TABLE_ENTRIES` + 1)
+ * multicast MAC addresses.
+ *
+ * By doing so, the device model can mark `n->mac_table.uni_overflow`
+ * and `n->mac_table.multi_overflow`, enabling all packets to be
+ * received, which aligns with the state of the vdpa device.
+ */
+ cursor = 0;
+ uint32_t fake_uni_entries = MAC_TABLE_ENTRIES + 1,
+ fake_mul_entries = MAC_TABLE_ENTRIES + 1,
+ fake_cvq_size = sizeof(struct virtio_net_ctrl_hdr) +
+ sizeof(mac_data) + fake_uni_entries * ETH_ALEN +
+ sizeof(mac_data) + fake_mul_entries * ETH_ALEN;
+
+ assert(fake_cvq_size < vhost_vdpa_net_cvq_cmd_page_len());
+ out->iov_len = fake_cvq_size;
+
+ /* pack the header for fake CVQ command */
+ hdr_ptr = out->iov_base + cursor;
+ hdr_ptr->class = VIRTIO_NET_CTRL_MAC;
+ hdr_ptr->cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
+ cursor += sizeof(*hdr_ptr);
+
+ /*
+ * Pack the non-multicast MAC addresses part for fake CVQ command.
+ *
+ * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
+ * addresses provided in the CVQ command. Therefore, only the entries
+ * field needs to be prepared in the CVQ command.
+ */
+ mac_ptr = out->iov_base + cursor;
+ mac_ptr->entries = cpu_to_le32(fake_uni_entries);
+ cursor += sizeof(*mac_ptr) + fake_uni_entries * ETH_ALEN;
+
+ /*
+ * Pack the multicast MAC addresses part for fake CVQ command.
+ *
+ * According to virtio_net_handle_mac(), QEMU doesn't verify the MAC
+ * addresses provided in the CVQ command. Therefore, only the entries
+ * field needs to be prepared in the CVQ command.
+ */
+ mac_ptr = out->iov_base + cursor;
+ mac_ptr->entries = cpu_to_le32(fake_mul_entries);
+
+ /*
+ * Simulate QEMU polling a used buffer from the vdpa device
+ * for the VIRTIO_NET_CTRL_MAC_TABLE_SET CVQ command
+ */
+ return sizeof(*s->status);
+}
+
/**
* Validate and copy control virtqueue commands.
*
{
VhostVDPAState *s = opaque;
size_t in_len;
+ const struct virtio_net_ctrl_hdr *ctrl;
virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
/* Out buffer sent to both the vdpa device and the device model */
struct iovec out = {
.iov_base = s->cvq_cmd_out_buffer,
};
/* in buffer used for device model */
- const struct iovec in = {
+ const struct iovec model_in = {
.iov_base = &status,
.iov_len = sizeof(status),
};
+ /* in buffer used for vdpa device */
+ const struct iovec vdpa_in = {
+ .iov_base = s->status,
+ .iov_len = sizeof(*s->status),
+ };
ssize_t dev_written = -EINVAL;
out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
s->cvq_cmd_out_buffer,
- vhost_vdpa_net_cvq_cmd_len());
- if (*(uint8_t *)s->cvq_cmd_out_buffer == VIRTIO_NET_CTRL_ANNOUNCE) {
+ vhost_vdpa_net_cvq_cmd_page_len());
+
+ ctrl = s->cvq_cmd_out_buffer;
+ if (ctrl->class == VIRTIO_NET_CTRL_ANNOUNCE) {
/*
* Guest announce capability is emulated by qemu, so don't forward to
* the device.
*/
dev_written = sizeof(status);
*s->status = VIRTIO_NET_OK;
- } else {
- dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status));
+ } else if (unlikely(ctrl->class == VIRTIO_NET_CTRL_MAC &&
+ ctrl->cmd == VIRTIO_NET_CTRL_MAC_TABLE_SET &&
+ iov_size(elem->out_sg, elem->out_num) > out.iov_len)) {
+ /*
+ * Due to the size limitation of the out buffer sent to the vdpa device,
+ * which is determined by vhost_vdpa_net_cvq_cmd_page_len(), excessive
+ * MAC addresses set by the driver for the filter table can cause
+ * truncation of the CVQ command in QEMU. As a result, the vdpa device
+ * rejects the flawed CVQ command.
+ *
+ * Therefore, QEMU must handle this situation instead of sending
+ * the CVQ command directly.
+ */
+ dev_written = vhost_vdpa_net_excessive_mac_filter_cvq_add(s, elem,
+ &out, &vdpa_in);
if (unlikely(dev_written < 0)) {
goto out;
}
+ } else {
+ ssize_t r;
+ r = vhost_vdpa_net_cvq_add(s, &out, 1, &vdpa_in, 1);
+ if (unlikely(r < 0)) {
+ dev_written = r;
+ goto out;
+ }
+
+ /*
+ * We can poll here since we've had BQL from the time
+ * we sent the descriptor.
+ */
+ dev_written = vhost_vdpa_net_svq_poll(s, 1);
}
if (unlikely(dev_written < sizeof(status))) {
}
status = VIRTIO_NET_ERR;
- virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1);
+ virtio_net_handle_ctrl_iov(svq->vdev, &model_in, 1, &out, 1);
if (status != VIRTIO_NET_OK) {
error_report("Bad CVQ processing in model");
}
error_report("Bad device CVQ written length");
}
vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
- g_free(elem);
+ /*
+ * `elem` belongs to vhost_vdpa_net_handle_ctrl_avail() only when
+ * the function successfully forwards the CVQ command, indicated
+ * by a non-negative value of `dev_written`. Otherwise, it still
+ * belongs to SVQ.
+ * This function should only free `elem` when it owns it.
+ */
+ if (dev_written >= 0) {
+ g_free(elem);
+ }
return dev_written < 0 ? dev_written : 0;
}
uint64_t backend_features;
int64_t cvq_group;
uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
- VIRTIO_CONFIG_S_DRIVER |
- VIRTIO_CONFIG_S_FEATURES_OK;
+ VIRTIO_CONFIG_S_DRIVER;
int r;
ERRP_GUARD();
return 0;
}
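+ /*
+ * Per the VirtIO spec, ACKNOWLEDGE and DRIVER must be set before
+ * feature negotiation, and FEATURES_OK only afterwards.
+ */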
+ r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
+ if (unlikely(r)) {
+ error_setg_errno(errp, -r, "Cannot set device status");
+ goto out;
+ }
+
r = ioctl(device_fd, VHOST_SET_FEATURES, &features);
if (unlikely(r)) {
- error_setg_errno(errp, errno, "Cannot set features");
+ error_setg_errno(errp, -r, "Cannot set features");
+ goto out;
}
+ status |= VIRTIO_CONFIG_S_FEATURES_OK;
r = ioctl(device_fd, VHOST_VDPA_SET_STATUS, &status);
if (unlikely(r)) {
- error_setg_errno(errp, -r, "Cannot set device features");
+ error_setg_errno(errp, -r, "Cannot set device status");
goto out;
}
VhostVDPAState *s;
int ret = 0;
assert(name);
- int cvq_isolated;
+ int cvq_isolated = 0;
if (is_datapath) {
nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
s->vhost_vdpa.device_fd = vdpa_device_fd;
s->vhost_vdpa.index = queue_pair_index;
s->always_svq = svq;
- s->migration_state.notify = vdpa_net_migration_state_notifier;
+ s->migration_state.notify = NULL;
s->vhost_vdpa.shadow_vqs_enabled = svq;
s->vhost_vdpa.iova_range = iova_range;
s->vhost_vdpa.shadow_data = svq;
s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
s->vhost_vdpa.shadow_vq_ops_opaque = s;
s->cvq_isolated = cvq_isolated;
-
- /*
- * TODO: We cannot migrate devices with CVQ and no x-svq enabled as
- * there is no way to set the device state (MAC, MQ, etc) before
- * starting the datapath.
- *
- * Migration blocker ownership now belongs to s->vhost_vdpa.
- */
- if (!svq) {
- error_setg(&s->vhost_vdpa.migration_blocker,
- "net vdpa cannot migrate with CVQ feature");
- }
}
ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
if (ret) {