/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"
#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
    VM2VM_DISABLED = 0,
    VM2VM_SOFTWARE = 1,
    VM2VM_HARDWARE = 2,
    VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;
/* empty VMDQ configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * VLAN strip is necessary for 1G NICs such as the I350;
         * without it, IPv4 forwarding in the guest cannot forward
         * packets from one virtio device to another.
         */
        .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
    },

    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .rx_adv_conf = {
        /*
         * should be overridden separately in code with
         * appropriate values
         */
        .vmdq_rx_conf = {
            .nb_queue_pools = ETH_8_POOLS,
            .enable_default_pool = 0,
            .default_pool = 0,
            .nb_pool_maps = 0,
            .pool_map = {{0, 0},},
        },
    },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
    1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
    1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
    1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
    1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
    1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
    1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
    TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
    unsigned len;
    unsigned txq_id;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
                              / US_PER_S * BURST_TX_DRAIN_US)
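/*
 * Worked example (added for clarity; the numbers are illustrative, not
 * from the original source): with a 2.4 GHz TSC,
 * (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S rounds up to 2400 cycles
 * per microsecond, so MBUF_TABLE_DRAIN_TSC is 2400 * 100 = 240000 cycles,
 * i.e. a partially filled TX table is flushed roughly every 100us.
 */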
#define VLAN_HLEN 4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    struct rte_eth_vmdq_rx_conf *def_conf =
        &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back = def_conf->enable_loop_back;
    conf.rx_mode = def_conf->rx_mode;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
        sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
    return 0;
}
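/*
 * Illustrative example (values derived from the loop above, not extra
 * configuration): with num_devices = 8, get_eth_conf() produces
 *
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x02 }
 *     ...
 *     pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }
 *
 * i.e. each VMDQ pool accepts exactly one VLAN tag from vlan_tags[].
 */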

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
    if (num_devices > max_nb_devices) {
        RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
        return -1;
    }
    return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
    struct rte_eth_dev_info dev_info;
    struct rte_eth_conf port_conf;
    struct rte_eth_rxconf *rxconf;
    struct rte_eth_txconf *txconf;
    int16_t rx_rings, tx_rings;
    uint16_t rx_ring_size, tx_ring_size;
    int retval;
    uint16_t q;

    /*
     * The max pool number from dev_info will be used to validate the pool
     * number specified on the command line.
     */
    rte_eth_dev_info_get(port, &dev_info);

    if (dev_info.max_rx_queues > MAX_QUEUES) {
        rte_exit(EXIT_FAILURE,
            "please define MAX_QUEUES no less than %u in %s\n",
            dev_info.max_rx_queues, __FILE__);
    }

    rxconf = &dev_info.default_rxconf;
    txconf = &dev_info.default_txconf;
    rxconf->rx_drop_en = 1;

    /* Enable vlan offload */
    txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

    /* Configure the number of supported virtio devices based on VMDQ limits */
    num_devices = dev_info.max_vmdq_pools;

    rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
    tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

    /*
     * When dequeue zero copy is enabled, the guest Tx used vring will be
     * updated only when the corresponding mbuf is freed. Thus, the
     * nb_tx_desc (tx_ring_size here) must be small enough so that the
     * driver will hit the free threshold easily and free mbufs in a
     * timely manner. Otherwise, the guest Tx vring would be starved.
     */
    if (dequeue_zero_copy)
        tx_ring_size = 64;

    tx_rings = (uint16_t)rte_lcore_count();

    retval = validate_num_devices(MAX_DEVICES);
    if (retval < 0)
        return retval;

    /* Get port configuration. */
    retval = get_eth_conf(&port_conf, num_devices);
    if (retval < 0)
        return retval;
    /* NIC queues are divided into pf queues and vmdq queues. */
    num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
    queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
    num_vmdq_queues = num_devices * queues_per_pool;
    num_queues = num_pf_queues + num_vmdq_queues;
    vmdq_queue_base = dev_info.vmdq_queue_base;
    vmdq_pool_base = dev_info.vmdq_pool_base;
    printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
        num_pf_queues, num_devices, queues_per_pool);
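    /*
     * Illustrative example (the device numbers are assumptions, not read
     * from a real NIC): if dev_info reported max_rx_queues = 128,
     * vmdq_queue_num = 128 and max_vmdq_pools = 64, then num_pf_queues
     * would be 0, queues_per_pool would be 2, and each of the 64 virtio
     * devices would own a two-queue VMDQ pool behind the physical port.
     */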

    if (port >= rte_eth_dev_count()) return -1;

    if (enable_tx_csum == 0)
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

    if (enable_tso == 0) {
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO4);
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO6);
    }

    rx_rings = (uint16_t)dev_info.max_rx_queues;
    /* Configure ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0) {
        RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
            port, strerror(-retval));
        return retval;
    }

    /* Setup the queues. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                        rte_eth_dev_socket_id(port),
                                        rxconf,
                                        mbuf_pool);
        if (retval < 0) {
            RTE_LOG(ERR, VHOST_PORT,
                "Failed to setup rx queue %u of port %u: %s.\n",
                q, port, strerror(-retval));
            return retval;
        }
    }
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                        rte_eth_dev_socket_id(port),
                                        txconf);
        if (retval < 0) {
            RTE_LOG(ERR, VHOST_PORT,
                "Failed to setup tx queue %u of port %u: %s.\n",
                q, port, strerror(-retval));
            return retval;
        }
    }

    /* Start the device. */
    retval = rte_eth_dev_start(port);
    if (retval < 0) {
        RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
            port, strerror(-retval));
        return retval;
    }

    if (promiscuous)
        rte_eth_promiscuous_enable(port);

    rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
    RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
    RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
        (unsigned)port,
        vmdq_ports_eth_addr[port].addr_bytes[0],
        vmdq_ports_eth_addr[port].addr_bytes[1],
        vmdq_ports_eth_addr[port].addr_bytes[2],
        vmdq_ports_eth_addr[port].addr_bytes[3],
        vmdq_ports_eth_addr[port].addr_bytes[4],
        vmdq_ports_eth_addr[port].addr_bytes[5]);

    return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
    /* parse socket path string */
    if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
        return -1;

    socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
    if (socket_files == NULL)
        return -1;
    snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
    nb_sockets++;

    return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
    char *end = NULL;
    unsigned long pm;

    errno = 0;

    /* parse hexadecimal string */
    pm = strtoul(portmask, &end, 16);
    if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (pm == 0)
        return -1;

    return pm;
}
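/*
 * Usage note (example values, added for clarity): the mask is parsed as
 * hex, so "-p 0x1" selects port 0 only and "-p 0x3" selects ports 0 and 1;
 * a mask of 0 is rejected since at least one port must be enabled.
 */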

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
    char *end = NULL;
    unsigned long num;

    errno = 0;

    /* parse unsigned int string */
    num = strtoul(q_arg, &end, 10);
    if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (num > max_valid_value)
        return -1;

    return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
    RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
    "    --vm2vm [0|1|2]\n"
    "    --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
    "    --socket-file <path>\n"
    "    --nb-devices ND\n"
    "    -p PORTMASK: Set mask for ports to be used by application\n"
    "    --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
    "    --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
    "    --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
    "    --rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
    "    --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
    "    --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
    "    --socket-file: The path of the socket file.\n"
    "    --tx-csum [0|1] disable/enable TX checksum offload.\n"
    "    --tso [0|1] disable/enable TCP segmentation offload.\n"
    "    --client register a vhost-user socket as client mode.\n"
    "    --dequeue-zero-copy enables dequeue zero copy\n",
           prgname);
}

/*
 * Parse the arguments given on the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
    int opt, ret;
    int option_index;
    unsigned i;
    const char *prgname = argv[0];
    static struct option long_option[] = {
        {"vm2vm", required_argument, NULL, 0},
        {"rx-retry", required_argument, NULL, 0},
        {"rx-retry-delay", required_argument, NULL, 0},
        {"rx-retry-num", required_argument, NULL, 0},
        {"mergeable", required_argument, NULL, 0},
        {"stats", required_argument, NULL, 0},
        {"socket-file", required_argument, NULL, 0},
        {"tx-csum", required_argument, NULL, 0},
        {"tso", required_argument, NULL, 0},
        {"client", no_argument, &client_mode, 1},
        {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
        {NULL, 0, 0, 0},
    };

    /* Parse command line */
    while ((opt = getopt_long(argc, argv, "p:P",
            long_option, &option_index)) != EOF) {
        switch (opt) {
        /* Portmask */
        case 'p':
            ret = parse_portmask(optarg);
            if (ret == -1) {
                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                us_vhost_usage(prgname);
                return -1;
            }
            enabled_port_mask = ret;
            break;

        case 'P':
            promiscuous = 1;
            vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                ETH_VMDQ_ACCEPT_BROADCAST |
                ETH_VMDQ_ACCEPT_MULTICAST;
            rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

            break;

        case 0:
            /* Enable/disable vm2vm comms. */
            if (!strncmp(long_option[option_index].name, "vm2vm",
                MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for "
                        "vm2vm [0|1|2]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    vm2vm_mode = (vm2vm_type)ret;
                }
            }

            /* Enable/disable retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_retry = ret;
                }
            }

            /* Enable/disable TX checksum offload. */
            if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    enable_tx_csum = ret;
            }

            /* Enable/disable TSO offload. */
            if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    enable_tso = ret;
            }

            /* Specify the retry delay time (in microseconds) on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_delay_time = ret;
                }
            }

            /* Specify the number of retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_retry_num = ret;
                }
            }

            /* Enable/disable RX mergeable buffers. */
            if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    mergeable = !!ret;
                    if (ret) {
                        vmdq_conf_default.rxmode.jumbo_frame = 1;
                        vmdq_conf_default.rxmode.max_rx_pkt_len
                            = JUMBO_FRAME_MAX_SIZE;
                    }
                }
            }

            /* Enable/disable stats. */
            if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for stats [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_stats = ret;
                }
            }

            /* Set socket file path. */
            if (!strncmp(long_option[option_index].name,
                    "socket-file", MAX_LONG_OPT_SZ)) {
                if (us_vhost_parse_socket_path(optarg) == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for socket name (Max %d characters)\n",
                        PATH_MAX);
                    us_vhost_usage(prgname);
                    return -1;
                }
            }

            break;

        /* Invalid option - print options. */
        default:
            us_vhost_usage(prgname);
            return -1;
        }
    }

    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
        if (enabled_port_mask & (1 << i))
            ports[num_ports++] = (uint8_t)i;
    }

    if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
            "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
        return -1;
    }

    return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number
 * of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
    unsigned valid_num_ports = num_ports;
    unsigned portid;

    if (num_ports > nb_ports) {
        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
            num_ports, nb_ports);
        num_ports = nb_ports;
    }

    for (portid = 0; portid < num_ports; portid++) {
        if (ports[portid] >= nb_ports) {
            RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                ports[portid], (nb_ports - 1));
            ports[portid] = INVALID_PORT_ID;
            valid_num_ports--;
        }
    }
    return valid_num_ports;
}

static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
    struct vhost_dev *vdev;

    TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
        if (vdev->ready == DEVICE_RX &&
            is_same_ether_addr(mac, &vdev->mac_address))
            return vdev;
    }

    return NULL;
}

/*
 * This function learns the MAC address of the device and registers it along
 * with a VLAN tag in the VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
    struct ether_hdr *pkt_hdr;
    int i, ret;

    /* Learn MAC address of guest device from packet */
    pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    if (find_vhost_dev(&pkt_hdr->s_addr)) {
        RTE_LOG(ERR, VHOST_DATA,
            "(%d) device is using a registered MAC!\n",
            vdev->vid);
        return -1;
    }

    for (i = 0; i < ETHER_ADDR_LEN; i++)
        vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

    /* vlan_tag currently uses the device_id. */
    vdev->vlan_tag = vlan_tags[vdev->vid];

    /* Print out VMDQ registration info. */
    RTE_LOG(INFO, VHOST_DATA,
        "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
        vdev->vid,
        vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
        vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
        vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
        vdev->vlan_tag);

    /* Register the MAC address. */
    ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                   (uint32_t)vdev->vid + vmdq_pool_base);
    if (ret)
        RTE_LOG(ERR, VHOST_DATA,
            "(%d) failed to add device MAC address to VMDQ\n",
            vdev->vid);

    rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

    /* Set device as ready for RX. */
    vdev->ready = DEVICE_RX;

    return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
    unsigned i = 0;
    unsigned rx_count;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

    if (vdev->ready == DEVICE_RX) {
        /* clear MAC and VLAN settings */
        rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
        for (i = 0; i < 6; i++)
            vdev->mac_address.addr_bytes[i] = 0;

        vdev->vlan_tag = 0;

        /* Clear out the receive buffers */
        rx_count = rte_eth_rx_burst(ports[0],
            (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

        while (rx_count) {
            for (i = 0; i < rx_count; i++)
                rte_pktmbuf_free(pkts_burst[i]);

            rx_count = rte_eth_rx_burst(ports[0],
                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
        }

        vdev->ready = DEVICE_MAC_LEARNING;
    }
}

static inline void __attribute__((always_inline))
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
            struct rte_mbuf *m)
{
    uint16_t ret;

    ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
    if (enable_stats) {
        rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
        rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
        src_vdev->stats.tx_total++;
        src_vdev->stats.tx += ret;
    }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
    struct ether_hdr *pkt_hdr;
    struct vhost_dev *dst_vdev;

    pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
    if (!dst_vdev)
        return -1;

    if (vdev->vid == dst_vdev->vid) {
        RTE_LOG(DEBUG, VHOST_DATA,
            "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
            vdev->vid);
        return 0;
    }

    RTE_LOG(DEBUG, VHOST_DATA,
        "(%d) TX: MAC address is local\n", dst_vdev->vid);

    if (unlikely(dst_vdev->remove)) {
        RTE_LOG(DEBUG, VHOST_DATA,
            "(%d) device is marked for removal\n", dst_vdev->vid);
        return 0;
    }

    virtio_xmit(dst_vdev, vdev, m);
    return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its vlan tag and the offset needed to restore the packet length.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
                uint32_t *offset, uint16_t *vlan_tag)
{
    struct vhost_dev *dst_vdev;
    struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
    if (!dst_vdev)
        return 0;

    if (vdev->vid == dst_vdev->vid) {
        RTE_LOG(DEBUG, VHOST_DATA,
            "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
            vdev->vid);
        return -1;
    }

    /*
     * HW vlan strip will reduce the packet length by the size of the
     * vlan tag, so we need to restore the packet length by adding it
     * back.
     */
    *offset = VLAN_HLEN;
    *vlan_tag = vlan_tags[vdev->vid];

    RTE_LOG(DEBUG, VHOST_DATA,
        "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
        vdev->vid, dst_vdev->vid, *vlan_tag);

    return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
    if (ol_flags & PKT_TX_IPV4)
        return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
    else /* assume ethertype == ETHER_TYPE_IPv6 */
        return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
    void *l3_hdr;
    struct ipv4_hdr *ipv4_hdr = NULL;
    struct tcp_hdr *tcp_hdr = NULL;
    struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    l3_hdr = (char *)eth_hdr + m->l2_len;

    if (m->ol_flags & PKT_TX_IPV4) {
        ipv4_hdr = l3_hdr;
        ipv4_hdr->hdr_checksum = 0;
        m->ol_flags |= PKT_TX_IP_CKSUM;
    }

    tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
    tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
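/*
 * Note (added for clarity): rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum()
 * take ol_flags so that, when PKT_TX_TCP_SEG is set, the pseudo-header
 * checksum excludes the TCP payload length, which is what NICs expect in
 * tcp_hdr->cksum when performing TSO. The l2_len/l3_len offsets used above
 * are expected to have been filled in from the virtio-net header by the
 * vhost dequeue path.
 */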

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
    while (n--)
        rte_pktmbuf_free(pkts[n]);
}

static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
    uint16_t count;

    count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
                             tx_q->m_table, tx_q->len);
    if (unlikely(count < tx_q->len))
        free_pkts(&tx_q->m_table[count], tx_q->len - count);

    tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
    struct mbuf_table *tx_q;
    unsigned offset = 0;
    const uint16_t lcore_id = rte_lcore_id();
    struct ether_hdr *nh;

    nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
    if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
        struct vhost_dev *vdev2;

        TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
            virtio_xmit(vdev2, vdev, m);
        }
        goto queue2nic;
    }

    /* check if destination is a local VM */
    if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
        rte_pktmbuf_free(m);
        return;
    }

    if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
        if (unlikely(find_local_dest(vdev, m, &offset,
                                     &vlan_tag) != 0)) {
            rte_pktmbuf_free(m);
            return;
        }
    }

    RTE_LOG(DEBUG, VHOST_DATA,
        "(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

    /* Add packet to the port tx queue */
    tx_q = &lcore_tx_queue[lcore_id];

    nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
    if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
        /* Guest has inserted the vlan tag. */
        struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
        uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
        if ((vm2vm_mode == VM2VM_HARDWARE) &&
            (vh->vlan_tci != vlan_tag_be))
            vh->vlan_tci = vlan_tag_be;
    } else {
        m->ol_flags |= PKT_TX_VLAN_PKT;

        /*
         * Find the right seg to adjust the data len when offset is
         * bigger than tail room size.
         */
        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
            if (likely(offset <= rte_pktmbuf_tailroom(m)))
                m->data_len += offset;
            else {
                struct rte_mbuf *seg = m;

                while ((seg->next != NULL) &&
                       (offset > rte_pktmbuf_tailroom(seg)))
                    seg = seg->next;

                seg->data_len += offset;
            }
            m->pkt_len += offset;
        }

        m->vlan_tci = vlan_tag;
    }

    if (m->ol_flags & PKT_TX_TCP_SEG)
        virtio_tx_offload(m);

    tx_q->m_table[tx_q->len++] = m;
    if (enable_stats) {
        vdev->stats.tx_total++;
        vdev->stats.tx++;
    }

    if (unlikely(tx_q->len == MAX_PKT_BURST))
        do_drain_mbuf_table(tx_q);
}

static inline void __attribute__((always_inline))
drain_mbuf_table(struct mbuf_table *tx_q)
{
    static uint64_t prev_tsc;
    uint64_t cur_tsc;

    if (tx_q->len == 0)
        return;

    cur_tsc = rte_rdtsc();
    if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
        prev_tsc = cur_tsc;

        RTE_LOG(DEBUG, VHOST_DATA,
            "TX queue drained after timeout with burst size %u\n",
            tx_q->len);
        do_drain_mbuf_table(tx_q);
    }
}

static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
    uint16_t rx_count, enqueue_count;
    struct rte_mbuf *pkts[MAX_PKT_BURST];

    rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
                                pkts, MAX_PKT_BURST);
    if (!rx_count)
        return;

    /*
     * When "enable_retry" is set, here we wait and retry when there
     * are not enough free slots in the queue to hold @rx_count packets,
     * to diminish packet loss.
     */
    if (enable_retry &&
        unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
                VIRTIO_RXQ))) {
        uint32_t retry;

        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            if (rx_count <= rte_vhost_avail_entries(vdev->vid,
                    VIRTIO_RXQ))
                break;
        }
    }

    enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
                                            pkts, rx_count);
    if (enable_stats) {
        rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
        rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
    }

    free_pkts(pkts, rx_count);
}

static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
    struct rte_mbuf *pkts[MAX_PKT_BURST];
    uint16_t count;
    uint16_t i;

    count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
                                    pkts, MAX_PKT_BURST);

    /* setup VMDq for the first packet */
    if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
        if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
            free_pkts(pkts, count);
    }

    for (i = 0; i < count; ++i)
        virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in the function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
    unsigned i;
    unsigned lcore_id = rte_lcore_id();
    struct vhost_dev *vdev;
    struct mbuf_table *tx_q;

    RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < rte_lcore_count(); i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while (1) {
        drain_mbuf_table(tx_q);

        /*
         * Inform the configuration core that we have exited the
         * linked list and that no devices are in use if requested.
         */
        if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

        /*
         * Process vhost devices
         */
        TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
                      lcore_vdev_entry) {
            if (unlikely(vdev->remove)) {
                unlink_vmdq(vdev);
                vdev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }

            if (likely(vdev->ready == DEVICE_RX))
                drain_eth_rx(vdev);

            if (likely(!vdev->remove))
                drain_virtio_tx(vdev);
        }
    }

    return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
    struct vhost_dev *vdev = NULL;
    int lcore;

    TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
        if (vdev->vid == vid)
            break;
    }
    if (!vdev)
        return;
    /* set the remove flag. */
    vdev->remove = 1;
    while (vdev->ready != DEVICE_SAFE_REMOVE) {
        rte_pause();
    }

    TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
                 lcore_vdev_entry);
    TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

    /* Set the dev_removal_flag on each lcore. */
    RTE_LCORE_FOREACH_SLAVE(lcore)
        lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

    /*
     * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
     * we can be sure that they can no longer access the device removed
     * from the linked lists and that the devices are no longer in use.
     */
    RTE_LCORE_FOREACH_SLAVE(lcore) {
        while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
            rte_pause();
    }

    lcore_info[vdev->coreid].device_num--;

    RTE_LOG(INFO, VHOST_DATA,
        "(%d) device has been removed from data core\n",
        vdev->vid);

    rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
    int lcore, core_add = 0;
    uint32_t device_num_min = num_devices;
    struct vhost_dev *vdev;

    vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
    if (vdev == NULL) {
        RTE_LOG(INFO, VHOST_DATA,
            "(%d) couldn't allocate memory for vhost dev\n",
            vid);
        return -1;
    }
    vdev->vid = vid;

    TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
    vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

    /* reset ready flag */
    vdev->ready = DEVICE_MAC_LEARNING;
    vdev->remove = 0;

    /* Find a suitable lcore to add the device. */
    RTE_LCORE_FOREACH_SLAVE(lcore) {
        if (lcore_info[lcore].device_num < device_num_min) {
            device_num_min = lcore_info[lcore].device_num;
            core_add = lcore;
        }
    }
    vdev->coreid = core_add;

    TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
                      lcore_vdev_entry);
    lcore_info[vdev->coreid].device_num++;

    /* Disable notifications. */
    rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
    rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

    RTE_LOG(INFO, VHOST_DATA,
        "(%d) device has been added to data core %d\n",
        vid, vdev->coreid);

    return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
    .new_device = new_device,
    .destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print stats if the user
 * has enabled them.
 */
static void
print_stats(void)
{
    struct vhost_dev *vdev;
    uint64_t tx_dropped, rx_dropped;
    uint64_t tx, tx_total, rx, rx_total;
    const char clr[] = { 27, '[', '2', 'J', '\0' };
    const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

    while (1) {
        sleep(enable_stats);

        /* Clear screen and move to top left */
        printf("%s%s\n", clr, top_left);
        printf("Device statistics =================================\n");

        TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
            tx_total   = vdev->stats.tx_total;
            tx         = vdev->stats.tx;
            tx_dropped = tx_total - tx;

            rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
            rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
            rx_dropped = rx_total - rx;

            printf("Statistics for device %d\n"
                "-----------------------\n"
                "TX total:      %" PRIu64 "\n"
                "TX dropped:    %" PRIu64 "\n"
                "TX successful: %" PRIu64 "\n"
                "RX total:      %" PRIu64 "\n"
                "RX dropped:    %" PRIu64 "\n"
                "RX successful: %" PRIu64 "\n",
                vdev->vid,
                tx_total, tx_dropped, tx,
                rx_total, rx_dropped, rx);
        }

        printf("===================================================\n");
    }
}

static void
unregister_drivers(int socket_num)
{
    int i, ret;

    for (i = 0; i < socket_num; i++) {
        ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
        if (ret != 0)
            RTE_LOG(ERR, VHOST_CONFIG,
                "Failed to unregister vhost driver for %s.\n",
                socket_files + i * PATH_MAX);
    }
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
    /* Unregister vhost driver. */
    unregister_drivers(nb_sockets);

    exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to reserve some mbufs for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
    uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
    uint32_t nr_mbufs;
    uint32_t nr_mbufs_per_core;
    uint32_t mtu = 1500;

    if (mergeable)
        mtu = 9000;
    if (enable_tso)
        mtu = 64 * 1024;

    nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
            (mbuf_size - RTE_PKTMBUF_HEADROOM);
    nr_mbufs_per_core += nr_rx_desc;
    nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

    nr_mbufs  = nr_queues * nr_rx_desc;
    nr_mbufs += nr_mbufs_per_core * nr_switch_core;
    nr_mbufs *= nr_port;

    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
                                        nr_mbuf_cache, 0, mbuf_size,
                                        rte_socket_id());
    if (mbuf_pool == NULL)
        rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}
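/*
 * Worked example (illustrative; assumes the default mbuf_size of
 * RTE_MBUF_DEFAULT_BUF_SIZE = 2048 + RTE_PKTMBUF_HEADROOM = 2176 bytes):
 * with mergeable and TSO both off, mtu = 1500, so
 *
 *     nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128) = 57
 *     nr_mbufs_per_core += 1024 (nr_rx_desc)  ->  1081
 *
 * and one port with MAX_QUEUES = 128 queues needs about
 * 128 * 1024 + 1081 * nr_switch_core mbufs. With TSO on (mtu = 64K) the
 * per-core term grows to roughly (65536 + 2176) * 32 / 2048, about 1058.
 */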

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
    unsigned lcore_id, core_id = 0;
    unsigned nb_ports, valid_num_ports;
    int ret, i;
    uint8_t portid;
    static pthread_t tid;
    char thread_name[RTE_MAX_THREAD_NAME_LEN];
    uint64_t flags = 0;

    signal(SIGINT, sigint_handler);

    /* init EAL */
    ret = rte_eal_init(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    argc -= ret;
    argv += ret;

    /* parse app arguments */
    ret = us_vhost_parse_args(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid argument\n");

    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
        TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

        if (rte_lcore_is_enabled(lcore_id))
            lcore_ids[core_id++] = lcore_id;
    }

    if (rte_lcore_count() > RTE_MAX_LCORE)
        rte_exit(EXIT_FAILURE, "Not enough cores\n");

    /* Get the number of physical ports. */
    nb_ports = rte_eth_dev_count();

    /*
     * Update the global var NUM_PORTS and global array PORTS
     * and get the value of var VALID_NUM_PORTS according to the number
     * of system ports.
     */
    valid_num_ports = check_ports_num(nb_ports);

    if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
            "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
        return -1;
    }

    /*
     * FIXME: here we are trying to allocate mbufs big enough for
     * @MAX_QUEUES, but the truth is we're never going to use that
     * many queues here. We probably should only do allocation for
     * those queues we are going to use.
     */
    create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
                     MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

    if (vm2vm_mode == VM2VM_HARDWARE) {
        /* Enable VT loopback so the NIC's L2 switch forwards VM2VM traffic. */
        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
        RTE_LOG(DEBUG, VHOST_CONFIG,
            "Enable loop back for L2 switch in vmdq.\n");
    }

    /* initialize all ports */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((enabled_port_mask & (1 << portid)) == 0) {
            RTE_LOG(INFO, VHOST_PORT,
                "Skipping disabled port %d\n", portid);
            continue;
        }
        if (port_init(portid) != 0)
            rte_exit(EXIT_FAILURE,
                "Cannot initialize network ports\n");
    }

    /* Enable stats if the user option is set. */
    if (enable_stats) {
        ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
        if (ret != 0)
            rte_exit(EXIT_FAILURE,
                "Cannot create print-stats thread\n");

        /* Set thread_name as an aid in debugging. */
        snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
        ret = rte_thread_setname(tid, thread_name);
        if (ret != 0)
            RTE_LOG(DEBUG, VHOST_CONFIG,
                "Cannot set print-stats name\n");
    }

    /* Launch all data cores. */
    RTE_LCORE_FOREACH_SLAVE(lcore_id)
        rte_eal_remote_launch(switch_worker, NULL, lcore_id);

    if (mergeable == 0)
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

    if (client_mode)
        flags |= RTE_VHOST_USER_CLIENT;

    if (dequeue_zero_copy)
        flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

    /* Register vhost user driver to handle vhost messages. */
    for (i = 0; i < nb_sockets; i++) {
        ret = rte_vhost_driver_register(socket_files + i * PATH_MAX, flags);
        if (ret != 0) {
            unregister_drivers(i);
            rte_exit(EXIT_FAILURE,
                "vhost driver register failure.\n");
        }
    }

    rte_vhost_driver_callback_register(&virtio_net_device_ops);

    rte_vhost_driver_session_start();
    return 0;
}