/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>

#include "main.h"
#include "virtio-net.h"
#include "xen_vhost.h"

#define MAX_QUEUES 128

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores*MAX_PKT_BURST) + \
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
				(num_switching_cores*MBUF_CACHE_SIZE))
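/*
 * i.e. per port we provision RX descriptors for every possible queue, plus,
 * for each switching core, one in-flight burst, one TX ring's worth of
 * descriptors and one mempool cache's worth of mbufs.
 */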

#define MBUF_CACHE_SIZE 64

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16	/* Max burst for merge buffers. Set to 1 in the past due to a performance issue. */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

/* State of virtio device. */
#define DEVICE_NOT_READY	0
#define DEVICE_READY		1
#define DEVICE_SAFE_REMOVE	2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors. */
#define RTE_TEST_RX_DESC_DEFAULT 128
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFF

/* Mask of enabled ports. */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support. */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
static uint32_t enable_vm2vm = 1;
/* Enable stats. */
static uint32_t enable_stats = 0;

/* Empty VMDQ configuration structure. Filled in programmatically. */
static const struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * Overridden separately in code with appropriate values.
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
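/*
 * Note: 64 tags, matching MAX_DEVICES; link_vmdq() assigns
 * vlan_tags[dev->device_fh] to each device, so every device/pool gets a
 * unique VLAN tag.
 */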

/* Ethernet addresses of ports. */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char   h_dest[ETH_ALEN];
	unsigned char   h_source[ETH_ALEN];
	__be16          h_vlan_proto;
	__be16          h_vlan_TCI;
	__be16          h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18
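/* i.e. a tagged Ethernet header is the 14-byte header (2 x 6-byte MACs
 * + 2-byte EtherType) plus the 4-byte 802.1Q tag. */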

/* Per-device statistics struct. */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx;
} __rte_cache_aligned;
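/*
 * The TX counters are plain uint64_t because only the device's own data core
 * increments them, while the RX counters are atomics because any core may
 * deliver to a device's RX queue (see virtio_tx_local()).
 */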
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number obtained
 * from dev_info. If the device number is invalid, print an error message
 * and return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port, struct rte_mempool *mbuf_pool)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
	const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT, tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified on the command line. */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	rte_eth_dev_info_get(port, &dev_info);
	rxconf = &dev_info.default_rxconf;
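	/*
	 * Drop incoming packets when a queue runs out of RX descriptors
	 * rather than stalling; with many VMDQ queues this keeps one slow
	 * guest from backing up the whole port.
	 */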
	rxconf->rx_drop_en = 1;
	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port), rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						NULL);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0)
		return retval;

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* Parse hexadecimal string. */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* Parse unsigned int string. */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage.
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK --vm2vm [0|1] --stats [0-N]\n"
		"	-p PORTMASK: Set mask for ports to be used by application\n"
		"	--vm2vm [0|1]: disable/enable(default) vm2vm comms\n"
		"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n",
		prgname);
}

/*
 * Parse the arguments given on the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{NULL, 0, 0, 0}
	};

	/* Parse command line. */
	while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for vm2vm [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_vm2vm = ret;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Function to convert guest physical addresses to vhost virtual addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
{
	struct virtio_memory_regions *region;
	uint32_t regionidx;
	uint64_t vhost_va = 0;

	/* Find the memory region containing the GPA; the VVA is the GPA
	 * plus that region's precomputed offset. */
	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
		region = &dev->mem->regions[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_va = region->address_offset + guest_pa;
			break;
		}
	}
	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") GPA %p| VVA %p\n",
		dev->device_fh, (void *)(uintptr_t)guest_pa, (void *)(uintptr_t)vhost_va);

	return vhost_va;
}

/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_addr = 0;
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t avail_idx, res_cur_idx;
	uint16_t res_base_idx, res_end_idx;
	uint16_t free_entries;
	uint8_t success = 0;
	void *userdata;

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_rx()\n", dev->device_fh);
	vq = dev->virtqueue_rx;
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
	/* As many data cores may want access to available buffers, they need to be reserved. */
	do {
		res_base_idx = vq->last_used_idx_res;

		avail_idx = *((volatile uint16_t *)&vq->avail->idx);

		free_entries = (avail_idx - res_base_idx);

		/* Check that we have enough buffers. */
		if (unlikely(count > free_entries))
			count = free_entries;

		if (count == 0)
			return 0;

		res_end_idx = res_base_idx + count;
		/* vq->last_used_idx_res is atomically updated. */
		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
									res_end_idx);
	} while (unlikely(success == 0));
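	/*
	 * From here on this core exclusively owns ring slots
	 * [res_base_idx, res_end_idx): the compare-and-set above either
	 * claimed the whole range or retried. Cores later publish their
	 * ranges to used->idx strictly in reservation order (see the
	 * last_used_idx spin below), so the used ring is filled without locks.
	 */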
	res_cur_idx = res_base_idx;
	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_end_idx);

	/* Prefetch available ring to retrieve indexes. */
	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (res_cur_idx != res_end_idx) {
		/* Get descriptor from available ring. */
		desc = &vq->desc[head[packet_success]];
		/* Prefetch descriptor address. */
		rte_prefetch0(desc);

		buff = pkts[packet_success];

		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr). */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		{
			/* Copy virtio_hdr to packet and increment buffer address. */
			buff_hdr_addr = buff_addr;
			packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

			/*
			 * If the descriptors are chained the header and data are
			 * placed in separate buffers.
			 */
			if (desc->flags & VRING_DESC_F_NEXT) {
				desc->len = vq->vhost_hlen;
				desc = &vq->desc[desc->next];
				/* Buffer address translation. */
				buff_addr = gpa_to_vva(dev, desc->addr);
				desc->len = rte_pktmbuf_data_len(buff);
			} else {
				buff_addr += vq->vhost_hlen;
				desc->len = packet_len;
			}
		}

		/* Update used ring with desc information. */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;

		/* Copy mbuf data to buffer. */
		userdata = rte_pktmbuf_mtod(buff, void *);
		rte_memcpy((void *)(uintptr_t)buff_addr, userdata, rte_pktmbuf_data_len(buff));

		res_cur_idx++;
		packet_success++;

		/* If mergeable buffers are disabled then a header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
		if (res_cur_idx < res_end_idx) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	/* Wait until it's our turn to add our buffer to the used ring. */
	while (unlikely(vq->last_used_idx != res_base_idx))
		rte_pause();

	*(volatile uint16_t *)&vq->used->idx += count;

	vq->last_used_idx = res_end_idx;

	return count;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
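/*
 * Note: this reads 8 bytes from each 6-byte address and relies on
 * MAC_ADDR_CMP (0xFFFFFFFFFFFF) to mask off the two bytes of over-read
 * before the comparison.
 */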
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}

/*
 * This function registers a MAC address along with a
 * VLAN tag to a VMDQ pool.
 */
static int
link_vmdq(struct virtio_net *dev)
{
	int ret;
	struct virtio_net_data_ll *dev_ll;

	dev_ll = ll_root_used;

	/* Refuse to register a MAC address that another device already uses. */
	while (dev_ll != NULL) {
		if ((dev != dev_ll->dev) && ether_addr_cmp(&dev->mac_address, &dev_ll->dev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	/* The vlan_tag is currently indexed by device_fh. */
	dev->vlan_tag = vlan_tags[dev->device_fh];
	dev->vmdq_rx_q = dev->device_fh * (num_queues/num_devices);
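	/*
	 * Each VMDQ pool owns num_queues/num_devices RX queues, so the
	 * expression above selects the first queue of this device's pool
	 * (assuming the NIC lays queues out contiguously per pool).
	 */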

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
		dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
		dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
		dev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
	if (ret) {
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);
		return -1;
	}

	/* Enable stripping of the VLAN tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], dev->vmdq_rx_q, 1);

	rte_compiler_barrier();
	/* Set device as ready for RX. */
	dev->ready = DEVICE_READY;

	return 0;
}

/*
 * Removes MAC address and VLAN tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct virtio_net *dev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (dev->ready == DEVICE_READY) {
		/* Clear MAC and VLAN settings. */
		rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
		for (i = 0; i < 6; i++)
			dev->mac_address.addr_bytes[i] = 0;

		dev->vlan_tag = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		dev->ready = DEVICE_NOT_READY;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* Get the used devices list. */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (likely(dev_ll->dev->ready == DEVICE_READY) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->dev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->dev->device_fh == dev->device_fh) {
				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
					"Source and destination MAC addresses are the same. "
					"Dropping packet.\n",
					dev_ll->dev->device_fh);
				return 0;
			}

			RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
				"MAC address is local\n", dev_ll->dev->device_fh);

			if (dev_ll->dev->remove) {
				/* Drop the packet if the device is marked for removal. */
				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
					"Device is marked for removal\n",
					dev_ll->dev->device_fh);
			} else {
				/* Send the packet to the local virtio device. */
				ret = virtio_dev_rx(dev_ll->dev, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx_total, 1);
					rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx, ret);
					dev_statistics[dev->device_fh].tx_total++;
					dev_statistics[dev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * This function routes the TX packet to the correct interface. This may be
 * a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct virtio_net *dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct vlan_ethhdr *vlan_hdr;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf;
	unsigned len, ret;
	const uint16_t lcore_id = rte_lcore_id();

	/* Check if destination is a local VM. */
	if (enable_vm2vm && (virtio_tx_local(dev, m) == 0)) {
		return;
	}

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
		"MAC address is external\n", dev->device_fh);

	/* Add packet to the port TX queue. */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	mbuf = rte_pktmbuf_alloc(mbuf_pool);
	if (!mbuf)
		return;

	mbuf->data_len = m->data_len + VLAN_HLEN;
	mbuf->pkt_len = mbuf->data_len;

	/* Copy Ethernet header to mbuf. */
	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
		rte_pktmbuf_mtod(m, const void *), ETH_HLEN);

	/* Setup VLAN header. Bytes need to be re-ordered for network with htons(). */
	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
	vlan_hdr->h_vlan_TCI = htons(vlan_tag);

	/* Copy the remaining packet contents to the mbuf. */
	rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, VLAN_ETH_HLEN),
		rte_pktmbuf_mtod_offset(m, const void *, ETH_HLEN),
		(m->data_len - ETH_HLEN));
	tx_q->m_table[len] = mbuf;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

static inline void __attribute__((always_inline))
virtio_dev_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf m;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t buff_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i;
	uint16_t free_entries, packet_success = 0;
	uint16_t avail_idx;

	vq = dev->virtqueue_tx;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx == avail_idx)
		return;

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_tx()\n",
		dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

	/* Get the number of free entries in the ring. */
	free_entries = avail_idx - vq->last_used_idx;
	free_entries = unlikely(free_entries < MAX_PKT_BURST) ? free_entries : MAX_PKT_BURST;

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Buffers available %d\n",
		dev->device_fh, free_entries);
	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];
		/* Prefetch descriptor address. */
		rte_prefetch0(desc);

		if (packet_success < (free_entries - 1)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success+1]]);
		}

		/* Update used index buffer information. */
		used_idx = vq->last_used_idx & (vq->size - 1);
		vq->used->ring[used_idx].id = head[packet_success];
		vq->used->ring[used_idx].len = 0;

		/* Discard first buffer as it is the virtio header. */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
		m.data_len = desc->len;
		m.data_off = 0;
		m.nb_segs = 1;
		/* Attach the translated guest buffer so rte_pktmbuf_mtod() finds the packet data. */
		m.buf_addr = (void *)(uintptr_t)buff_addr;

		virtio_tx_route(dev, &m, mbuf_pool, 0);

		vq->last_used_idx++;
		packet_success++;
	}

	rte_compiler_barrier();
	vq->used->idx += packet_success;
	/* Kick guest if required. */
}

/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker(void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
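	/* i.e. ceil(tsc_hz / 1e6) TSC cycles per microsecond, times the
	 * 100 us drain interval. */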
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				RTE_LOG(DEBUG, VHOST_DATA,
					"TX queue drained after timeout with burst size %u\n",
					tx_q->len);

				/* TX any packets in the queue. */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						       (struct rte_mbuf **)tx_q->m_table,
						       (uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		/*
		 * Inform the configuration core that we have exited the linked list
		 * and that no devices are in use, if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* Get the virtio device. */
			dev = dev_ll->dev;

			if (unlikely(dev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(dev);
				dev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(dev->ready == DEVICE_READY)) {
				/* Handle guest RX. */
				rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx_total, rx_count);
						rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx, ret_count);
					}
					/* The data has been copied into the guest ring, so the mbufs can be freed. */
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free_seg(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!dev->remove))
				/* Handle guest TX. */
				virtio_dev_tx(dev, mbuf_pool);

			/* Move to the next device in the list. */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * Add an entry to a used linked list. A free entry must first be found
 * in the free linked list using get_data_ll_free_entry().
 */
static void
add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	/* Set next as NULL and use a compiler barrier to avoid reordering. */
	ll_dev->next = NULL;
	rte_compiler_barrier();

	/* If ll == NULL then this is the first device. */
	if (ll) {
		/* Increment to the tail of the linked list. */
		while (ll->next != NULL)
			ll = ll->next;

		ll->next = ll_dev;
	} else {
		*ll_root_addr = ll_dev;
	}
}

/*
 * Remove an entry from a used linked list. The entry must then be added to
 * the free linked list using put_data_ll_free_entry().
 */
static void
rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev, struct virtio_net_data_ll *ll_dev_last)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	if (ll_dev == ll)
		*ll_root_addr = ll_dev->next;
	else
		ll_dev_last->next = ll_dev->next;
}

/*
 * Find and return an entry from the free linked list.
 */
static struct virtio_net_data_ll *
get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;
	struct virtio_net_data_ll *ll_dev;

	if (ll_free == NULL)
		return NULL;

	ll_dev = ll_free;
	*ll_root_addr = ll_free->next;

	return ll_dev;
}

/*
 * Place an entry back on to the free linked list.
 */
static void
put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;

	ll_dev->next = ll_free;
	*ll_root_addr = ll_dev;
}

/*
 * Creates a linked list of a given size.
 */
static struct virtio_net_data_ll *
alloc_data_ll(uint32_t size)
{
	struct virtio_net_data_ll *ll_new;
	uint32_t i;

	/* Malloc and then chain the linked list. */
	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
	if (ll_new == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
		return NULL;
	}

	for (i = 0; i < size - 1; i++) {
		ll_new[i].dev = NULL;
		ll_new[i].next = &ll_new[i+1];
	}
	ll_new[i].next = NULL;

	return ll_new;
}

/*
 * Create the main linked list along with each individual core's linked list.
 * A used and a free list are created to manage entries.
 */
static int
init_data_ll(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
		if (lcore_info[lcore].lcore_ll == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
			return -1;
		}

		lcore_info[lcore].lcore_ll->device_num = 0;
		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
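		/* Give each core ceil(num_devices / num_switching_cores) free-list entries. */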
		if (num_devices % num_switching_cores)
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
		else
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
	}

	/* Allocate devices up to a maximum of MAX_DEVICES. */
	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. The RX/TX thread must set dev->ready to DEVICE_SAFE_REMOVE to
 * indicate that it is safe to remove the device.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_lcore_dev_cur;
	struct virtio_net_data_ll *ll_main_dev_cur;
	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
	struct virtio_net_data_ll *ll_main_dev_last = NULL;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	/* Set the remove flag. */
	dev->remove = 1;

	while (dev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	/* Search for the entry to be removed from the lcore ll. */
	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
	while (ll_lcore_dev_cur != NULL) {
		if (ll_lcore_dev_cur->dev == dev) {
			break;
		} else {
			ll_lcore_dev_last = ll_lcore_dev_cur;
			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
		}
	}

	/* Search for the entry to be removed from the main ll. */
	ll_main_dev_cur = ll_root_used;
	ll_main_dev_last = NULL;
	while (ll_main_dev_cur != NULL) {
		if (ll_main_dev_cur->dev == dev) {
			break;
		} else {
			ll_main_dev_last = ll_main_dev_cur;
			ll_main_dev_cur = ll_main_dev_cur->next;
		}
	}

	if (ll_lcore_dev_cur == NULL || ll_main_dev_cur == NULL) {
		RTE_LOG(ERR, XENHOST, "%s: could not find device in per_cpu list or main_list\n", __func__);
		return;
	}

	/* Remove entries from the lcore and main ll. */
	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can
	 * be sure that they can no longer access the device removed from the
	 * linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
			rte_pause();
		}
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, " #####(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_dev;
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;

	/* Add device to main ll. */
	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
			"of %d devices per core has been reached\n",
			dev->device_fh, num_devices);
		return -1;
	}
	ll_dev->dev = dev;
	add_data_ll_entry(&ll_root_used, ll_dev);

	/* Reset ready flag. */
	dev->ready = DEVICE_NOT_READY;
	dev->remove = 0;

	/* Find the least-loaded lcore to add the device to. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}
	/* Add device to lcore ll. */
	ll_dev->dev->coreid = core_add;
	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
		destroy_device(dev);
		return -1;
	}
	ll_dev->dev = dev;
	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);

	/* Initialize device stats. */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);

	link_vmdq(dev);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void
print_stats(void)
{
	struct virtio_net_data_ll *dev_ll;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };
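	/* clr is the ANSI escape ESC[2J (clear screen); top_left is
	 * ESC[1;1H (move cursor to row 1, column 1). */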

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left. */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		dev_ll = ll_root_used;
		while (dev_ll != NULL) {
			device_fh = (uint32_t)dev_ll->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			rx_total = rte_atomic64_read(&dev_statistics[device_fh].rx_total);
			rx = rte_atomic64_read(&dev_statistics[device_fh].rx);
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
					"\nTX total: %"PRIu64""
					"\nTX dropped: %"PRIu64""
					"\nTX successful: %"PRIu64""
					"\nRX total: %"PRIu64""
					"\nRX dropped: %"PRIu64""
					"\nRX successful: %"PRIu64"",
					device_fh,
					tx_total,
					tx_dropped,
					tx,
					rx_total,
					rx_dropped,
					rx);

			dev_ll = dev_ll->next;
		}
		printf("\n======================================================\n");
	}
}

int init_virtio_net(struct virtio_net_device_ops const * const ops);

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	struct rte_mempool *mbuf_pool;
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];

	/* Init EAL. */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* Parse app arguments. */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Set the number of switching cores available. */
	num_switching_cores = rte_lcore_count() - 1;
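	/* One lcore (the master) runs the Xen monitor loop below; the rest
	 * are data cores running switch_worker(). */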

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();

	/*
	 * Update the global var NUM_PORTS and global array PORTS,
	 * and get the value of VALID_NUM_PORTS according to the number
	 * of system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/* Create the mbuf pool. */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 0,
		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	/* Initialize all ports. */
	for (portid = 0; portid < nb_ports; portid++) {
		/* Skip ports that are not enabled. */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT, "Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid, mbuf_pool) != 0)
			rte_exit(EXIT_FAILURE, "Cannot initialize network ports\n");
	}

	/* Initialise all linked lists. */
	if (init_data_ll() == -1)
		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");

	/* Initialize device stats. */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name to aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-xen-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
		rte_eal_remote_launch(switch_worker, mbuf_pool, lcore_id);
	}

	init_virtio_xen(&virtio_net_device_ops);

	virtio_monitor_loop();
	return 0;
}