lib/dpif-netdev.c (Open vSwitch mirror, blob at commit "dpif-netdev: use the open_type when creating the local port")
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <netinet/in.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/socket.h>
31 #include <sys/stat.h>
32 #include <unistd.h>
33
34 #include "bitmap.h"
35 #include "cmap.h"
36 #include "coverage.h"
37 #include "csum.h"
38 #include "dp-packet.h"
39 #include "dpif.h"
40 #include "dpif-provider.h"
41 #include "dummy.h"
42 #include "fat-rwlock.h"
43 #include "flow.h"
44 #include "hmapx.h"
45 #include "latch.h"
46 #include "netdev.h"
47 #include "netdev-dpdk.h"
48 #include "netdev-vport.h"
49 #include "netlink.h"
50 #include "odp-execute.h"
51 #include "odp-util.h"
52 #include "openvswitch/dynamic-string.h"
53 #include "openvswitch/list.h"
54 #include "openvswitch/match.h"
55 #include "openvswitch/ofp-print.h"
56 #include "openvswitch/ofp-util.h"
57 #include "openvswitch/ofpbuf.h"
58 #include "openvswitch/shash.h"
59 #include "openvswitch/vlog.h"
60 #include "ovs-numa.h"
61 #include "ovs-rcu.h"
62 #include "packets.h"
63 #include "poll-loop.h"
64 #include "pvector.h"
65 #include "random.h"
66 #include "seq.h"
67 #include "smap.h"
68 #include "sset.h"
69 #include "timeval.h"
70 #include "tnl-neigh-cache.h"
71 #include "tnl-ports.h"
72 #include "unixctl.h"
73 #include "util.h"
74
75 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
76
77 #define FLOW_DUMP_MAX_BATCH 50
78 /* Use per thread recirc_depth to prevent recirculation loop. */
79 #define MAX_RECIRC_DEPTH 5
80 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
81
82 /* Configuration parameters. */
83 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
84
85 /* Protects against changes to 'dp_netdevs'. */
86 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
87
88 /* Contains all 'struct dp_netdev's. */
89 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
90 = SHASH_INITIALIZER(&dp_netdevs);
91
92 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
93
94 static struct odp_support dp_netdev_support = {
95 .max_mpls_depth = SIZE_MAX,
96 .recirc = true,
97 };
98
99 /* Stores a miniflow with inline values */
100
101 struct netdev_flow_key {
102 uint32_t hash; /* Hash function differs for different users. */
103 uint32_t len; /* Length of the following miniflow (incl. map). */
104 struct miniflow mf;
105 uint64_t buf[FLOW_MAX_PACKET_U64S];
106 };
107
108 /* Exact match cache for frequently used flows
109 *
110 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
111 * search its entries for a miniflow that matches exactly the miniflow of the
112 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
113 *
114 * A cache entry holds a reference to its 'dp_netdev_flow'.
115 *
116 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
117 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
118 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
119 * value is the index of a cache entry where the miniflow could be.
120 *
121 *
122 * Thread-safety
123 * =============
124 *
125 * Each pmd_thread has its own private exact match cache.
126 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
127 */
128
129 #define EM_FLOW_HASH_SHIFT 13
130 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
131 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
132 #define EM_FLOW_HASH_SEGS 2
133
134 struct emc_entry {
135 struct dp_netdev_flow *flow;
136 struct netdev_flow_key key; /* key.hash used for emc hash value. */
137 };
138
139 struct emc_cache {
140 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
141 int sweep_idx; /* For emc_cache_slow_sweep(). */
142 };
143
144 /* Iterate in the exact match cache through every entry that might contain a
145 * miniflow with hash 'HASH'. */
146 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
147 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
148 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
149 i__ < EM_FLOW_HASH_SEGS; \
150 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
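
/* Illustrative sketch (hypothetical, not part of the upstream file): the
 * candidate cache indices that EMC_FOR_EACH_POS_WITH_HASH probes for a given
 * 32-bit hash.  With EM_FLOW_HASH_SHIFT == 13 and EM_FLOW_HASH_SEGS == 2,
 * these are bits 0..12 and bits 13..25 of the hash. */
static inline void
emc_candidate_indices_example(uint32_t hash,
                              uint32_t indices[EM_FLOW_HASH_SEGS])
{
    for (int seg = 0; seg < EM_FLOW_HASH_SEGS; seg++) {
        indices[seg] = (hash >> (seg * EM_FLOW_HASH_SHIFT))
                       & EM_FLOW_HASH_MASK;
    }
}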
151 \f
152 /* Simple non-wildcarding single-priority classifier. */
153
154 struct dpcls {
155 struct cmap subtables_map;
156 struct pvector subtables;
157 };
158
159 /* A rule to be inserted to the classifier. */
160 struct dpcls_rule {
161 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
162 struct netdev_flow_key *mask; /* Subtable's mask. */
163 struct netdev_flow_key flow; /* Matching key. */
164 /* 'flow' must be the last field, additional space is allocated here. */
165 };
166
167 static void dpcls_init(struct dpcls *);
168 static void dpcls_destroy(struct dpcls *);
169 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
170 const struct netdev_flow_key *mask);
171 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
172 static bool dpcls_lookup(const struct dpcls *cls,
173 const struct netdev_flow_key keys[],
174 struct dpcls_rule **rules, size_t cnt);
175 \f
176 /* Datapath based on the network device interface from netdev.h.
177 *
178 *
179 * Thread-safety
180 * =============
181 *
182 * Some members, marked 'const', are immutable. Accessing other members
183 * requires synchronization, as noted in more detail below.
184 *
185 * Acquisition order is, from outermost to innermost:
186 *
187 * dp_netdev_mutex (global)
188 * port_mutex
189 * non_pmd_mutex
190 */
191 struct dp_netdev {
192 const struct dpif_class *const class;
193 const char *const name;
194 struct dpif *dpif;
195 struct ovs_refcount ref_cnt;
196 atomic_flag destroyed;
197
198 /* Ports.
199 *
200 * Any lookup into 'ports' or any access to the dp_netdev_ports found
201 * through 'ports' requires taking 'port_mutex'. */
202 struct ovs_mutex port_mutex;
203 struct hmap ports;
204 struct seq *port_seq; /* Incremented whenever a port changes. */
205
206 /* Protects access to ofproto-dpif-upcall interface during revalidator
207 * thread synchronization. */
208 struct fat_rwlock upcall_rwlock;
209 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
210 void *upcall_aux;
211
212 /* Callback function for notifying the purging of dp flows (during
213  * pmd thread reset or deletion). */
214 dp_purge_callback *dp_purge_cb;
215 void *dp_purge_aux;
216
217 /* Stores all 'struct dp_netdev_pmd_thread's. */
218 struct cmap poll_threads;
219
220 /* Protects the access of the 'struct dp_netdev_pmd_thread'
221 * instance for non-pmd thread. */
222 struct ovs_mutex non_pmd_mutex;
223
224 /* Each pmd thread will store its pointer to
225 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
226 ovsthread_key_t per_pmd_key;
227
228 struct seq *reconfigure_seq;
229 uint64_t last_reconfigure_seq;
230
231 /* Cpu mask for pin of pmd threads. */
232 char *pmd_cmask;
233
234 uint64_t last_tnl_conf_seq;
235 };
236
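/* Illustrative sketch (hypothetical, not part of the upstream file): a caller
 * that needs both the global mutex and a datapath's 'port_mutex' must take
 * them in the acquisition order documented above, outermost first. */
static inline void
dp_netdev_lock_order_example(struct dp_netdev *dp)
{
    ovs_mutex_lock(&dp_netdev_mutex);   /* Global lock first. */
    ovs_mutex_lock(&dp->port_mutex);    /* Then the per-datapath port lock. */
    /* ... safe to use 'dp->ports' here ... */
    ovs_mutex_unlock(&dp->port_mutex);  /* Release in reverse order. */
    ovs_mutex_unlock(&dp_netdev_mutex);
}
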
237 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
238 odp_port_t)
239 OVS_REQUIRES(dp->port_mutex);
240
241 enum dp_stat_type {
242 DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
243 DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
244 DP_STAT_MISS, /* Packets that did not match. */
245 DP_STAT_LOST, /* Packets not passed up to the client. */
246 DP_N_STATS
247 };
248
249 enum pmd_cycles_counter_type {
250 PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */
251 PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */
252 PMD_N_CYCLES
253 };
254
255 #define XPS_TIMEOUT_MS 500LL
256
257 /* Contained by struct dp_netdev_port's 'rxqs' member. */
258 struct dp_netdev_rxq {
259 struct netdev_rxq *rxq;
260 unsigned core_id; /* Core to which this queue is pinned. */
261 };
262
263 /* A port in a netdev-based datapath. */
264 struct dp_netdev_port {
265 odp_port_t port_no;
266 struct netdev *netdev;
267 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
268 struct netdev_saved_flags *sf;
269 unsigned n_rxq; /* Number of elements in 'rxqs'. */
270 struct dp_netdev_rxq *rxqs;
271 bool dynamic_txqs; /* If true XPS will be used. */
272 unsigned *txq_used; /* Number of threads that use each tx queue. */
273 struct ovs_mutex txq_used_mutex;
274 char *type; /* Port type as requested by user. */
275 char *rxq_affinity_list; /* Requested affinity of rx queues. */
276 };
277
278 /* Contained by struct dp_netdev_flow's 'stats' member. */
279 struct dp_netdev_flow_stats {
280 atomic_llong used; /* Last used time, in monotonic msecs. */
281 atomic_ullong packet_count; /* Number of packets matched. */
282 atomic_ullong byte_count; /* Number of bytes matched. */
283 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
284 };
285
286 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
287 *
288 *
289 * Thread-safety
290 * =============
291 *
292 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
293 * its pmd thread's classifier. The text below calls this classifier 'cls'.
294 *
295 * Motivation
296 * ----------
297 *
298 * The thread safety rules described here for "struct dp_netdev_flow" are
299 * motivated by two goals:
300 *
301 * - Prevent threads that read members of "struct dp_netdev_flow" from
302 * reading bad data due to changes by some thread concurrently modifying
303 * those members.
304 *
305 * - Prevent two threads making changes to members of a given "struct
306 * dp_netdev_flow" from interfering with each other.
307 *
308 *
309 * Rules
310 * -----
311 *
312 * A flow 'flow' may be accessed without a risk of being freed during an RCU
313 * grace period. Code that needs to hold onto a flow for a while
314 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
315 *
316 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
317 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
318 * from modification.
319 *
320 * Some members, marked 'const', are immutable. Accessing other members
321 * requires synchronization, as noted in more detail below.
322 */
323 struct dp_netdev_flow {
324 const struct flow flow; /* Unmasked flow that created this entry. */
325 /* Hash table index by unmasked flow. */
326 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
327 /* 'flow_table'. */
328 const ovs_u128 ufid; /* Unique flow identifier. */
329 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
330 /* flow. */
331
332 /* Number of references.
333 * The classifier owns one reference.
334 * Any thread trying to keep a rule from being freed should hold its own
335 * reference. */
336 struct ovs_refcount ref_cnt;
337
338 bool dead;
339
340 /* Statistics. */
341 struct dp_netdev_flow_stats stats;
342
343 /* Actions. */
344 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
345
346 /* While processing a group of input packets, the datapath uses the next
347 * member to store a pointer to the output batch for the flow. It is
348 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
349 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
350 struct packet_batch_per_flow *batch;
351
352 /* Packet classification. */
353 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
354 /* 'cr' must be the last member. */
355 };
356
357 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
358 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
359 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
360 struct flow *);
361
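/* Illustrative sketch (hypothetical, not part of the upstream file): holding
 * a flow beyond the current RCU grace period, per the thread-safety rules
 * above.  dp_netdev_flow_ref() can fail if the flow is already being
 * destroyed, so its result must be checked. */
static inline bool
dp_netdev_flow_hold_example(struct dp_netdev_flow *flow)
{
    if (!dp_netdev_flow_ref(flow)) {
        return false;               /* Flow is going away; do not use it. */
    }
    /* ... 'flow' may now be used outside the RCU read section ... */
    dp_netdev_flow_unref(flow);     /* Drop the reference when done. */
    return true;
}
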
362 /* A set of datapath actions within a "struct dp_netdev_flow".
363 *
364 *
365 * Thread-safety
366 * =============
367 *
368 * A struct dp_netdev_actions 'actions' is protected with RCU. */
369 struct dp_netdev_actions {
370 /* These members are immutable: they do not change during the struct's
371 * lifetime. */
372 unsigned int size; /* Size of 'actions', in bytes. */
373 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
374 };
375
376 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
377 size_t);
378 struct dp_netdev_actions *dp_netdev_flow_get_actions(
379 const struct dp_netdev_flow *);
380 static void dp_netdev_actions_free(struct dp_netdev_actions *);
381
382 /* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
383 struct dp_netdev_pmd_stats {
384 /* Indexed by DP_STAT_*. */
385 atomic_ullong n[DP_N_STATS];
386 };
387
388 /* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
389 struct dp_netdev_pmd_cycles {
390 /* Indexed by PMD_CYCLES_*. */
391 atomic_ullong n[PMD_N_CYCLES];
392 };
393
394 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
395 struct rxq_poll {
396 struct dp_netdev_port *port;
397 struct netdev_rxq *rx;
398 struct ovs_list node;
399 };
400
401 /* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
402 struct tx_port {
403 struct dp_netdev_port *port;
404 int qid;
405 long long last_used;
406 struct hmap_node node;
407 };
408
409 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
410  * the performance overhead of interrupt processing.  Therefore netdev cannot
411  * implement rx-wait for these devices.  dpif-netdev needs to poll these
412  * devices to check their receive buffers.  A pmd thread does the polling for
413  * the devices assigned to it.
414  *
415  * DPDK uses a PMD for accessing the NIC.
416  *
417  * Note: the instance with cpu core id NON_PMD_CORE_ID is reserved for the
418  * I/O of all non-pmd threads.  No actual thread is created for that
419  * instance.
420 *
421 * Each struct has its own flow table and classifier. Packets received
422 * from managed ports are looked up in the corresponding pmd thread's
423 * flow table, and are executed with the found actions.
424 * */
425 struct dp_netdev_pmd_thread {
426 struct dp_netdev *dp;
427 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
428 struct cmap_node node; /* In 'dp->poll_threads'. */
429
430 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
431 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
432
433 /* Per-thread exact-match cache.  Note: the instance for cpu core
434  * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
435  * needs to be protected by 'non_pmd_mutex'.  Every other instance
436  * is only accessed by its own pmd thread. */
437 struct emc_cache flow_cache;
438
439 /* Classifier and Flow-Table.
440 *
441 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
442 * changes to 'cls' must be made while still holding the 'flow_mutex'.
443 */
444 struct ovs_mutex flow_mutex;
445 struct dpcls cls;
446 struct cmap flow_table OVS_GUARDED; /* Flow table. */
447
448 /* Statistics. */
449 struct dp_netdev_pmd_stats stats;
450
451 /* Cycles counters */
452 struct dp_netdev_pmd_cycles cycles;
453
454 /* Used to count cycles. See 'cycles_counter_end()'. */
455 unsigned long long last_cycles;
456
457 struct latch exit_latch; /* For terminating the pmd thread. */
458 atomic_uint change_seq; /* For reloading pmd ports. */
459 pthread_t thread;
460 unsigned core_id; /* CPU core id of this pmd thread. */
461 int numa_id; /* numa node id of this pmd thread. */
462 bool isolated;
463
464 /* Queue id used by this pmd thread to send packets on all netdevs for
465  * which XPS is disabled.  All static_tx_qid's are unique and less
466  * than 'ovs_numa_get_n_cores() + 1'. */
467 atomic_int static_tx_qid;
468
469 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
470 /* List of rx queues to poll. */
471 struct ovs_list poll_list OVS_GUARDED;
472 /* Number of elements in 'poll_list' */
473 int poll_cnt;
474 /* Map of 'tx_port's used for transmission. Written by the main thread,
475 * read by the pmd thread. */
476 struct hmap tx_ports OVS_GUARDED;
477
478 /* Map of 'tx_port' used in the fast path. This is a thread-local copy of
479 * 'tx_ports'. The instance for cpu core NON_PMD_CORE_ID can be accessed
480  * by multiple threads, and thus needs to be protected by 'non_pmd_mutex'.
481  * Every other instance is only accessed by its own pmd thread. */
482 struct hmap port_cache;
483
484 /* Only a pmd thread can write to its own 'cycles' and 'stats'.
485  * The main thread keeps 'stats_zero' and 'cycles_zero' as base
486  * values and subtracts them from 'stats' and 'cycles' before
487  * reporting to the user. */
488 unsigned long long stats_zero[DP_N_STATS];
489 uint64_t cycles_zero[PMD_N_CYCLES];
490 };
491
492 #define PMD_INITIAL_SEQ 1
493
494 /* Interface to netdev-based datapath. */
495 struct dpif_netdev {
496 struct dpif dpif;
497 struct dp_netdev *dp;
498 uint64_t last_port_seq;
499 };
500
501 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
502 struct dp_netdev_port **portp)
503 OVS_REQUIRES(dp->port_mutex);
504 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
505 struct dp_netdev_port **portp)
506 OVS_REQUIRES(dp->port_mutex);
507 static void dp_netdev_free(struct dp_netdev *)
508 OVS_REQUIRES(dp_netdev_mutex);
509 static int do_add_port(struct dp_netdev *dp, const char *devname,
510 const char *type, odp_port_t port_no)
511 OVS_REQUIRES(dp->port_mutex);
512 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
513 OVS_REQUIRES(dp->port_mutex);
514 static int dpif_netdev_open(const struct dpif_class *, const char *name,
515 bool create, struct dpif **);
516 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
517 struct dp_packet_batch *,
518 bool may_steal,
519 const struct nlattr *actions,
520 size_t actions_len,
521 long long now);
522 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
523 struct dp_packet_batch *, odp_port_t port_no);
524 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
525 struct dp_packet_batch *);
526
527 static void dp_netdev_disable_upcall(struct dp_netdev *);
528 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
529 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
530 struct dp_netdev *dp, unsigned core_id,
531 int numa_id);
532 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
533 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
534 OVS_REQUIRES(dp->port_mutex);
535
536 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
537 unsigned core_id);
538 static struct dp_netdev_pmd_thread *
539 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
540 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
541 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
542 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
543 OVS_REQUIRES(dp->port_mutex);
544 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
545 static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
546 struct dp_netdev_port *port);
547 static void dp_netdev_add_port_to_pmds(struct dp_netdev *dp,
548 struct dp_netdev_port *port);
549 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
550 struct dp_netdev_port *port);
551 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
552 struct dp_netdev_port *port,
553 struct netdev_rxq *rx);
554 static struct dp_netdev_pmd_thread *
555 dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
556 static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
557 OVS_REQUIRES(dp->port_mutex);
558 static void reconfigure_pmd_threads(struct dp_netdev *dp)
559 OVS_REQUIRES(dp->port_mutex);
560 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
561 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
562 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
563 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
564 OVS_REQUIRES(pmd->port_mutex);
565
566 static void
567 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
568 long long now, bool purge);
569 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
570 struct tx_port *tx, long long now);
571
572 static inline bool emc_entry_alive(struct emc_entry *ce);
573 static void emc_clear_entry(struct emc_entry *ce);
574
575 static void
576 emc_cache_init(struct emc_cache *flow_cache)
577 {
578 int i;
579
580 flow_cache->sweep_idx = 0;
581 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
582 flow_cache->entries[i].flow = NULL;
583 flow_cache->entries[i].key.hash = 0;
584 flow_cache->entries[i].key.len = sizeof(struct miniflow);
585 flowmap_init(&flow_cache->entries[i].key.mf.map);
586 }
587 }
588
589 static void
590 emc_cache_uninit(struct emc_cache *flow_cache)
591 {
592 int i;
593
594 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
595 emc_clear_entry(&flow_cache->entries[i]);
596 }
597 }
598
599 /* Check and clear dead flow references slowly (one entry at each
600 * invocation). */
601 static void
602 emc_cache_slow_sweep(struct emc_cache *flow_cache)
603 {
604 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
605
606 if (!emc_entry_alive(entry)) {
607 emc_clear_entry(entry);
608 }
609 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
610 }
611
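/* Illustrative usage sketch (hypothetical, not part of the upstream file): a
 * polling loop would typically amortize the sweep, clearing at most one entry
 * every so many iterations; the 1024 interval here is an arbitrary value
 * chosen for illustration. */
static inline void
emc_cache_maybe_sweep_example(struct emc_cache *flow_cache, unsigned int *lc)
{
    if (++*lc >= 1024) {
        *lc = 0;
        emc_cache_slow_sweep(flow_cache);
    }
}
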
612 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
613 bool
614 dpif_is_netdev(const struct dpif *dpif)
615 {
616 return dpif->dpif_class->open == dpif_netdev_open;
617 }
618
619 static struct dpif_netdev *
620 dpif_netdev_cast(const struct dpif *dpif)
621 {
622 ovs_assert(dpif_is_netdev(dpif));
623 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
624 }
625
626 static struct dp_netdev *
627 get_dp_netdev(const struct dpif *dpif)
628 {
629 return dpif_netdev_cast(dpif)->dp;
630 }
631 \f
632 enum pmd_info_type {
633 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
634 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
635 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
636 };
637
638 static void
639 pmd_info_show_stats(struct ds *reply,
640 struct dp_netdev_pmd_thread *pmd,
641 unsigned long long stats[DP_N_STATS],
642 uint64_t cycles[PMD_N_CYCLES])
643 {
644 unsigned long long total_packets = 0;
645 uint64_t total_cycles = 0;
646 int i;
647
648 /* These loops subtract the reference values ('*_zero') from the counters.
649 * Since loads and stores are relaxed, it might be possible for a '*_zero'
650 * value to be more recent than the current value we're reading from the
651 * counter. This is not a big problem, since these numbers are not
652 * supposed to be too accurate, but we should at least make sure that
653 * the result is not negative. */
654 for (i = 0; i < DP_N_STATS; i++) {
655 if (stats[i] > pmd->stats_zero[i]) {
656 stats[i] -= pmd->stats_zero[i];
657 } else {
658 stats[i] = 0;
659 }
660
661 if (i != DP_STAT_LOST) {
662 /* Lost packets are already included in DP_STAT_MISS */
663 total_packets += stats[i];
664 }
665 }
666
667 for (i = 0; i < PMD_N_CYCLES; i++) {
668 if (cycles[i] > pmd->cycles_zero[i]) {
669 cycles[i] -= pmd->cycles_zero[i];
670 } else {
671 cycles[i] = 0;
672 }
673
674 total_cycles += cycles[i];
675 }
676
677 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
678 ? "main thread" : "pmd thread");
679
680 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
681 ds_put_format(reply, " numa_id %d", pmd->numa_id);
682 }
683 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
684 ds_put_format(reply, " core_id %u", pmd->core_id);
685 }
686 ds_put_cstr(reply, ":\n");
687
688 ds_put_format(reply,
689 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
690 "\tmiss:%llu\n\tlost:%llu\n",
691 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
692 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
693
694 if (total_cycles == 0) {
695 return;
696 }
697
698 ds_put_format(reply,
699 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
700 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
701 cycles[PMD_CYCLES_POLLING],
702 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
703 cycles[PMD_CYCLES_PROCESSING],
704 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
705
706 if (total_packets == 0) {
707 return;
708 }
709
710 ds_put_format(reply,
711 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
712 total_cycles / (double)total_packets,
713 total_cycles, total_packets);
714
715 ds_put_format(reply,
716 "\tavg processing cycles per packet: "
717 "%.02f (%"PRIu64"/%llu)\n",
718 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
719 cycles[PMD_CYCLES_PROCESSING], total_packets);
720 }
721
722 static void
723 pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
724 struct dp_netdev_pmd_thread *pmd,
725 unsigned long long stats[DP_N_STATS],
726 uint64_t cycles[PMD_N_CYCLES])
727 {
728 int i;
729
730 /* We cannot write 'stats' and 'cycles' (because they're written by other
731 * threads) and we shouldn't change 'stats' (because they're used to count
732 * datapath stats, which must not be cleared here). Instead, we save the
733 * current values and subtract them from the values to be displayed in the
734 * future */
735 for (i = 0; i < DP_N_STATS; i++) {
736 pmd->stats_zero[i] = stats[i];
737 }
738 for (i = 0; i < PMD_N_CYCLES; i++) {
739 pmd->cycles_zero[i] = cycles[i];
740 }
741 }
742
743 static void
744 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
745 {
746 if (pmd->core_id != NON_PMD_CORE_ID) {
747 struct rxq_poll *poll;
748 const char *prev_name = NULL;
749
750 ds_put_format(reply,
751 "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
752 pmd->numa_id, pmd->core_id, (pmd->isolated)
753 ? "true" : "false");
754
755 ovs_mutex_lock(&pmd->port_mutex);
756 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
757 const char *name = netdev_get_name(poll->port->netdev);
758
759 if (!prev_name || strcmp(name, prev_name)) {
760 if (prev_name) {
761 ds_put_cstr(reply, "\n");
762 }
763 ds_put_format(reply, "\tport: %s\tqueue-id:",
764 netdev_get_name(poll->port->netdev));
765 }
766 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
767 prev_name = name;
768 }
769 ovs_mutex_unlock(&pmd->port_mutex);
770 ds_put_cstr(reply, "\n");
771 }
772 }
773
774 static void
775 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
776 void *aux)
777 {
778 struct ds reply = DS_EMPTY_INITIALIZER;
779 struct dp_netdev_pmd_thread *pmd;
780 struct dp_netdev *dp = NULL;
781 enum pmd_info_type type = *(enum pmd_info_type *) aux;
782
783 ovs_mutex_lock(&dp_netdev_mutex);
784
785 if (argc == 2) {
786 dp = shash_find_data(&dp_netdevs, argv[1]);
787 } else if (shash_count(&dp_netdevs) == 1) {
788 /* There's only one datapath */
789 dp = shash_first(&dp_netdevs)->data;
790 }
791
792 if (!dp) {
793 ovs_mutex_unlock(&dp_netdev_mutex);
794 unixctl_command_reply_error(conn,
795 "please specify an existing datapath");
796 return;
797 }
798
799 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
800 if (type == PMD_INFO_SHOW_RXQ) {
801 pmd_info_show_rxq(&reply, pmd);
802 } else {
803 unsigned long long stats[DP_N_STATS];
804 uint64_t cycles[PMD_N_CYCLES];
805 int i;
806
807 /* Read current stats and cycle counters */
808 for (i = 0; i < ARRAY_SIZE(stats); i++) {
809 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
810 }
811 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
812 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
813 }
814
815 if (type == PMD_INFO_CLEAR_STATS) {
816 pmd_info_clear_stats(&reply, pmd, stats, cycles);
817 } else if (type == PMD_INFO_SHOW_STATS) {
818 pmd_info_show_stats(&reply, pmd, stats, cycles);
819 }
820 }
821 }
822
823 ovs_mutex_unlock(&dp_netdev_mutex);
824
825 unixctl_command_reply(conn, ds_cstr(&reply));
826 ds_destroy(&reply);
827 }
828 \f
829 static int
830 dpif_netdev_init(void)
831 {
832 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
833 clear_aux = PMD_INFO_CLEAR_STATS,
834 poll_aux = PMD_INFO_SHOW_RXQ;
835
836 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
837 0, 1, dpif_netdev_pmd_info,
838 (void *)&show_aux);
839 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
840 0, 1, dpif_netdev_pmd_info,
841 (void *)&clear_aux);
842 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
843 0, 1, dpif_netdev_pmd_info,
844 (void *)&poll_aux);
845 return 0;
846 }
847
848 static int
849 dpif_netdev_enumerate(struct sset *all_dps,
850 const struct dpif_class *dpif_class)
851 {
852 struct shash_node *node;
853
854 ovs_mutex_lock(&dp_netdev_mutex);
855 SHASH_FOR_EACH(node, &dp_netdevs) {
856 struct dp_netdev *dp = node->data;
857 if (dpif_class != dp->class) {
858 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
859 * If the class doesn't match, skip this dpif. */
860 continue;
861 }
862 sset_add(all_dps, node->name);
863 }
864 ovs_mutex_unlock(&dp_netdev_mutex);
865
866 return 0;
867 }
868
869 static bool
870 dpif_netdev_class_is_dummy(const struct dpif_class *class)
871 {
872 return class != &dpif_netdev_class;
873 }
874
875 static const char *
876 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
877 {
878 return strcmp(type, "internal") ? type
879 : dpif_netdev_class_is_dummy(class) ? "dummy"
880 : "tap";
881 }
882
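/* Illustrative sketch (hypothetical, not part of the upstream file): the
 * local (bridge) port is created with the translated open type rather than
 * the literal "internal", i.e. "tap" for the system class and "dummy" for
 * the dummy classes. */
static inline const char *
local_port_open_type_example(const struct dpif_class *class)
{
    return dpif_netdev_port_open_type(class, "internal");
}
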
883 static struct dpif *
884 create_dpif_netdev(struct dp_netdev *dp)
885 {
886 uint16_t netflow_id = hash_string(dp->name, 0);
887 struct dpif_netdev *dpif;
888
889 ovs_refcount_ref(&dp->ref_cnt);
890
891 dpif = xmalloc(sizeof *dpif);
892 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
893 dpif->dp = dp;
894 dpif->last_port_seq = seq_read(dp->port_seq);
895
896 return &dpif->dpif;
897 }
898
899 /* Choose an unused, non-zero port number and return it on success.
900 * Return ODPP_NONE on failure. */
901 static odp_port_t
902 choose_port(struct dp_netdev *dp, const char *name)
903 OVS_REQUIRES(dp->port_mutex)
904 {
905 uint32_t port_no;
906
907 if (dp->class != &dpif_netdev_class) {
908 const char *p;
909 int start_no = 0;
910
911 /* If the port name begins with "br", start the number search at
912 * 100 to make writing tests easier. */
913 if (!strncmp(name, "br", 2)) {
914 start_no = 100;
915 }
916
917 /* If the port name contains a number, try to assign that port number.
918 * This can make writing unit tests easier because port numbers are
919 * predictable. */
920 for (p = name; *p != '\0'; p++) {
921 if (isdigit((unsigned char) *p)) {
922 port_no = start_no + strtol(p, NULL, 10);
923 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
924 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
925 return u32_to_odp(port_no);
926 }
927 break;
928 }
929 }
930 }
931
932 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
933 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
934 return u32_to_odp(port_no);
935 }
936 }
937
938 return ODPP_NONE;
939 }
940
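/* Illustrative sketch (hypothetical, not part of the upstream file): picking
 * a port number for a new device while holding 'dp->port_mutex', mirroring
 * how dpif_netdev_port_add() below reports EFBIG when the port number space
 * is exhausted. */
static inline int
choose_port_example(struct dp_netdev *dp, const char *name,
                    odp_port_t *port_nop)
    OVS_REQUIRES(dp->port_mutex)
{
    odp_port_t port_no = choose_port(dp, name);

    if (port_no == ODPP_NONE) {
        return EFBIG;               /* No free port numbers left. */
    }
    *port_nop = port_no;
    return 0;
}
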
941 static int
942 create_dp_netdev(const char *name, const struct dpif_class *class,
943 struct dp_netdev **dpp)
944 OVS_REQUIRES(dp_netdev_mutex)
945 {
946 struct dp_netdev *dp;
947 int error;
948
949 dp = xzalloc(sizeof *dp);
950 shash_add(&dp_netdevs, name, dp);
951
952 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
953 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
954 ovs_refcount_init(&dp->ref_cnt);
955 atomic_flag_clear(&dp->destroyed);
956
957 ovs_mutex_init(&dp->port_mutex);
958 hmap_init(&dp->ports);
959 dp->port_seq = seq_create();
960 fat_rwlock_init(&dp->upcall_rwlock);
961
962 dp->reconfigure_seq = seq_create();
963 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
964
965 /* Disable upcalls by default. */
966 dp_netdev_disable_upcall(dp);
967 dp->upcall_aux = NULL;
968 dp->upcall_cb = NULL;
969
970 cmap_init(&dp->poll_threads);
971 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
972 ovsthread_key_create(&dp->per_pmd_key, NULL);
973
974 ovs_mutex_lock(&dp->port_mutex);
975 dp_netdev_set_nonpmd(dp);
976
977 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
978 "internal"),
979 ODPP_LOCAL);
980 ovs_mutex_unlock(&dp->port_mutex);
981 if (error) {
982 dp_netdev_free(dp);
983 return error;
984 }
985
986 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
987 *dpp = dp;
988 return 0;
989 }
990
991 static void
992 dp_netdev_request_reconfigure(struct dp_netdev *dp)
993 {
994 seq_change(dp->reconfigure_seq);
995 }
996
997 static bool
998 dp_netdev_is_reconf_required(struct dp_netdev *dp)
999 {
1000 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1001 }
1002
1003 static int
1004 dpif_netdev_open(const struct dpif_class *class, const char *name,
1005 bool create, struct dpif **dpifp)
1006 {
1007 struct dp_netdev *dp;
1008 int error;
1009
1010 ovs_mutex_lock(&dp_netdev_mutex);
1011 dp = shash_find_data(&dp_netdevs, name);
1012 if (!dp) {
1013 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1014 } else {
1015 error = (dp->class != class ? EINVAL
1016 : create ? EEXIST
1017 : 0);
1018 }
1019 if (!error) {
1020 *dpifp = create_dpif_netdev(dp);
1021 dp->dpif = *dpifp;
1022 }
1023 ovs_mutex_unlock(&dp_netdev_mutex);
1024
1025 return error;
1026 }
1027
1028 static void
1029 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1030 OVS_NO_THREAD_SAFETY_ANALYSIS
1031 {
1032 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1033 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1034
1035 /* Before freeing a lock we should release it */
1036 fat_rwlock_unlock(&dp->upcall_rwlock);
1037 fat_rwlock_destroy(&dp->upcall_rwlock);
1038 }
1039
1040 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1041 * through the 'dp_netdevs' shash while freeing 'dp'. */
1042 static void
1043 dp_netdev_free(struct dp_netdev *dp)
1044 OVS_REQUIRES(dp_netdev_mutex)
1045 {
1046 struct dp_netdev_port *port, *next;
1047
1048 shash_find_and_delete(&dp_netdevs, dp->name);
1049
1050 dp_netdev_destroy_all_pmds(dp);
1051 ovs_mutex_destroy(&dp->non_pmd_mutex);
1052 ovsthread_key_delete(dp->per_pmd_key);
1053
1054 ovs_mutex_lock(&dp->port_mutex);
1055 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1056 do_del_port(dp, port);
1057 }
1058 ovs_mutex_unlock(&dp->port_mutex);
1059 cmap_destroy(&dp->poll_threads);
1060
1061 seq_destroy(dp->reconfigure_seq);
1062
1063 seq_destroy(dp->port_seq);
1064 hmap_destroy(&dp->ports);
1065 ovs_mutex_destroy(&dp->port_mutex);
1066
1067 /* Upcalls must be disabled at this point */
1068 dp_netdev_destroy_upcall_lock(dp);
1069
1070 free(dp->pmd_cmask);
1071 free(CONST_CAST(char *, dp->name));
1072 free(dp);
1073 }
1074
1075 static void
1076 dp_netdev_unref(struct dp_netdev *dp)
1077 {
1078 if (dp) {
1079 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1080 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1081 ovs_mutex_lock(&dp_netdev_mutex);
1082 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1083 dp_netdev_free(dp);
1084 }
1085 ovs_mutex_unlock(&dp_netdev_mutex);
1086 }
1087 }
1088
1089 static void
1090 dpif_netdev_close(struct dpif *dpif)
1091 {
1092 struct dp_netdev *dp = get_dp_netdev(dpif);
1093
1094 dp_netdev_unref(dp);
1095 free(dpif);
1096 }
1097
1098 static int
1099 dpif_netdev_destroy(struct dpif *dpif)
1100 {
1101 struct dp_netdev *dp = get_dp_netdev(dpif);
1102
1103 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1104 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1105 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1106 OVS_NOT_REACHED();
1107 }
1108 }
1109
1110 return 0;
1111 }
1112
1113 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1114 * load/store semantics. While the increment is not atomic, the load and
1115 * store operations are, making it impossible to read inconsistent values.
1116 *
1117 * This is used to update thread local stats counters. */
1118 static void
1119 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1120 {
1121 unsigned long long tmp;
1122
1123 atomic_read_relaxed(var, &tmp);
1124 tmp += n;
1125 atomic_store_relaxed(var, tmp);
1126 }
1127
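/* Illustrative usage sketch (hypothetical, not part of the upstream file):
 * bumping one of the per-pmd DP_STAT_* counters from its owning thread. */
static inline void
pmd_count_exact_hits_example(struct dp_netdev_pmd_thread *pmd, int cnt)
{
    non_atomic_ullong_add(&pmd->stats.n[DP_STAT_EXACT_HIT], cnt);
}
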
1128 static int
1129 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1130 {
1131 struct dp_netdev *dp = get_dp_netdev(dpif);
1132 struct dp_netdev_pmd_thread *pmd;
1133
1134 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1135 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1136 unsigned long long n;
1137 stats->n_flows += cmap_count(&pmd->flow_table);
1138
1139 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1140 stats->n_hit += n;
1141 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
1142 stats->n_hit += n;
1143 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1144 stats->n_missed += n;
1145 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1146 stats->n_lost += n;
1147 }
1148 stats->n_masks = UINT32_MAX;
1149 stats->n_mask_hit = UINT64_MAX;
1150
1151 return 0;
1152 }
1153
1154 static void
1155 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1156 {
1157 int old_seq;
1158
1159 if (pmd->core_id == NON_PMD_CORE_ID) {
1160 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1161 ovs_mutex_lock(&pmd->port_mutex);
1162 pmd_load_cached_ports(pmd);
1163 ovs_mutex_unlock(&pmd->port_mutex);
1164 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1165 return;
1166 }
1167
1168 ovs_mutex_lock(&pmd->cond_mutex);
1169 atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
1170 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1171 ovs_mutex_unlock(&pmd->cond_mutex);
1172 }
1173
1174 static uint32_t
1175 hash_port_no(odp_port_t port_no)
1176 {
1177 return hash_int(odp_to_u32(port_no), 0);
1178 }
1179
1180 static int
1181 port_create(const char *devname, const char *type,
1182 odp_port_t port_no, struct dp_netdev_port **portp)
1183 {
1184 struct netdev_saved_flags *sf;
1185 struct dp_netdev_port *port;
1186 enum netdev_flags flags;
1187 struct netdev *netdev;
1188 int n_open_rxqs = 0;
1189 int n_cores = 0;
1190 int i, error;
1191 bool dynamic_txqs = false;
1192
1193 *portp = NULL;
1194
1195 /* Open and validate network device. */
1196 error = netdev_open(devname, type, &netdev);
1197 if (error) {
1198 return error;
1199 }
1200 /* XXX reject non-Ethernet devices */
1201
1202 netdev_get_flags(netdev, &flags);
1203 if (flags & NETDEV_LOOPBACK) {
1204 VLOG_ERR("%s: cannot add a loopback device", devname);
1205 error = EINVAL;
1206 goto out;
1207 }
1208
1209 if (netdev_is_pmd(netdev)) {
1210 n_cores = ovs_numa_get_n_cores();
1211
1212 if (n_cores == OVS_CORE_UNSPEC) {
1213 VLOG_ERR("%s, cannot get cpu core info", devname);
1214 error = ENOENT;
1215 goto out;
1216 }
1217 /* There can only be ovs_numa_get_n_cores() pmd threads,
1218  * so create a txq for each of them, plus one extra for the
1219  * non-pmd threads. */
1220 error = netdev_set_tx_multiq(netdev, n_cores + 1);
1221 if (error && (error != EOPNOTSUPP)) {
1222 VLOG_ERR("%s, cannot set multiq", devname);
1223 goto out;
1224 }
1225 }
1226
1227 if (netdev_is_reconf_required(netdev)) {
1228 error = netdev_reconfigure(netdev);
1229 if (error) {
1230 goto out;
1231 }
1232 }
1233
1234 if (netdev_is_pmd(netdev)) {
1235 if (netdev_n_txq(netdev) < n_cores + 1) {
1236 dynamic_txqs = true;
1237 }
1238 }
1239
1240 port = xzalloc(sizeof *port);
1241 port->port_no = port_no;
1242 port->netdev = netdev;
1243 port->n_rxq = netdev_n_rxq(netdev);
1244 port->rxqs = xcalloc(port->n_rxq, sizeof *port->rxqs);
1245 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
1246 port->type = xstrdup(type);
1247 ovs_mutex_init(&port->txq_used_mutex);
1248 port->dynamic_txqs = dynamic_txqs;
1249
1250 for (i = 0; i < port->n_rxq; i++) {
1251 error = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
1252 if (error) {
1253 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
1254 devname, ovs_strerror(error));
1255 goto out_rxq_close;
1256 }
1257 port->rxqs[i].core_id = -1;
1258 n_open_rxqs++;
1259 }
1260
1261 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1262 if (error) {
1263 goto out_rxq_close;
1264 }
1265 port->sf = sf;
1266
1267 *portp = port;
1268
1269 return 0;
1270
1271 out_rxq_close:
1272 for (i = 0; i < n_open_rxqs; i++) {
1273 netdev_rxq_close(port->rxqs[i].rxq);
1274 }
1275 ovs_mutex_destroy(&port->txq_used_mutex);
1276 free(port->type);
1277 free(port->txq_used);
1278 free(port->rxqs);
1279 free(port);
1280
1281 out:
1282 netdev_close(netdev);
1283 return error;
1284 }
1285
1286 static int
1287 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1288 odp_port_t port_no)
1289 OVS_REQUIRES(dp->port_mutex)
1290 {
1291 struct dp_netdev_port *port;
1292 int error;
1293
1294 /* Reject devices already in 'dp'. */
1295 if (!get_port_by_name(dp, devname, &port)) {
1296 return EEXIST;
1297 }
1298
1299 error = port_create(devname, type, port_no, &port);
1300 if (error) {
1301 return error;
1302 }
1303
1304 if (netdev_is_pmd(port->netdev)) {
1305 int numa_id = netdev_get_numa_id(port->netdev);
1306
1307 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
1308 dp_netdev_set_pmds_on_numa(dp, numa_id);
1309 }
1310
1311 dp_netdev_add_port_to_pmds(dp, port);
1312
1313 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1314 seq_change(dp->port_seq);
1315
1316 return 0;
1317 }
1318
1319 static int
1320 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1321 odp_port_t *port_nop)
1322 {
1323 struct dp_netdev *dp = get_dp_netdev(dpif);
1324 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1325 const char *dpif_port;
1326 odp_port_t port_no;
1327 int error;
1328
1329 ovs_mutex_lock(&dp->port_mutex);
1330 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1331 if (*port_nop != ODPP_NONE) {
1332 port_no = *port_nop;
1333 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1334 } else {
1335 port_no = choose_port(dp, dpif_port);
1336 error = port_no == ODPP_NONE ? EFBIG : 0;
1337 }
1338 if (!error) {
1339 *port_nop = port_no;
1340 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1341 }
1342 ovs_mutex_unlock(&dp->port_mutex);
1343
1344 return error;
1345 }
1346
1347 static int
1348 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1349 {
1350 struct dp_netdev *dp = get_dp_netdev(dpif);
1351 int error;
1352
1353 ovs_mutex_lock(&dp->port_mutex);
1354 if (port_no == ODPP_LOCAL) {
1355 error = EINVAL;
1356 } else {
1357 struct dp_netdev_port *port;
1358
1359 error = get_port_by_number(dp, port_no, &port);
1360 if (!error) {
1361 do_del_port(dp, port);
1362 }
1363 }
1364 ovs_mutex_unlock(&dp->port_mutex);
1365
1366 return error;
1367 }
1368
1369 static bool
1370 is_valid_port_number(odp_port_t port_no)
1371 {
1372 return port_no != ODPP_NONE;
1373 }
1374
1375 static struct dp_netdev_port *
1376 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1377 OVS_REQUIRES(dp->port_mutex)
1378 {
1379 struct dp_netdev_port *port;
1380
1381 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1382 if (port->port_no == port_no) {
1383 return port;
1384 }
1385 }
1386 return NULL;
1387 }
1388
1389 static int
1390 get_port_by_number(struct dp_netdev *dp,
1391 odp_port_t port_no, struct dp_netdev_port **portp)
1392 OVS_REQUIRES(dp->port_mutex)
1393 {
1394 if (!is_valid_port_number(port_no)) {
1395 *portp = NULL;
1396 return EINVAL;
1397 } else {
1398 *portp = dp_netdev_lookup_port(dp, port_no);
1399 return *portp ? 0 : ENOENT;
1400 }
1401 }
1402
1403 static void
1404 port_destroy(struct dp_netdev_port *port)
1405 {
1406 if (!port) {
1407 return;
1408 }
1409
1410 netdev_close(port->netdev);
1411 netdev_restore_flags(port->sf);
1412
1413 for (unsigned i = 0; i < port->n_rxq; i++) {
1414 netdev_rxq_close(port->rxqs[i].rxq);
1415 }
1416 ovs_mutex_destroy(&port->txq_used_mutex);
1417 free(port->rxq_affinity_list);
1418 free(port->txq_used);
1419 free(port->rxqs);
1420 free(port->type);
1421 free(port);
1422 }
1423
1424 static int
1425 get_port_by_name(struct dp_netdev *dp,
1426 const char *devname, struct dp_netdev_port **portp)
1427 OVS_REQUIRES(dp->port_mutex)
1428 {
1429 struct dp_netdev_port *port;
1430
1431 HMAP_FOR_EACH (port, node, &dp->ports) {
1432 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1433 *portp = port;
1434 return 0;
1435 }
1436 }
1437 return ENOENT;
1438 }
1439
1440 static int
1441 get_n_pmd_threads(struct dp_netdev *dp)
1442 {
1443 /* There is one non-pmd thread in dp->poll_threads. */
1444 return cmap_count(&dp->poll_threads) - 1;
1445 }
1446
1447 static int
1448 get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
1449 {
1450 struct dp_netdev_pmd_thread *pmd;
1451 int n_pmds = 0;
1452
1453 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1454 if (pmd->numa_id == numa_id) {
1455 n_pmds++;
1456 }
1457 }
1458
1459 return n_pmds;
1460 }
1461
1462 /* Returns 'true' if there is a port whose pmd netdev is on numa
1463  * node 'numa_id'. */
1464 static bool
1465 has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
1466 OVS_REQUIRES(dp->port_mutex)
1467 {
1468 struct dp_netdev_port *port;
1469
1470 HMAP_FOR_EACH (port, node, &dp->ports) {
1471 if (netdev_is_pmd(port->netdev)
1472 && netdev_get_numa_id(port->netdev) == numa_id) {
1473 return true;
1474 }
1475 }
1476
1477 return false;
1478 }
1479
1480
1481 static void
1482 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1483 OVS_REQUIRES(dp->port_mutex)
1484 {
1485 hmap_remove(&dp->ports, &port->node);
1486 seq_change(dp->port_seq);
1487
1488 dp_netdev_del_port_from_all_pmds(dp, port);
1489
1490 if (netdev_is_pmd(port->netdev)) {
1491 int numa_id = netdev_get_numa_id(port->netdev);
1492
1493 /* PMD threads cannot be on an invalid numa node. */
1494 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
1495 /* If there is no pmd netdev left on the numa node, delete the pmd
1496  * threads for that numa node. */
1497 if (!has_pmd_port_for_numa(dp, numa_id)) {
1498 dp_netdev_del_pmds_on_numa(dp, numa_id);
1499 }
1500 }
1501
1502 port_destroy(port);
1503 }
1504
1505 static void
1506 answer_port_query(const struct dp_netdev_port *port,
1507 struct dpif_port *dpif_port)
1508 {
1509 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1510 dpif_port->type = xstrdup(port->type);
1511 dpif_port->port_no = port->port_no;
1512 }
1513
1514 static int
1515 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1516 struct dpif_port *dpif_port)
1517 {
1518 struct dp_netdev *dp = get_dp_netdev(dpif);
1519 struct dp_netdev_port *port;
1520 int error;
1521
1522 ovs_mutex_lock(&dp->port_mutex);
1523 error = get_port_by_number(dp, port_no, &port);
1524 if (!error && dpif_port) {
1525 answer_port_query(port, dpif_port);
1526 }
1527 ovs_mutex_unlock(&dp->port_mutex);
1528
1529 return error;
1530 }
1531
1532 static int
1533 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1534 struct dpif_port *dpif_port)
1535 {
1536 struct dp_netdev *dp = get_dp_netdev(dpif);
1537 struct dp_netdev_port *port;
1538 int error;
1539
1540 ovs_mutex_lock(&dp->port_mutex);
1541 error = get_port_by_name(dp, devname, &port);
1542 if (!error && dpif_port) {
1543 answer_port_query(port, dpif_port);
1544 }
1545 ovs_mutex_unlock(&dp->port_mutex);
1546
1547 return error;
1548 }
1549
1550 static void
1551 dp_netdev_flow_free(struct dp_netdev_flow *flow)
1552 {
1553 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1554 free(flow);
1555 }
1556
1557 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1558 {
1559 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1560 ovsrcu_postpone(dp_netdev_flow_free, flow);
1561 }
1562 }
1563
1564 static uint32_t
1565 dp_netdev_flow_hash(const ovs_u128 *ufid)
1566 {
1567 return ufid->u32[0];
1568 }
1569
1570 static void
1571 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1572 struct dp_netdev_flow *flow)
1573 OVS_REQUIRES(pmd->flow_mutex)
1574 {
1575 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
1576
1577 dpcls_remove(&pmd->cls, &flow->cr);
1578 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
1579 flow->dead = true;
1580
1581 dp_netdev_flow_unref(flow);
1582 }
1583
1584 static void
1585 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
1586 {
1587 struct dp_netdev_flow *netdev_flow;
1588
1589 ovs_mutex_lock(&pmd->flow_mutex);
1590 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1591 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
1592 }
1593 ovs_mutex_unlock(&pmd->flow_mutex);
1594 }
1595
1596 static int
1597 dpif_netdev_flow_flush(struct dpif *dpif)
1598 {
1599 struct dp_netdev *dp = get_dp_netdev(dpif);
1600 struct dp_netdev_pmd_thread *pmd;
1601
1602 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1603 dp_netdev_pmd_flow_flush(pmd);
1604 }
1605
1606 return 0;
1607 }
1608
1609 struct dp_netdev_port_state {
1610 struct hmap_position position;
1611 char *name;
1612 };
1613
1614 static int
1615 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1616 {
1617 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1618 return 0;
1619 }
1620
1621 static int
1622 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
1623 struct dpif_port *dpif_port)
1624 {
1625 struct dp_netdev_port_state *state = state_;
1626 struct dp_netdev *dp = get_dp_netdev(dpif);
1627 struct hmap_node *node;
1628 int retval;
1629
1630 ovs_mutex_lock(&dp->port_mutex);
1631 node = hmap_at_position(&dp->ports, &state->position);
1632 if (node) {
1633 struct dp_netdev_port *port;
1634
1635 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1636
1637 free(state->name);
1638 state->name = xstrdup(netdev_get_name(port->netdev));
1639 dpif_port->name = state->name;
1640 dpif_port->type = port->type;
1641 dpif_port->port_no = port->port_no;
1642
1643 retval = 0;
1644 } else {
1645 retval = EOF;
1646 }
1647 ovs_mutex_unlock(&dp->port_mutex);
1648
1649 return retval;
1650 }
1651
1652 static int
1653 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
1654 {
1655 struct dp_netdev_port_state *state = state_;
1656 free(state->name);
1657 free(state);
1658 return 0;
1659 }
1660
1661 static int
1662 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
1663 {
1664 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1665 uint64_t new_port_seq;
1666 int error;
1667
1668 new_port_seq = seq_read(dpif->dp->port_seq);
1669 if (dpif->last_port_seq != new_port_seq) {
1670 dpif->last_port_seq = new_port_seq;
1671 error = ENOBUFS;
1672 } else {
1673 error = EAGAIN;
1674 }
1675
1676 return error;
1677 }
1678
1679 static void
1680 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1681 {
1682 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1683
1684 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
1685 }
1686
1687 static struct dp_netdev_flow *
1688 dp_netdev_flow_cast(const struct dpcls_rule *cr)
1689 {
1690 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
1691 }
1692
1693 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1694 {
1695 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1696 }
1697
1698 /* netdev_flow_key utilities.
1699 *
1700 * netdev_flow_key is basically a miniflow. We use these functions
1701 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1702 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1703 *
1704 * - Since we are dealing exclusively with miniflows created by
1705 * miniflow_extract(), if the map is different the miniflow is different.
1706 * Therefore we can be faster by comparing the map and the miniflow in a
1707 * single memcmp().
1708 * - These functions can be inlined by the compiler. */
1709
1710 /* Given the number of bits set in miniflow's maps, returns the size of the
1711 * 'netdev_flow_key.mf' */
1712 static inline size_t
1713 netdev_flow_key_size(size_t flow_u64s)
1714 {
1715 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
1716 }
1717
1718 static inline bool
1719 netdev_flow_key_equal(const struct netdev_flow_key *a,
1720 const struct netdev_flow_key *b)
1721 {
1722 /* 'b->len' may not be set yet. */
1723 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
1724 }
1725
1726 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
1727 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
1728 * generated by miniflow_extract. */
1729 static inline bool
1730 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1731 const struct miniflow *mf)
1732 {
1733 return !memcmp(&key->mf, mf, key->len);
1734 }
1735
1736 static inline void
1737 netdev_flow_key_clone(struct netdev_flow_key *dst,
1738 const struct netdev_flow_key *src)
1739 {
1740 memcpy(dst, src,
1741 offsetof(struct netdev_flow_key, mf) + src->len);
1742 }
1743
1744 /* Slow. */
1745 static void
1746 netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1747 const struct flow *src)
1748 {
1749 struct dp_packet packet;
1750 uint64_t buf_stub[512 / 8];
1751
1752 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1753 pkt_metadata_from_flow(&packet.md, src);
1754 flow_compose(&packet, src);
1755 miniflow_extract(&packet, &dst->mf);
1756 dp_packet_uninit(&packet);
1757
1758 dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
1759 dst->hash = 0; /* Not computed yet. */
1760 }
1761
1762 /* Initialize a netdev_flow_key 'mask' from 'match'. */
1763 static inline void
1764 netdev_flow_mask_init(struct netdev_flow_key *mask,
1765 const struct match *match)
1766 {
1767 uint64_t *dst = miniflow_values(&mask->mf);
1768 struct flowmap fmap;
1769 uint32_t hash = 0;
1770 size_t idx;
1771
1772 /* Only check masks that make sense for the flow. */
1773 flow_wc_map(&match->flow, &fmap);
1774 flowmap_init(&mask->mf.map);
1775
1776 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1777 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
1778
1779 if (mask_u64) {
1780 flowmap_set(&mask->mf.map, idx, 1);
1781 *dst++ = mask_u64;
1782 hash = hash_add64(hash, mask_u64);
1783 }
1784 }
1785
1786 map_t map;
1787
1788 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1789 hash = hash_add64(hash, map);
1790 }
1791
1792 size_t n = dst - miniflow_get_values(&mask->mf);
1793
1794 mask->hash = hash_finish(hash, n * 8);
1795 mask->len = netdev_flow_key_size(n);
1796 }
1797
1798 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
1799 static inline void
1800 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1801 const struct flow *flow,
1802 const struct netdev_flow_key *mask)
1803 {
1804 uint64_t *dst_u64 = miniflow_values(&dst->mf);
1805 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
1806 uint32_t hash = 0;
1807 uint64_t value;
1808
1809 dst->len = mask->len;
1810 dst->mf = mask->mf; /* Copy maps. */
1811
1812 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
1813 *dst_u64 = value & *mask_u64++;
1814 hash = hash_add64(hash, *dst_u64++);
1815 }
1816 dst->hash = hash_finish(hash,
1817 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
1818 }
1819
1820 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
1821 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
1822 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
1823
1824 /* Returns a hash value for the bits of 'key' where there are 1-bits in
1825 * 'mask'. */
1826 static inline uint32_t
1827 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1828 const struct netdev_flow_key *mask)
1829 {
1830 const uint64_t *p = miniflow_get_values(&mask->mf);
1831 uint32_t hash = 0;
1832 uint64_t value;
1833
1834 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
1835 hash = hash_add64(hash, value & *p++);
1836 }
1837
1838 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
1839 }
1840
1841 static inline bool
1842 emc_entry_alive(struct emc_entry *ce)
1843 {
1844 return ce->flow && !ce->flow->dead;
1845 }
1846
1847 static void
1848 emc_clear_entry(struct emc_entry *ce)
1849 {
1850 if (ce->flow) {
1851 dp_netdev_flow_unref(ce->flow);
1852 ce->flow = NULL;
1853 }
1854 }
1855
1856 static inline void
1857 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
1858 const struct netdev_flow_key *key)
1859 {
1860 if (ce->flow != flow) {
1861 if (ce->flow) {
1862 dp_netdev_flow_unref(ce->flow);
1863 }
1864
1865 if (dp_netdev_flow_ref(flow)) {
1866 ce->flow = flow;
1867 } else {
1868 ce->flow = NULL;
1869 }
1870 }
1871 if (key) {
1872 netdev_flow_key_clone(&ce->key, key);
1873 }
1874 }
1875
1876 static inline void
1877 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
1878 struct dp_netdev_flow *flow)
1879 {
1880 struct emc_entry *to_be_replaced = NULL;
1881 struct emc_entry *current_entry;
1882
1883 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1884 if (netdev_flow_key_equal(&current_entry->key, key)) {
1885 /* We found the entry with the 'mf' miniflow */
1886 emc_change_entry(current_entry, flow, NULL);
1887 return;
1888 }
1889
1890         /* Replacement policy: prefer an empty (not alive) entry; otherwise
1891          * replace the probed entry with the lowest hash. */
1892 if (!to_be_replaced
1893 || (emc_entry_alive(to_be_replaced)
1894 && !emc_entry_alive(current_entry))
1895 || current_entry->key.hash < to_be_replaced->key.hash) {
1896 to_be_replaced = current_entry;
1897 }
1898 }
1899 /* We didn't find the miniflow in the cache.
1900 * The 'to_be_replaced' entry is where the new flow will be stored */
1901
1902 emc_change_entry(to_be_replaced, flow, key);
1903 }
1904
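/* Looks up 'key' in the exact match cache 'cache'.  Returns the cached flow
 * if an alive entry with the same hash and an exactly matching miniflow is
 * found, otherwise NULL. */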
1905 static inline struct dp_netdev_flow *
1906 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
1907 {
1908 struct emc_entry *current_entry;
1909
1910 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1911 if (current_entry->key.hash == key->hash
1912 && emc_entry_alive(current_entry)
1913 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
1914
1915 /* We found the entry with the 'key->mf' miniflow */
1916 return current_entry->flow;
1917 }
1918 }
1919
1920 return NULL;
1921 }
1922
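/* Looks up 'key' in the datapath classifier of 'pmd'.  Returns the matching
 * dp_netdev_flow, or NULL if there is no match. */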
1923 static struct dp_netdev_flow *
1924 dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
1925 const struct netdev_flow_key *key)
1926 {
1927 struct dp_netdev_flow *netdev_flow;
1928 struct dpcls_rule *rule;
1929
1930 dpcls_lookup(&pmd->cls, key, &rule, 1);
1931 netdev_flow = dp_netdev_flow_cast(rule);
1932
1933 return netdev_flow;
1934 }
1935
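/* Finds the flow in 'pmd' with unique identifier 'ufidp'.  If 'ufidp' is
 * NULL, a UFID is derived from the netlink-formatted 'key'.  Returns NULL if
 * no such flow exists. */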
1936 static struct dp_netdev_flow *
1937 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
1938 const ovs_u128 *ufidp, const struct nlattr *key,
1939 size_t key_len)
1940 {
1941 struct dp_netdev_flow *netdev_flow;
1942 struct flow flow;
1943 ovs_u128 ufid;
1944
1945 /* If a UFID is not provided, determine one based on the key. */
1946 if (!ufidp && key && key_len
1947 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
1948 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
1949 ufidp = &ufid;
1950 }
1951
1952 if (ufidp) {
1953 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1954 &pmd->flow_table) {
1955 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
1956 return netdev_flow;
1957 }
1958 }
1959 }
1960
1961 return NULL;
1962 }
1963
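/* Copies the statistics of 'netdev_flow_' into 'stats'.  The per-flow
 * counters are read with relaxed atomics, since they are written only by the
 * owning pmd thread. */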
1964 static void
1965 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1966 struct dpif_flow_stats *stats)
1967 {
1968 struct dp_netdev_flow *netdev_flow;
1969 unsigned long long n;
1970 long long used;
1971 uint16_t flags;
1972
1973 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
1974
1975 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
1976 stats->n_packets = n;
1977 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
1978 stats->n_bytes = n;
1979 atomic_read_relaxed(&netdev_flow->stats.used, &used);
1980 stats->used = used;
1981 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
1982 stats->tcp_flags = flags;
1983 }
1984
1985 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
1986 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
1987 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
1988 * protect them. */
1989 static void
1990 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
1991 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
1992 struct dpif_flow *flow, bool terse)
1993 {
1994 if (terse) {
1995 memset(flow, 0, sizeof *flow);
1996 } else {
1997 struct flow_wildcards wc;
1998 struct dp_netdev_actions *actions;
1999 size_t offset;
2000 struct odp_flow_key_parms odp_parms = {
2001 .flow = &netdev_flow->flow,
2002 .mask = &wc.masks,
2003 .support = dp_netdev_support,
2004 };
2005
2006 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2007
2008 /* Key */
2009 offset = key_buf->size;
2010 flow->key = ofpbuf_tail(key_buf);
2011 odp_flow_key_from_flow(&odp_parms, key_buf);
2012 flow->key_len = key_buf->size - offset;
2013
2014 /* Mask */
2015 offset = mask_buf->size;
2016 flow->mask = ofpbuf_tail(mask_buf);
2017 odp_parms.key_buf = key_buf;
2018 odp_flow_key_from_mask(&odp_parms, mask_buf);
2019 flow->mask_len = mask_buf->size - offset;
2020
2021 /* Actions */
2022 actions = dp_netdev_flow_get_actions(netdev_flow);
2023 flow->actions = actions->actions;
2024 flow->actions_len = actions->size;
2025 }
2026
2027 flow->ufid = netdev_flow->ufid;
2028 flow->ufid_present = true;
2029 flow->pmd_id = netdev_flow->pmd_id;
2030 get_dpif_flow_stats(netdev_flow, &flow->stats);
2031 }
2032
2033 static int
2034 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2035 const struct nlattr *mask_key,
2036 uint32_t mask_key_len, const struct flow *flow,
2037 struct flow_wildcards *wc)
2038 {
2039 enum odp_key_fitness fitness;
2040
2041 fitness = odp_flow_key_to_mask_udpif(mask_key, mask_key_len, key,
2042 key_len, wc, flow);
2043 if (fitness) {
2044 /* This should not happen: it indicates that
2045 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2046 * disagree on the acceptable form of a mask. Log the problem
2047 * as an error, with enough details to enable debugging. */
2048 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2049
2050 if (!VLOG_DROP_ERR(&rl)) {
2051 struct ds s;
2052
2053 ds_init(&s);
2054 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2055 true);
2056 VLOG_ERR("internal error parsing flow mask %s (%s)",
2057 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2058 ds_destroy(&s);
2059 }
2060
2061 return EINVAL;
2062 }
2063
2064 return 0;
2065 }
2066
2067 static int
2068 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2069 struct flow *flow)
2070 {
2071 odp_port_t in_port;
2072
2073 if (odp_flow_key_to_flow_udpif(key, key_len, flow)) {
2074 /* This should not happen: it indicates that odp_flow_key_from_flow()
2075 * and odp_flow_key_to_flow() disagree on the acceptable form of a
2076 * flow. Log the problem as an error, with enough details to enable
2077 * debugging. */
2078 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2079
2080 if (!VLOG_DROP_ERR(&rl)) {
2081 struct ds s;
2082
2083 ds_init(&s);
2084 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2085 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2086 ds_destroy(&s);
2087 }
2088
2089 return EINVAL;
2090 }
2091
2092 in_port = flow->in_port.odp_port;
2093 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
2094 return EINVAL;
2095 }
2096
2097 /* Userspace datapath doesn't support conntrack. */
2098 if (flow->ct_state || flow->ct_zone || flow->ct_mark
2099 || !ovs_u128_is_zero(flow->ct_label)) {
2100 return EINVAL;
2101 }
2102
2103 return 0;
2104 }
2105
2106 static int
2107 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2108 {
2109 struct dp_netdev *dp = get_dp_netdev(dpif);
2110 struct dp_netdev_flow *netdev_flow;
2111 struct dp_netdev_pmd_thread *pmd;
2112 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2113 struct hmapx_node *node;
2114 int error = EINVAL;
2115
2116 if (get->pmd_id == PMD_ID_NULL) {
2117 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2118 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2119 dp_netdev_pmd_unref(pmd);
2120 }
2121 }
2122 } else {
2123 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2124 if (!pmd) {
2125 goto out;
2126 }
2127 hmapx_add(&to_find, pmd);
2128 }
2129
2130 if (!hmapx_count(&to_find)) {
2131 goto out;
2132 }
2133
2134 HMAPX_FOR_EACH (node, &to_find) {
2135 pmd = (struct dp_netdev_pmd_thread *) node->data;
2136 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2137 get->key_len);
2138 if (netdev_flow) {
2139 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2140 get->flow, false);
2141 error = 0;
2142 break;
2143 } else {
2144 error = ENOENT;
2145 }
2146 }
2147
2148 HMAPX_FOR_EACH (node, &to_find) {
2149 pmd = (struct dp_netdev_pmd_thread *) node->data;
2150 dp_netdev_pmd_unref(pmd);
2151 }
2152 out:
2153 hmapx_destroy(&to_find);
2154 return error;
2155 }
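/* Creates a new flow from 'match', 'ufid' and 'actions' and inserts it into
 * the classifier and flow table of 'pmd'.  Returns the new flow. */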
2156
2157 static struct dp_netdev_flow *
2158 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2159 struct match *match, const ovs_u128 *ufid,
2160 const struct nlattr *actions, size_t actions_len)
2161 OVS_REQUIRES(pmd->flow_mutex)
2162 {
2163 struct dp_netdev_flow *flow;
2164 struct netdev_flow_key mask;
2165
2166 netdev_flow_mask_init(&mask, match);
2167 /* Make sure wc does not have metadata. */
2168 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2169 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2170
2171 /* Do not allocate extra space. */
2172 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2173 memset(&flow->stats, 0, sizeof flow->stats);
2174 flow->dead = false;
2175 flow->batch = NULL;
2176 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2177 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2178 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2179 ovs_refcount_init(&flow->ref_cnt);
2180 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2181
2182 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2183 dpcls_insert(&pmd->cls, &flow->cr, &mask);
2184
2185 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2186 dp_netdev_flow_hash(&flow->ufid));
2187
2188 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
2189 struct ds ds = DS_EMPTY_INITIALIZER;
2190 struct ofpbuf key_buf, mask_buf;
2191 struct odp_flow_key_parms odp_parms = {
2192 .flow = &match->flow,
2193 .mask = &match->wc.masks,
2194 .support = dp_netdev_support,
2195 };
2196
2197 ofpbuf_init(&key_buf, 0);
2198 ofpbuf_init(&mask_buf, 0);
2199
2200 odp_flow_key_from_flow(&odp_parms, &key_buf);
2201 odp_parms.key_buf = &key_buf;
2202 odp_flow_key_from_mask(&odp_parms, &mask_buf);
2203
2204 ds_put_cstr(&ds, "flow_add: ");
2205 odp_format_ufid(ufid, &ds);
2206 ds_put_cstr(&ds, " ");
2207 odp_flow_format(key_buf.data, key_buf.size,
2208 mask_buf.data, mask_buf.size,
2209 NULL, &ds, false);
2210 ds_put_cstr(&ds, ", actions:");
2211 format_odp_actions(&ds, actions, actions_len);
2212
2213 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2214
2215 ofpbuf_uninit(&key_buf);
2216 ofpbuf_uninit(&mask_buf);
2217 ds_destroy(&ds);
2218 }
2219
2220 return flow;
2221 }
2222
2223 static int
2224 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2225 {
2226 struct dp_netdev *dp = get_dp_netdev(dpif);
2227 struct dp_netdev_flow *netdev_flow;
2228 struct netdev_flow_key key;
2229 struct dp_netdev_pmd_thread *pmd;
2230 struct match match;
2231 ovs_u128 ufid;
2232 unsigned pmd_id = put->pmd_id == PMD_ID_NULL
2233 ? NON_PMD_CORE_ID : put->pmd_id;
2234 int error;
2235
2236 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
2237 if (error) {
2238 return error;
2239 }
2240 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2241 put->mask, put->mask_len,
2242 &match.flow, &match.wc);
2243 if (error) {
2244 return error;
2245 }
2246
2247 pmd = dp_netdev_get_pmd(dp, pmd_id);
2248 if (!pmd) {
2249 return EINVAL;
2250 }
2251
2252 /* Must produce a netdev_flow_key for lookup.
2253 * This interface is no longer performance critical, since it is not used
2254 * for upcall processing any more. */
2255 netdev_flow_key_from_flow(&key, &match.flow);
2256
2257 if (put->ufid) {
2258 ufid = *put->ufid;
2259 } else {
2260 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2261 }
2262
2263 ovs_mutex_lock(&pmd->flow_mutex);
2264 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key);
2265 if (!netdev_flow) {
2266 if (put->flags & DPIF_FP_CREATE) {
2267 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2268 if (put->stats) {
2269 memset(put->stats, 0, sizeof *put->stats);
2270 }
2271 dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
2272 put->actions_len);
2273 error = 0;
2274 } else {
2275 error = EFBIG;
2276 }
2277 } else {
2278 error = ENOENT;
2279 }
2280 } else {
2281 if (put->flags & DPIF_FP_MODIFY
2282 && flow_equal(&match.flow, &netdev_flow->flow)) {
2283 struct dp_netdev_actions *new_actions;
2284 struct dp_netdev_actions *old_actions;
2285
2286 new_actions = dp_netdev_actions_create(put->actions,
2287 put->actions_len);
2288
2289 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2290 ovsrcu_set(&netdev_flow->actions, new_actions);
2291
2292 if (put->stats) {
2293 get_dpif_flow_stats(netdev_flow, put->stats);
2294 }
2295 if (put->flags & DPIF_FP_ZERO_STATS) {
2296 /* XXX: The userspace datapath uses thread local statistics
2297 * (for flows), which should be updated only by the owning
2298                  * thread. Since we cannot write to the stats memory here,
2299 * we choose not to support this flag. Please note:
2300 * - This feature is currently used only by dpctl commands with
2301 * option --clear.
2302 * - Should the need arise, this operation can be implemented
2303                  *   by keeping a base value (to be updated here) for each
2304 * counter, and subtracting it before outputting the stats */
2305 error = EOPNOTSUPP;
2306 }
2307
2308 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2309 } else if (put->flags & DPIF_FP_CREATE) {
2310 error = EEXIST;
2311 } else {
2312 /* Overlapping flow. */
2313 error = EINVAL;
2314 }
2315 }
2316 ovs_mutex_unlock(&pmd->flow_mutex);
2317 dp_netdev_pmd_unref(pmd);
2318
2319 return error;
2320 }
2321
2322 static int
2323 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
2324 {
2325 struct dp_netdev *dp = get_dp_netdev(dpif);
2326 struct dp_netdev_flow *netdev_flow;
2327 struct dp_netdev_pmd_thread *pmd;
2328 unsigned pmd_id = del->pmd_id == PMD_ID_NULL
2329 ? NON_PMD_CORE_ID : del->pmd_id;
2330 int error = 0;
2331
2332 pmd = dp_netdev_get_pmd(dp, pmd_id);
2333 if (!pmd) {
2334 return EINVAL;
2335 }
2336
2337 ovs_mutex_lock(&pmd->flow_mutex);
2338 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2339 del->key_len);
2340 if (netdev_flow) {
2341 if (del->stats) {
2342 get_dpif_flow_stats(netdev_flow, del->stats);
2343 }
2344 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2345 } else {
2346 error = ENOENT;
2347 }
2348 ovs_mutex_unlock(&pmd->flow_mutex);
2349 dp_netdev_pmd_unref(pmd);
2350
2351 return error;
2352 }
2353
2354 struct dpif_netdev_flow_dump {
2355 struct dpif_flow_dump up;
2356 struct cmap_position poll_thread_pos;
2357 struct cmap_position flow_pos;
2358 struct dp_netdev_pmd_thread *cur_pmd;
2359 int status;
2360 struct ovs_mutex mutex;
2361 };
2362
2363 static struct dpif_netdev_flow_dump *
2364 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
2365 {
2366 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
2367 }
2368
2369 static struct dpif_flow_dump *
2370 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
2371 {
2372 struct dpif_netdev_flow_dump *dump;
2373
2374 dump = xzalloc(sizeof *dump);
2375 dpif_flow_dump_init(&dump->up, dpif_);
2376 dump->up.terse = terse;
2377 ovs_mutex_init(&dump->mutex);
2378
2379 return &dump->up;
2380 }
2381
2382 static int
2383 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
2384 {
2385 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2386
2387 ovs_mutex_destroy(&dump->mutex);
2388 free(dump);
2389 return 0;
2390 }
2391
2392 struct dpif_netdev_flow_dump_thread {
2393 struct dpif_flow_dump_thread up;
2394 struct dpif_netdev_flow_dump *dump;
2395 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2396 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
2397 };
2398
2399 static struct dpif_netdev_flow_dump_thread *
2400 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2401 {
2402 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2403 }
2404
2405 static struct dpif_flow_dump_thread *
2406 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2407 {
2408 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2409 struct dpif_netdev_flow_dump_thread *thread;
2410
2411 thread = xmalloc(sizeof *thread);
2412 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2413 thread->dump = dump;
2414 return &thread->up;
2415 }
2416
2417 static void
2418 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2419 {
2420 struct dpif_netdev_flow_dump_thread *thread
2421 = dpif_netdev_flow_dump_thread_cast(thread_);
2422
2423 free(thread);
2424 }
2425
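/* Dumps up to 'max_flows' flows into 'flows', walking the flow tables of the
 * pmd threads one at a time.  Returns the number of flows dumped. */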
2426 static int
2427 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
2428 struct dpif_flow *flows, int max_flows)
2429 {
2430 struct dpif_netdev_flow_dump_thread *thread
2431 = dpif_netdev_flow_dump_thread_cast(thread_);
2432 struct dpif_netdev_flow_dump *dump = thread->dump;
2433 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
2434 int n_flows = 0;
2435 int i;
2436
2437 ovs_mutex_lock(&dump->mutex);
2438 if (!dump->status) {
2439 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2440 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2441 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2442 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2443
2444         /* On the first call to dump_next(), extracts the first pmd thread.
2445          * If there is no pmd thread, returns immediately. */
2446 if (!pmd) {
2447 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2448 if (!pmd) {
2449 ovs_mutex_unlock(&dump->mutex);
2450 return n_flows;
2451
2452 }
2453 }
2454
2455 do {
2456 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2457 struct cmap_node *node;
2458
2459 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2460 if (!node) {
2461 break;
2462 }
2463 netdev_flows[n_flows] = CONTAINER_OF(node,
2464 struct dp_netdev_flow,
2465 node);
2466 }
2467             /* When we finish dumping the current pmd thread, move on to
2468              * the next one. */
2469 if (n_flows < flow_limit) {
2470 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2471 dp_netdev_pmd_unref(pmd);
2472 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2473 if (!pmd) {
2474 dump->status = EOF;
2475 break;
2476 }
2477 }
2478             /* Keeps the reference for the next caller. */
2479 dump->cur_pmd = pmd;
2480
2481             /* If the current dump is empty, do not exit the loop, since the
2482              * remaining pmds could have flows to be dumped. Just dump again
2483              * from the new 'pmd'. */
2484 } while (!n_flows);
2485 }
2486 ovs_mutex_unlock(&dump->mutex);
2487
2488 for (i = 0; i < n_flows; i++) {
2489 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2490 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2491 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2492 struct dpif_flow *f = &flows[i];
2493 struct ofpbuf key, mask;
2494
2495 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2496 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
2497 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2498 dump->up.terse);
2499 }
2500
2501 return n_flows;
2502 }
2503
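/* Executes 'execute->actions' on 'execute->packet' in the context of the
 * current thread (pmd or non-pmd).  Packets shorter than an Ethernet header
 * or larger than UINT16_MAX bytes are rejected with EINVAL. */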
2504 static int
2505 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
2506 OVS_NO_THREAD_SAFETY_ANALYSIS
2507 {
2508 struct dp_netdev *dp = get_dp_netdev(dpif);
2509 struct dp_netdev_pmd_thread *pmd;
2510 struct dp_packet_batch pp;
2511
2512 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2513 dp_packet_size(execute->packet) > UINT16_MAX) {
2514 return EINVAL;
2515 }
2516
2517     /* Tries to find the 'pmd'. If NULL is returned, the current thread
2518      * is a non-pmd thread and should use
2519      * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
2520 pmd = ovsthread_getspecific(dp->per_pmd_key);
2521 if (!pmd) {
2522 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
2523 }
2524
2525     /* If the current thread is a non-pmd thread, acquire
2526      * the 'non_pmd_mutex'. */
2527 if (pmd->core_id == NON_PMD_CORE_ID) {
2528 ovs_mutex_lock(&dp->non_pmd_mutex);
2529 }
2530
2531 /* The action processing expects the RSS hash to be valid, because
2532 * it's always initialized at the beginning of datapath processing.
2533 * In this case, though, 'execute->packet' may not have gone through
2534      * the datapath at all; it may have been generated by the upper layer
2535 * (OpenFlow packet-out, BFD frame, ...). */
2536 if (!dp_packet_rss_valid(execute->packet)) {
2537 dp_packet_set_rss_hash(execute->packet,
2538 flow_hash_5tuple(execute->flow, 0));
2539 }
2540
2541 packet_batch_init_packet(&pp, execute->packet);
2542 dp_netdev_execute_actions(pmd, &pp, false, execute->actions,
2543 execute->actions_len, time_msec());
2544
2545 if (pmd->core_id == NON_PMD_CORE_ID) {
2546 ovs_mutex_unlock(&dp->non_pmd_mutex);
2547 dp_netdev_pmd_unref(pmd);
2548 }
2549
2550 return 0;
2551 }
2552
2553 static void
2554 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2555 {
2556 size_t i;
2557
2558 for (i = 0; i < n_ops; i++) {
2559 struct dpif_op *op = ops[i];
2560
2561 switch (op->type) {
2562 case DPIF_OP_FLOW_PUT:
2563 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2564 break;
2565
2566 case DPIF_OP_FLOW_DEL:
2567 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2568 break;
2569
2570 case DPIF_OP_EXECUTE:
2571 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2572 break;
2573
2574 case DPIF_OP_FLOW_GET:
2575 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2576 break;
2577 }
2578 }
2579 }
2580
2581 /* Changes the number or the affinity of pmd threads. The changes are actually
2582 * applied in dpif_netdev_run(). */
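/* For example, a 'cmask' of "6" (hex, binary 110) allows pmd threads to run
 * on cores 1 and 2: each set bit in the mask enables the corresponding core. */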
2583 static int
2584 dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
2585 {
2586 struct dp_netdev *dp = get_dp_netdev(dpif);
2587
2588 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
2589 free(dp->pmd_cmask);
2590 dp->pmd_cmask = nullable_xstrdup(cmask);
2591 dp_netdev_request_reconfigure(dp);
2592 }
2593
2594 return 0;
2595 }
2596
2597 /* Parses affinity list and returns result in 'core_ids'. */
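/* For example, an 'affinity_list' of "0:3,1:7" assigns rx queue 0 to core 3
 * and rx queue 1 to core 7; queues that are not listed keep the default core
 * id of -1 (not pinned). */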
2598 static int
2599 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
2600 {
2601 unsigned i;
2602 char *list, *copy, *key, *value;
2603 int error = 0;
2604
2605 for (i = 0; i < n_rxq; i++) {
2606 core_ids[i] = -1;
2607 }
2608
2609 if (!affinity_list) {
2610 return 0;
2611 }
2612
2613 list = copy = xstrdup(affinity_list);
2614
2615 while (ofputil_parse_key_value(&list, &key, &value)) {
2616 int rxq_id, core_id;
2617
2618 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
2619 || !str_to_int(value, 0, &core_id) || core_id < 0) {
2620 error = EINVAL;
2621 break;
2622 }
2623
2624 if (rxq_id < n_rxq) {
2625 core_ids[rxq_id] = core_id;
2626 }
2627 }
2628
2629 free(copy);
2630 return error;
2631 }
2632
2633 /* Parses 'affinity_list' and applies configuration if it is valid. */
2634 static int
2635 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
2636 const char *affinity_list)
2637 {
2638 unsigned *core_ids, i;
2639 int error = 0;
2640
2641 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
2642 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
2643 error = EINVAL;
2644 goto exit;
2645 }
2646
2647 for (i = 0; i < port->n_rxq; i++) {
2648 port->rxqs[i].core_id = core_ids[i];
2649 }
2650
2651 exit:
2652 free(core_ids);
2653 return error;
2654 }
2655
2656 /* Changes the affinity of port's rx queues. The changes are actually applied
2657 * in dpif_netdev_run(). */
2658 static int
2659 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
2660 const struct smap *cfg)
2661 {
2662 struct dp_netdev *dp = get_dp_netdev(dpif);
2663 struct dp_netdev_port *port;
2664 int error = 0;
2665 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
2666
2667 ovs_mutex_lock(&dp->port_mutex);
2668 error = get_port_by_number(dp, port_no, &port);
2669 if (error || !netdev_is_pmd(port->netdev)
2670 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
2671 goto unlock;
2672 }
2673
2674 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
2675 if (error) {
2676 goto unlock;
2677 }
2678 free(port->rxq_affinity_list);
2679 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
2680
2681 dp_netdev_request_reconfigure(dp);
2682 unlock:
2683 ovs_mutex_unlock(&dp->port_mutex);
2684 return error;
2685 }
2686
2687 static int
2688 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2689 uint32_t queue_id, uint32_t *priority)
2690 {
2691 *priority = queue_id;
2692 return 0;
2693 }
2694
2695 \f
2696 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2697  * a copy of the 'size' bytes of 'actions'. */
2698 struct dp_netdev_actions *
2699 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2700 {
2701 struct dp_netdev_actions *netdev_actions;
2702
2703 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2704 memcpy(netdev_actions->actions, actions, size);
2705 netdev_actions->size = size;
2706
2707 return netdev_actions;
2708 }
2709
2710 struct dp_netdev_actions *
2711 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
2712 {
2713 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
2714 }
2715
2716 static void
2717 dp_netdev_actions_free(struct dp_netdev_actions *actions)
2718 {
2719 free(actions);
2720 }
2721 \f
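/* Returns the current value of the CPU's time stamp counter when built with
 * DPDK support, otherwise 0.  Used only for pmd cycle accounting. */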
2722 static inline unsigned long long
2723 cycles_counter(void)
2724 {
2725 #ifdef DPDK_NETDEV
2726 return rte_get_tsc_cycles();
2727 #else
2728 return 0;
2729 #endif
2730 }
2731
2732 /* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2733 extern struct ovs_mutex cycles_counter_fake_mutex;
2734
2735 /* Start counting cycles. Must be followed by 'cycles_count_end()' */
2736 static inline void
2737 cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2738 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2739 OVS_NO_THREAD_SAFETY_ANALYSIS
2740 {
2741 pmd->last_cycles = cycles_counter();
2742 }
2743
2744 /* Stop counting cycles and add them to the counter 'type' */
2745 static inline void
2746 cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2747 enum pmd_cycles_counter_type type)
2748 OVS_RELEASES(&cycles_counter_fake_mutex)
2749 OVS_NO_THREAD_SAFETY_ANALYSIS
2750 {
2751 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2752
2753 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2754 }
2755
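/* Polls 'rxq' once and, if packets were received, passes the batch through
 * the datapath on behalf of 'pmd'.  Polling and processing cycles are
 * accounted separately. */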
2756 static void
2757 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
2758 struct dp_netdev_port *port,
2759 struct netdev_rxq *rxq)
2760 {
2761 struct dp_packet_batch batch;
2762 int error;
2763
2764 dp_packet_batch_init(&batch);
2765 cycles_count_start(pmd);
2766 error = netdev_rxq_recv(rxq, &batch);
2767 cycles_count_end(pmd, PMD_CYCLES_POLLING);
2768 if (!error) {
2769 *recirc_depth_get() = 0;
2770
2771 cycles_count_start(pmd);
2772 dp_netdev_input(pmd, &batch, port->port_no);
2773 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
2774 } else if (error != EAGAIN && error != EOPNOTSUPP) {
2775 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2776
2777 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
2778 netdev_get_name(port->netdev), ovs_strerror(error));
2779 }
2780 }
2781
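/* Applies pending configuration changes on 'port''s netdev, if any: closes
 * and reopens its rx queues and reallocates the tx queue usage counters.
 * Returns 0 on success, otherwise a positive errno value. */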
2782 static int
2783 port_reconfigure(struct dp_netdev_port *port)
2784 {
2785 struct netdev *netdev = port->netdev;
2786 int i, err;
2787
2788 if (!netdev_is_reconf_required(netdev)) {
2789 return 0;
2790 }
2791
2792 /* Closes the existing 'rxq's. */
2793 for (i = 0; i < port->n_rxq; i++) {
2794 netdev_rxq_close(port->rxqs[i].rxq);
2795 port->rxqs[i].rxq = NULL;
2796 }
2797 port->n_rxq = 0;
2798
2799 /* Allows 'netdev' to apply the pending configuration changes. */
2800 err = netdev_reconfigure(netdev);
2801 if (err && (err != EOPNOTSUPP)) {
2802         VLOG_ERR("Failed to apply new configuration to interface %s",
2803 netdev_get_name(netdev));
2804 return err;
2805 }
2806 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
2807 port->rxqs = xrealloc(port->rxqs,
2808 sizeof *port->rxqs * netdev_n_rxq(netdev));
2809 /* Realloc 'used' counters for tx queues. */
2810 free(port->txq_used);
2811 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
2812
2813 for (i = 0; i < netdev_n_rxq(netdev); i++) {
2814 err = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
2815 if (err) {
2816 return err;
2817 }
2818 port->n_rxq++;
2819 }
2820
2821 /* Parse affinity list to apply configuration for new queues. */
2822 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
2823
2824 return 0;
2825 }
2826
2827 static void
2828 reconfigure_pmd_threads(struct dp_netdev *dp)
2829 OVS_REQUIRES(dp->port_mutex)
2830 {
2831 struct dp_netdev_port *port, *next;
2832 int n_cores;
2833
2834 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
2835
2836 dp_netdev_destroy_all_pmds(dp);
2837
2838 /* Reconfigures the cpu mask. */
2839 ovs_numa_set_cpu_mask(dp->pmd_cmask);
2840
2841 n_cores = ovs_numa_get_n_cores();
2842 if (n_cores == OVS_CORE_UNSPEC) {
2843 VLOG_ERR("Cannot get cpu core info");
2844 return;
2845 }
2846
2847 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
2848 int err;
2849
2850 err = port_reconfigure(port);
2851 if (err) {
2852 hmap_remove(&dp->ports, &port->node);
2853 seq_change(dp->port_seq);
2854 port_destroy(port);
2855 } else {
2856 port->dynamic_txqs = netdev_n_txq(port->netdev) < n_cores + 1;
2857 }
2858 }
2859     /* Restores the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
2860 dp_netdev_set_nonpmd(dp);
2861 /* Restores all pmd threads. */
2862 dp_netdev_reset_pmd_threads(dp);
2863 }
2864
2865 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
2866 static bool
2867 ports_require_restart(const struct dp_netdev *dp)
2868 OVS_REQUIRES(dp->port_mutex)
2869 {
2870 struct dp_netdev_port *port;
2871
2872 HMAP_FOR_EACH (port, node, &dp->ports) {
2873 if (netdev_is_reconf_required(port->netdev)) {
2874 return true;
2875 }
2876 }
2877
2878 return false;
2879 }
2880
2881 /* Returns true if the datapath flows need to be revalidated. */
2882 static bool
2883 dpif_netdev_run(struct dpif *dpif)
2884 {
2885 struct dp_netdev_port *port;
2886 struct dp_netdev *dp = get_dp_netdev(dpif);
2887 struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_pmd(dp,
2888 NON_PMD_CORE_ID);
2889 uint64_t new_tnl_seq;
2890
2891 ovs_mutex_lock(&dp->port_mutex);
2892 ovs_mutex_lock(&dp->non_pmd_mutex);
2893 HMAP_FOR_EACH (port, node, &dp->ports) {
2894 if (!netdev_is_pmd(port->netdev)) {
2895 int i;
2896
2897 for (i = 0; i < port->n_rxq; i++) {
2898 dp_netdev_process_rxq_port(non_pmd, port, port->rxqs[i].rxq);
2899 }
2900 }
2901 }
2902 dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
2903 ovs_mutex_unlock(&dp->non_pmd_mutex);
2904
2905 dp_netdev_pmd_unref(non_pmd);
2906
2907 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
2908 reconfigure_pmd_threads(dp);
2909 }
2910 ovs_mutex_unlock(&dp->port_mutex);
2911
2912 tnl_neigh_cache_run();
2913 tnl_port_map_run();
2914 new_tnl_seq = seq_read(tnl_conf_seq);
2915
2916 if (dp->last_tnl_conf_seq != new_tnl_seq) {
2917 dp->last_tnl_conf_seq = new_tnl_seq;
2918 return true;
2919 }
2920 return false;
2921 }
2922
2923 static void
2924 dpif_netdev_wait(struct dpif *dpif)
2925 {
2926 struct dp_netdev_port *port;
2927 struct dp_netdev *dp = get_dp_netdev(dpif);
2928
2929 ovs_mutex_lock(&dp_netdev_mutex);
2930 ovs_mutex_lock(&dp->port_mutex);
2931 HMAP_FOR_EACH (port, node, &dp->ports) {
2932 netdev_wait_reconf_required(port->netdev);
2933 if (!netdev_is_pmd(port->netdev)) {
2934 int i;
2935
2936 for (i = 0; i < port->n_rxq; i++) {
2937 netdev_rxq_wait(port->rxqs[i].rxq);
2938 }
2939 }
2940 }
2941 ovs_mutex_unlock(&dp->port_mutex);
2942 ovs_mutex_unlock(&dp_netdev_mutex);
2943 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
2944 }
2945
2946 static void
2947 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
2948 {
2949 struct tx_port *tx_port_cached;
2950
2951 /* Free all used tx queue ids. */
2952 dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
2953
2954 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->port_cache) {
2955 free(tx_port_cached);
2956 }
2957 }
2958
2959 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
2960 * 'pmd->port_cache' (thread local) */
2961 static void
2962 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
2963 OVS_REQUIRES(pmd->port_mutex)
2964 {
2965 struct tx_port *tx_port, *tx_port_cached;
2966
2967 pmd_free_cached_ports(pmd);
2968 hmap_shrink(&pmd->port_cache);
2969
2970 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
2971 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
2972 hmap_insert(&pmd->port_cache, &tx_port_cached->node,
2973 hash_port_no(tx_port_cached->port->port_no));
2974 }
2975 }
2976
2977 static int
2978 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
2979 struct rxq_poll **ppoll_list)
2980 {
2981 struct rxq_poll *poll_list = *ppoll_list;
2982 struct rxq_poll *poll;
2983 int i;
2984
2985 ovs_mutex_lock(&pmd->port_mutex);
2986 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
2987
2988 i = 0;
2989 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
2990 poll_list[i++] = *poll;
2991 }
2992
2993 pmd_load_cached_ports(pmd);
2994
2995 ovs_mutex_unlock(&pmd->port_mutex);
2996
2997 *ppoll_list = poll_list;
2998 return i;
2999 }
3000
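/* Main loop of a pmd thread: polls the rx queues in its poll list, checks
 * 'change_seq' periodically and reloads its queue and port lists when it
 * changes, and exits once 'exit_latch' is set. */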
3001 static void *
3002 pmd_thread_main(void *f_)
3003 {
3004 struct dp_netdev_pmd_thread *pmd = f_;
3005 unsigned int lc = 0;
3006 struct rxq_poll *poll_list;
3007 unsigned int port_seq = PMD_INITIAL_SEQ;
3008 bool exiting;
3009 int poll_cnt;
3010 int i;
3011
3012 poll_list = NULL;
3013
3014 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
3015 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
3016 ovs_numa_thread_setaffinity_core(pmd->core_id);
3017 dpdk_set_lcore_id(pmd->core_id);
3018 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
3019 reload:
3020 emc_cache_init(&pmd->flow_cache);
3021
3022 /* List port/core affinity */
3023 for (i = 0; i < poll_cnt; i++) {
3024 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
3025 pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
3026 netdev_rxq_get_queue_id(poll_list[i].rx));
3027 }
3028
3029 for (;;) {
3030 for (i = 0; i < poll_cnt; i++) {
3031 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
3032 }
3033
3034 if (lc++ > 1024) {
3035 unsigned int seq;
3036
3037 lc = 0;
3038
3039 coverage_try_clear();
3040 if (!ovsrcu_try_quiesce()) {
3041 emc_cache_slow_sweep(&pmd->flow_cache);
3042 }
3043
3044 atomic_read_relaxed(&pmd->change_seq, &seq);
3045 if (seq != port_seq) {
3046 port_seq = seq;
3047 break;
3048 }
3049 }
3050 }
3051
3052 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
3053 exiting = latch_is_set(&pmd->exit_latch);
3054 /* Signal here to make sure the pmd finishes
3055 * reloading the updated configuration. */
3056 dp_netdev_pmd_reload_done(pmd);
3057
3058 emc_cache_uninit(&pmd->flow_cache);
3059
3060 if (!exiting) {
3061 goto reload;
3062 }
3063
3064 free(poll_list);
3065 pmd_free_cached_ports(pmd);
3066 return NULL;
3067 }
3068
3069 static void
3070 dp_netdev_disable_upcall(struct dp_netdev *dp)
3071 OVS_ACQUIRES(dp->upcall_rwlock)
3072 {
3073 fat_rwlock_wrlock(&dp->upcall_rwlock);
3074 }
3075
3076 static void
3077 dpif_netdev_disable_upcall(struct dpif *dpif)
3078 OVS_NO_THREAD_SAFETY_ANALYSIS
3079 {
3080 struct dp_netdev *dp = get_dp_netdev(dpif);
3081 dp_netdev_disable_upcall(dp);
3082 }
3083
3084 static void
3085 dp_netdev_enable_upcall(struct dp_netdev *dp)
3086 OVS_RELEASES(dp->upcall_rwlock)
3087 {
3088 fat_rwlock_unlock(&dp->upcall_rwlock);
3089 }
3090
3091 static void
3092 dpif_netdev_enable_upcall(struct dpif *dpif)
3093 OVS_NO_THREAD_SAFETY_ANALYSIS
3094 {
3095 struct dp_netdev *dp = get_dp_netdev(dpif);
3096 dp_netdev_enable_upcall(dp);
3097 }
3098
3099 static void
3100 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
3101 {
3102 ovs_mutex_lock(&pmd->cond_mutex);
3103 xpthread_cond_signal(&pmd->cond);
3104 ovs_mutex_unlock(&pmd->cond_mutex);
3105 }
3106
3107 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
3108  * the pointer on success, otherwise NULL.
3109  *
3110  * Caller must unref the returned reference. */
3111 static struct dp_netdev_pmd_thread *
3112 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
3113 {
3114 struct dp_netdev_pmd_thread *pmd;
3115 const struct cmap_node *pnode;
3116
3117 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
3118 if (!pnode) {
3119 return NULL;
3120 }
3121 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
3122
3123 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
3124 }
3125
3126 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
3127 static void
3128 dp_netdev_set_nonpmd(struct dp_netdev *dp)
3129 OVS_REQUIRES(dp->port_mutex)
3130 {
3131 struct dp_netdev_pmd_thread *non_pmd;
3132 struct dp_netdev_port *port;
3133
3134 non_pmd = xzalloc(sizeof *non_pmd);
3135 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
3136
3137 HMAP_FOR_EACH (port, node, &dp->ports) {
3138 dp_netdev_add_port_tx_to_pmd(non_pmd, port);
3139 }
3140
3141 dp_netdev_reload_pmd__(non_pmd);
3142 }
3143
3144 /* Caller must have a valid pointer to 'pmd'. */
3145 static bool
3146 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
3147 {
3148 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
3149 }
3150
3151 static void
3152 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
3153 {
3154 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
3155 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
3156 }
3157 }
3158
3159 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
3160  * fails, keeps checking subsequent nodes until reaching the end of the cmap.
3161  *
3162  * Caller must unref the returned reference. */
3163 static struct dp_netdev_pmd_thread *
3164 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
3165 {
3166 struct dp_netdev_pmd_thread *next;
3167
3168 do {
3169 struct cmap_node *node;
3170
3171 node = cmap_next_position(&dp->poll_threads, pos);
3172 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
3173 : NULL;
3174 } while (next && !dp_netdev_pmd_try_ref(next));
3175
3176 return next;
3177 }
3178
3179 /* Configures the 'pmd' based on the input argument. */
3180 static void
3181 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
3182 unsigned core_id, int numa_id)
3183 {
3184 pmd->dp = dp;
3185 pmd->core_id = core_id;
3186 pmd->numa_id = numa_id;
3187 pmd->poll_cnt = 0;
3188
3189 atomic_init(&pmd->static_tx_qid,
3190 (core_id == NON_PMD_CORE_ID)
3191 ? ovs_numa_get_n_cores()
3192 : get_n_pmd_threads(dp));
3193
3194 ovs_refcount_init(&pmd->ref_cnt);
3195 latch_init(&pmd->exit_latch);
3196 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
3197 xpthread_cond_init(&pmd->cond, NULL);
3198 ovs_mutex_init(&pmd->cond_mutex);
3199 ovs_mutex_init(&pmd->flow_mutex);
3200 ovs_mutex_init(&pmd->port_mutex);
3201 dpcls_init(&pmd->cls);
3202 cmap_init(&pmd->flow_table);
3203 ovs_list_init(&pmd->poll_list);
3204 hmap_init(&pmd->tx_ports);
3205 hmap_init(&pmd->port_cache);
3206 /* init the 'flow_cache' since there is no
3207 * actual thread created for NON_PMD_CORE_ID. */
3208 if (core_id == NON_PMD_CORE_ID) {
3209 emc_cache_init(&pmd->flow_cache);
3210 }
3211 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
3212 hash_int(core_id, 0));
3213 }
3214
3215 static void
3216 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
3217 {
3218 dp_netdev_pmd_flow_flush(pmd);
3219 dpcls_destroy(&pmd->cls);
3220 hmap_destroy(&pmd->port_cache);
3221 hmap_destroy(&pmd->tx_ports);
3222 cmap_destroy(&pmd->flow_table);
3223 ovs_mutex_destroy(&pmd->flow_mutex);
3224 latch_destroy(&pmd->exit_latch);
3225 xpthread_cond_destroy(&pmd->cond);
3226 ovs_mutex_destroy(&pmd->cond_mutex);
3227 ovs_mutex_destroy(&pmd->port_mutex);
3228 free(pmd);
3229 }
3230
3231 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
3232 * and unrefs the struct. */
3233 static void
3234 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
3235 {
3236 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
3237 * but extra cleanup is necessary */
3238 if (pmd->core_id == NON_PMD_CORE_ID) {
3239 emc_cache_uninit(&pmd->flow_cache);
3240 pmd_free_cached_ports(pmd);
3241 } else {
3242 latch_set(&pmd->exit_latch);
3243 dp_netdev_reload_pmd__(pmd);
3244 ovs_numa_unpin_core(pmd->core_id);
3245 xpthread_join(pmd->thread, NULL);
3246 }
3247
3248 dp_netdev_pmd_clear_ports(pmd);
3249
3250 /* Purges the 'pmd''s flows after stopping the thread, but before
3251 * destroying the flows, so that the flow stats can be collected. */
3252 if (dp->dp_purge_cb) {
3253 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
3254 }
3255 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
3256 dp_netdev_pmd_unref(pmd);
3257 }
3258
3259 /* Destroys all pmd threads. */
3260 static void
3261 dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
3262 {
3263 struct dp_netdev_pmd_thread *pmd;
3264 struct dp_netdev_pmd_thread **pmd_list;
3265 size_t k = 0, n_pmds;
3266
3267 n_pmds = cmap_count(&dp->poll_threads);
3268 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
3269
3270 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3271 /* We cannot call dp_netdev_del_pmd(), since it alters
3272 * 'dp->poll_threads' (while we're iterating it) and it
3273 * might quiesce. */
3274 ovs_assert(k < n_pmds);
3275 pmd_list[k++] = pmd;
3276 }
3277
3278 for (size_t i = 0; i < k; i++) {
3279 dp_netdev_del_pmd(dp, pmd_list[i]);
3280 }
3281 free(pmd_list);
3282 }
3283
3284 /* Deletes all pmd threads on numa node 'numa_id' and
3285 * fixes static_tx_qids of other threads to keep them sequential. */
3286 static void
3287 dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3288 {
3289 struct dp_netdev_pmd_thread *pmd;
3290 int n_pmds_on_numa, n_pmds;
3291 int *free_idx, k = 0;
3292 struct dp_netdev_pmd_thread **pmd_list;
3293
3294 n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
3295 free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
3296 pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);
3297
3298 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3299 /* We cannot call dp_netdev_del_pmd(), since it alters
3300 * 'dp->poll_threads' (while we're iterating it) and it
3301 * might quiesce. */
3302 if (pmd->numa_id == numa_id) {
3303 atomic_read_relaxed(&pmd->static_tx_qid, &free_idx[k]);
3304 pmd_list[k] = pmd;
3305 ovs_assert(k < n_pmds_on_numa);
3306 k++;
3307 }
3308 }
3309
3310 for (int i = 0; i < k; i++) {
3311 dp_netdev_del_pmd(dp, pmd_list[i]);
3312 }
3313
3314 n_pmds = get_n_pmd_threads(dp);
3315 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3316 int old_tx_qid;
3317
3318 atomic_read_relaxed(&pmd->static_tx_qid, &old_tx_qid);
3319
3320 if (old_tx_qid >= n_pmds) {
3321 int new_tx_qid = free_idx[--k];
3322
3323 atomic_store_relaxed(&pmd->static_tx_qid, new_tx_qid);
3324 }
3325 }
3326
3327 free(pmd_list);
3328 free(free_idx);
3329 }
3330
3331 /* Deletes all rx queues from pmd->poll_list and all the ports from
3332 * pmd->tx_ports. */
3333 static void
3334 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
3335 {
3336 struct rxq_poll *poll;
3337 struct tx_port *port;
3338
3339 ovs_mutex_lock(&pmd->port_mutex);
3340 LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
3341 free(poll);
3342 }
3343 pmd->poll_cnt = 0;
3344 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
3345 free(port);
3346 }
3347 ovs_mutex_unlock(&pmd->port_mutex);
3348 }
3349
3350 static struct tx_port *
3351 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3352 {
3353 struct tx_port *tx;
3354
3355 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
3356 if (tx->port->port_no == port_no) {
3357 return tx;
3358 }
3359 }
3360
3361 return NULL;
3362 }
3363
3364 /* Deletes all rx queues of 'port' from 'poll_list', and the 'port' from
3365 * 'tx_ports' of 'pmd' thread. Returns true if 'port' was found in 'pmd'
3366 * (therefore a restart is required). */
3367 static bool
3368 dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
3369 struct dp_netdev_pmd_thread *pmd)
3370 {
3371 struct rxq_poll *poll, *next;
3372 struct tx_port *tx;
3373 bool found = false;
3374
3375 ovs_mutex_lock(&pmd->port_mutex);
3376 LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
3377 if (poll->port == port) {
3378 found = true;
3379 ovs_list_remove(&poll->node);
3380 pmd->poll_cnt--;
3381 free(poll);
3382 }
3383 }
3384
3385 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
3386 if (tx) {
3387 hmap_remove(&pmd->tx_ports, &tx->node);
3388 free(tx);
3389 found = true;
3390 }
3391 ovs_mutex_unlock(&pmd->port_mutex);
3392
3393 return found;
3394 }
3395
3396 /* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3397 * threads. The pmd threads that need to be restarted are inserted in
3398 * 'to_reload'. */
3399 static void
3400 dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
3401 struct dp_netdev_port *port,
3402 struct hmapx *to_reload)
3403 {
3404 struct dp_netdev_pmd_thread *pmd;
3405
3406 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3407 bool found;
3408
3409 found = dp_netdev_del_port_from_pmd__(port, pmd);
3410
3411 if (found) {
3412 hmapx_add(to_reload, pmd);
3413 }
3414 }
3415 }
3416
3417 /* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3418 * threads. Reloads the threads if needed. */
3419 static void
3420 dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
3421 struct dp_netdev_port *port)
3422 {
3423 struct dp_netdev_pmd_thread *pmd;
3424 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3425 struct hmapx_node *node;
3426
3427 dp_netdev_del_port_from_all_pmds__(dp, port, &to_reload);
3428
3429 HMAPX_FOR_EACH (node, &to_reload) {
3430 pmd = (struct dp_netdev_pmd_thread *) node->data;
3431 dp_netdev_reload_pmd__(pmd);
3432 }
3433
3434 hmapx_destroy(&to_reload);
3435 }
3436
3437
3438 /* Returns the non-isolated PMD thread on numa node 'numa_id' with the fewest
3439  * rx queues to poll. Returns NULL if there are no non-isolated PMD threads
3440  * on this numa node. Can be called safely only by the main thread. */
3441 static struct dp_netdev_pmd_thread *
3442 dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
3443 {
3444 int min_cnt = -1;
3445 struct dp_netdev_pmd_thread *pmd, *res = NULL;
3446
3447 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3448 if (!pmd->isolated && pmd->numa_id == numa_id
3449 && (min_cnt > pmd->poll_cnt || res == NULL)) {
3450 min_cnt = pmd->poll_cnt;
3451 res = pmd;
3452 }
3453 }
3454
3455 return res;
3456 }
3457
3458 /* Adds rx queue to poll_list of PMD thread. */
3459 static void
3460 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
3461 struct dp_netdev_port *port, struct netdev_rxq *rx)
3462 OVS_REQUIRES(pmd->port_mutex)
3463 {
3464 struct rxq_poll *poll = xmalloc(sizeof *poll);
3465
3466 poll->port = port;
3467 poll->rx = rx;
3468
3469 ovs_list_push_back(&pmd->poll_list, &poll->node);
3470 pmd->poll_cnt++;
3471 }
3472
3473 /* Adds 'port' to the tx ports of 'pmd'; 'pmd' must be reloaded for the
3474  * change to take effect. */
3475 static void
3476 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
3477 struct dp_netdev_port *port)
3478 {
3479 struct tx_port *tx = xzalloc(sizeof *tx);
3480
3481 tx->port = port;
3482 tx->qid = -1;
3483
3484 ovs_mutex_lock(&pmd->port_mutex);
3485 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
3486 ovs_mutex_unlock(&pmd->port_mutex);
3487 }
3488
3489 /* Distributes all {pinned|non-pinned} rx queues of 'port' between the PMD
3490  * threads in 'dp'. The pmd threads that need to be restarted are inserted
3491  * in 'to_reload'. PMD threads with pinned queues are marked as isolated. */
3492 static void
3493 dp_netdev_add_port_rx_to_pmds(struct dp_netdev *dp,
3494 struct dp_netdev_port *port,
3495 struct hmapx *to_reload, bool pinned)
3496 {
3497 int numa_id = netdev_get_numa_id(port->netdev);
3498 struct dp_netdev_pmd_thread *pmd;
3499 int i;
3500
3501 if (!netdev_is_pmd(port->netdev)) {
3502 return;
3503 }
3504
3505 for (i = 0; i < port->n_rxq; i++) {
3506 if (pinned) {
3507 if (port->rxqs[i].core_id == -1) {
3508 continue;
3509 }
3510 pmd = dp_netdev_get_pmd(dp, port->rxqs[i].core_id);
3511 if (!pmd) {
3512 VLOG_WARN("There is no PMD thread on core %d. "
3513 "Queue %d on port \'%s\' will not be polled.",
3514 port->rxqs[i].core_id, i,
3515 netdev_get_name(port->netdev));
3516 continue;
3517 }
3518 pmd->isolated = true;
3519 dp_netdev_pmd_unref(pmd);
3520 } else {
3521 if (port->rxqs[i].core_id != -1) {
3522 continue;
3523 }
3524 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
3525 if (!pmd) {
3526 VLOG_WARN("There's no available pmd thread on numa node %d",
3527 numa_id);
3528 break;
3529 }
3530 }
3531
3532 ovs_mutex_lock(&pmd->port_mutex);
3533 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxqs[i].rxq);
3534 ovs_mutex_unlock(&pmd->port_mutex);
3535
3536 hmapx_add(to_reload, pmd);
3537 }
3538 }
3539
3540 /* Distributes all non-pinned rx queues of 'port' between all PMD threads
3541 * in 'dp' and inserts 'port' in the PMD threads 'tx_ports'. The pmd threads
3542 * that need to be restarted are inserted in 'to_reload'. */
3543 static void
3544 dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
3545 struct hmapx *to_reload)
3546 {
3547 struct dp_netdev_pmd_thread *pmd;
3548
3549 dp_netdev_add_port_rx_to_pmds(dp, port, to_reload, false);
3550
3551 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3552 dp_netdev_add_port_tx_to_pmd(pmd, port);
3553 hmapx_add(to_reload, pmd);
3554 }
3555 }
3556
3557 /* Distributes all non-pinned rx queues of 'port' between all PMD threads
3558 * in 'dp', inserts 'port' in the PMD threads 'tx_ports' and reloads them,
3559 * if needed. */
3560 static void
3561 dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
3562 {
3563 struct dp_netdev_pmd_thread *pmd;
3564 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3565 struct hmapx_node *node;
3566
3567 dp_netdev_add_port_to_pmds__(dp, port, &to_reload);
3568
3569 HMAPX_FOR_EACH (node, &to_reload) {
3570 pmd = (struct dp_netdev_pmd_thread *) node->data;
3571 dp_netdev_reload_pmd__(pmd);
3572 }
3573
3574 hmapx_destroy(&to_reload);
3575 }
3576
3577 /* Starts pmd threads for the numa node 'numa_id', if not already started.
3578  * The function takes care of filling the threads' tx port cache. */
3579 static void
3580 dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3581 OVS_REQUIRES(dp->port_mutex)
3582 {
3583 int n_pmds;
3584
3585 if (!ovs_numa_numa_id_is_valid(numa_id)) {
3586         VLOG_WARN("Cannot create pmd threads: numa id %d is invalid",
3587 numa_id);
3588 return;
3589 }
3590
3591 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
3592
3593     /* If pmd threads have already been created for the numa node that
3594      * 'netdev' is on, does nothing. Otherwise, creates the pmd threads
3595      * for the numa node. */
3596 if (!n_pmds) {
3597 int can_have, n_unpinned, i;
3598
3599 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
3600 if (!n_unpinned) {
3601             VLOG_WARN("Cannot create pmd threads: no unpinned cores "
3602                       "available on numa node %d", numa_id);
3603 return;
3604 }
3605
3606         /* If a cpu mask is specified, uses all unpinned cores; otherwise
3607          * tries to create NR_PMD_THREADS pmd threads. */
3608 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
3609 for (i = 0; i < can_have; i++) {
3610 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
3611 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
3612 struct dp_netdev_port *port;
3613
3614 dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
3615
3616 HMAP_FOR_EACH (port, node, &dp->ports) {
3617 dp_netdev_add_port_tx_to_pmd(pmd, port);
3618 }
3619
3620 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
3621 }
3622 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
3623 }
3624 }
3625
3626 \f
3627 /* Called after a change in the pmd threads' configuration. Restarts the
3628  * pmd threads with the new configuration. */
3629 static void
3630 dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
3631 OVS_REQUIRES(dp->port_mutex)
3632 {
3633 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3634 struct dp_netdev_pmd_thread *pmd;
3635 struct dp_netdev_port *port;
3636 struct hmapx_node *node;
3637
3638 HMAP_FOR_EACH (port, node, &dp->ports) {
3639 if (netdev_is_pmd(port->netdev)) {
3640 int numa_id = netdev_get_numa_id(port->netdev);
3641
3642 dp_netdev_set_pmds_on_numa(dp, numa_id);
3643 }
3644 /* Distribute only pinned rx queues first to mark threads as isolated */
3645 dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, true);
3646 }
3647
3648 /* Distribute remaining non-pinned rx queues to non-isolated PMD threads. */
3649 HMAP_FOR_EACH (port, node, &dp->ports) {
3650 dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, false);
3651 }
3652
3653 HMAPX_FOR_EACH (node, &to_reload) {
3654 pmd = (struct dp_netdev_pmd_thread *) node->data;
3655 dp_netdev_reload_pmd__(pmd);
3656 }
3657
3658 hmapx_destroy(&to_reload);
3659 }
3660
3661 static char *
3662 dpif_netdev_get_datapath_version(void)
3663 {
3664 return xstrdup("<built-in>");
3665 }
3666
3667 static void
3668 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
3669 uint16_t tcp_flags, long long now)
3670 {
3671 uint16_t flags;
3672
3673 atomic_store_relaxed(&netdev_flow->stats.used, now);
3674 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
3675 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
3676 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3677 flags |= tcp_flags;
3678 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
3679 }
3680
3681 static void
3682 dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
3683 enum dp_stat_type type, int cnt)
3684 {
3685 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
3686 }
3687
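/* Passes 'packet_' to the datapath's registered upcall callback, translating
 * Geneve tunnel metadata to the upcall format before the call and restoring
 * the raw datapath format afterwards.  Returns 0 on success, ENODEV if no
 * upcall callback is registered, or another positive errno value on error. */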
3688 static int
3689 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
3690 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
3691 enum dpif_upcall_type type, const struct nlattr *userdata,
3692 struct ofpbuf *actions, struct ofpbuf *put_actions)
3693 {
3694 struct dp_netdev *dp = pmd->dp;
3695 struct flow_tnl orig_tunnel;
3696 int err;
3697
3698 if (OVS_UNLIKELY(!dp->upcall_cb)) {
3699 return ENODEV;
3700 }
3701
3702 /* Upcall processing expects the Geneve options to be in the translated
3703 * format but we need to retain the raw format for datapath use. */
3704 orig_tunnel.flags = flow->tunnel.flags;
3705 if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
3706 orig_tunnel.metadata.present.len = flow->tunnel.metadata.present.len;
3707 memcpy(orig_tunnel.metadata.opts.gnv, flow->tunnel.metadata.opts.gnv,
3708 flow->tunnel.metadata.present.len);
3709 err = tun_metadata_from_geneve_udpif(&orig_tunnel, &orig_tunnel,
3710 &flow->tunnel);
3711 if (err) {
3712 return err;
3713 }
3714 }
3715
3716 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
3717 struct ds ds = DS_EMPTY_INITIALIZER;
3718 char *packet_str;
3719 struct ofpbuf key;
3720 struct odp_flow_key_parms odp_parms = {
3721 .flow = flow,
3722 .mask = &wc->masks,
3723 .support = dp_netdev_support,
3724 };
3725
3726 ofpbuf_init(&key, 0);
3727 odp_flow_key_from_flow(&odp_parms, &key);
3728 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
3729 dp_packet_size(packet_));
3730
3731 odp_flow_key_format(key.data, key.size, &ds);
3732
3733 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
3734 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
3735
3736 ofpbuf_uninit(&key);
3737 free(packet_str);
3738
3739 ds_destroy(&ds);
3740 }
3741
3742 err = dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
3743 actions, wc, put_actions, dp->upcall_aux);
3744 if (err && err != ENOSPC) {
3745 return err;
3746 }
3747
3748 /* Translate tunnel metadata masks to datapath format. */
3749 if (wc) {
3750 if (wc->masks.tunnel.metadata.present.map) {
3751 struct geneve_opt opts[TLV_TOT_OPT_SIZE /
3752 sizeof(struct geneve_opt)];
3753
3754 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3755 tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
3756 &wc->masks.tunnel,
3757 orig_tunnel.metadata.opts.gnv,
3758 orig_tunnel.metadata.present.len,
3759 opts);
3760 } else {
3761 orig_tunnel.metadata.present.len = 0;
3762 }
3763
3764 memset(&wc->masks.tunnel.metadata, 0,
3765 sizeof wc->masks.tunnel.metadata);
3766 memcpy(&wc->masks.tunnel.metadata.opts.gnv, opts,
3767 orig_tunnel.metadata.present.len);
3768 }
3769 wc->masks.tunnel.metadata.present.len = 0xff;
3770 }
3771
3772 /* Restore tunnel metadata. We need to use the saved options to ensure
3773 * that any unknown options are not lost. The generated mask will have
3774 * the same structure, matching on types and lengths but wildcarding
3775 * option data we don't care about. */
3776 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3777 memcpy(&flow->tunnel.metadata.opts.gnv, orig_tunnel.metadata.opts.gnv,
3778 orig_tunnel.metadata.present.len);
3779 flow->tunnel.metadata.present.len = orig_tunnel.metadata.present.len;
3780 flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
3781 }
3782
3783 return err;
3784 }
3785
3786 static inline uint32_t
3787 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
3788 const struct miniflow *mf)
3789 {
3790 uint32_t hash, recirc_depth;
3791
3792 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
3793 hash = dp_packet_get_rss_hash(packet);
3794 } else {
3795 hash = miniflow_hash_5tuple(mf, 0);
3796 dp_packet_set_rss_hash(packet, hash);
3797 }
3798
3799 /* The RSS hash must account for the recirculation depth to avoid
3800 * collisions in the exact match cache. */
3801 recirc_depth = *recirc_depth_get_unsafe();
3802 if (OVS_UNLIKELY(recirc_depth)) {
3803 hash = hash_finish(hash, recirc_depth);
3804 dp_packet_set_rss_hash(packet, hash);
3805 }
3806 return hash;
3807 }
3808
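/* Packets that resolve to the same flow within one input batch are collected
 * into a per-flow batch, so that the flow's statistics are updated and its
 * actions are executed once per batch rather than once per packet.
 * Illustrative call sequence (see emc_processing() and dp_netdev_input__()
 * below):
 *
 *     dp_netdev_queue_batches(packet, flow, &key->mf, batches, n_batches);
 *     ...
 *     packet_batch_per_flow_execute(&batches[i], pmd, now);
 */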
3809 struct packet_batch_per_flow {
3810 unsigned int byte_count;
3811 uint16_t tcp_flags;
3812 struct dp_netdev_flow *flow;
3813
3814 struct dp_packet_batch array;
3815 };
3816
3817 static inline void
3818 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
3819 struct dp_packet *packet,
3820 const struct miniflow *mf)
3821 {
3822 batch->byte_count += dp_packet_size(packet);
3823 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
3824 batch->array.packets[batch->array.count++] = packet;
3825 }
3826
3827 static inline void
3828 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
3829 struct dp_netdev_flow *flow)
3830 {
3831 flow->batch = batch;
3832
3833 batch->flow = flow;
3834 dp_packet_batch_init(&batch->array);
3835 batch->byte_count = 0;
3836 batch->tcp_flags = 0;
3837 }
3838
3839 static inline void
3840 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
3841 struct dp_netdev_pmd_thread *pmd,
3842 long long now)
3843 {
3844 struct dp_netdev_actions *actions;
3845 struct dp_netdev_flow *flow = batch->flow;
3846
3847 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
3848 batch->tcp_flags, now);
3849
3850 actions = dp_netdev_flow_get_actions(flow);
3851
3852 dp_netdev_execute_actions(pmd, &batch->array, true,
3853 actions->actions, actions->size, now);
3854 }
3855
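/* Appends 'pkt' to the per-flow batch for 'flow', creating a new entry in
 * 'batches' (and incrementing '*n_batches') the first time the flow is seen
 * in the current input batch. */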
3856 static inline void
3857 dp_netdev_queue_batches(struct dp_packet *pkt,
3858 struct dp_netdev_flow *flow, const struct miniflow *mf,
3859 struct packet_batch_per_flow *batches, size_t *n_batches)
3860 {
3861 struct packet_batch_per_flow *batch = flow->batch;
3862
3863 if (OVS_UNLIKELY(!batch)) {
3864 batch = &batches[(*n_batches)++];
3865 packet_batch_per_flow_init(batch, flow);
3866 }
3867
3868 packet_batch_per_flow_update(batch, pkt, mf);
3869 }
3870
3871 /* Try to process all of the 'cnt' packets in 'packets_' using only the
3872 * exact match cache 'pmd->flow_cache'. If a flow is not found for a packet
3873 * 'packets[i]', the miniflow is copied into 'keys' and the packet pointer is
3874 * moved to the beginning of the 'packets' array.
3875 *
3876 * The function returns the number of packets that need to be processed in the
3877 * 'packets' array (they have been moved to the beginning of the vector).
3878 *
3879 * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be
3880 * initialized by this function using 'port_no'.
3881 */
3882 static inline size_t
3883 emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
3884 struct netdev_flow_key *keys,
3885 struct packet_batch_per_flow batches[], size_t *n_batches,
3886 bool md_is_valid, odp_port_t port_no)
3887 {
3888 struct emc_cache *flow_cache = &pmd->flow_cache;
3889 struct netdev_flow_key *key = &keys[0];
3890 size_t i, n_missed = 0, n_dropped = 0;
3891 struct dp_packet **packets = packets_->packets;
3892 int cnt = packets_->count;
3893
3894 for (i = 0; i < cnt; i++) {
3895 struct dp_netdev_flow *flow;
3896 struct dp_packet *packet = packets[i];
3897
3898 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
3899 dp_packet_delete(packet);
3900 n_dropped++;
3901 continue;
3902 }
3903
3904 if (i != cnt - 1) {
3905 /* Prefetch next packet data and metadata. */
3906 OVS_PREFETCH(dp_packet_data(packets[i+1]));
3907 pkt_metadata_prefetch_init(&packets[i+1]->md);
3908 }
3909
3910 if (!md_is_valid) {
3911 pkt_metadata_init(&packet->md, port_no);
3912 }
3913 miniflow_extract(packet, &key->mf);
3914 key->len = 0; /* Not computed yet. */
3915 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
3916
3917 flow = emc_lookup(flow_cache, key);
3918 if (OVS_LIKELY(flow)) {
3919 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
3920 n_batches);
3921 } else {
3922 /* Exact match cache missed. Group missed packets together at
3923 * the beginning of the 'packets' array. */
3924 packets[n_missed] = packet;
3925 /* 'keys[n_missed]' contains the key of the current packet and it
3926 * must be returned to the caller. The next key should be extracted
3927 * to 'keys[n_missed + 1]'. */
3928 key = &keys[++n_missed];
3929 }
3930 }
3931
3932 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);
3933
3934 return n_missed;
3935 }
3936
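/* Handles a packet that missed both the exact match cache and the classifier:
 * performs a DPIF_UC_MISS upcall for it, executes the actions returned by the
 * upcall handler on the packet and, unless the handler returned ENOSPC,
 * installs the resulting flow into the PMD's classifier and exact match cache.
 * On any other upcall error the packet is dropped and '*lost_cnt' is
 * incremented. */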
3937 static inline void
3938 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
3939 const struct netdev_flow_key *key,
3940 struct ofpbuf *actions, struct ofpbuf *put_actions,
3941 int *lost_cnt, long long now)
3942 {
3943 struct ofpbuf *add_actions;
3944 struct dp_packet_batch b;
3945 struct match match;
3946 ovs_u128 ufid;
3947 int error;
3948
3949 match.tun_md.valid = false;
3950 miniflow_expand(&key->mf, &match.flow);
3951
3952 ofpbuf_clear(actions);
3953 ofpbuf_clear(put_actions);
3954
3955 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
3956 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
3957 &ufid, DPIF_UC_MISS, NULL, actions,
3958 put_actions);
3959 if (OVS_UNLIKELY(error && error != ENOSPC)) {
3960 dp_packet_delete(packet);
3961 (*lost_cnt)++;
3962 return;
3963 }
3964
3965 /* The Netlink encoding of datapath flow keys cannot express
3966 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3967 * tag is interpreted as exact match on the fact that there is no
3968 * VLAN. Unless we refactor a lot of code that translates between
3969 * Netlink and struct flow representations, we have to do the same
3970 * here. */
3971 if (!match.wc.masks.vlan_tci) {
3972 match.wc.masks.vlan_tci = htons(0xffff);
3973 }
3974
3975 /* We can't allow the packet batching in the next loop to execute
3976 * the actions. Otherwise, if there are any slow path actions,
3977 * we'll send the packet up twice. */
3978 packet_batch_init_packet(&b, packet);
3979 dp_netdev_execute_actions(pmd, &b, true,
3980 actions->data, actions->size, now);
3981
3982 add_actions = put_actions->size ? put_actions : actions;
3983 if (OVS_LIKELY(error != ENOSPC)) {
3984 struct dp_netdev_flow *netdev_flow;
3985
3986 /* XXX: There's a race window where a flow covering this packet
3987 * could have already been installed since we last did the flow
3988 * lookup before upcall. This could be solved by moving the
3989 * mutex lock outside the loop, but that's an awfully long time
3990 * to be locking everyone out of making flow installs. If we
3991 * move to a per-core classifier, it would be reasonable. */
3992 ovs_mutex_lock(&pmd->flow_mutex);
3993 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key);
3994 if (OVS_LIKELY(!netdev_flow)) {
3995 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
3996 add_actions->data,
3997 add_actions->size);
3998 }
3999 ovs_mutex_unlock(&pmd->flow_mutex);
4000
4001 emc_insert(&pmd->flow_cache, key, netdev_flow);
4002 }
4003 }
4004
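/* Second stage of packet processing: looks up the packets that missed the
 * exact match cache in the PMD's classifier ('pmd->cls').  Packets that also
 * miss the classifier are handed to handle_packet_upcall() (or dropped if
 * upcalls are not available); packets that match are added to the exact match
 * cache and queued into the per-flow 'batches'. */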
4005 static inline void
4006 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
4007 struct dp_packet_batch *packets_,
4008 struct netdev_flow_key *keys,
4009 struct packet_batch_per_flow batches[], size_t *n_batches,
4010 long long now)
4011 {
4012 int cnt = packets_->count;
4013 #if !defined(__CHECKER__) && !defined(_WIN32)
4014 const size_t PKT_ARRAY_SIZE = cnt;
4015 #else
4016 /* Sparse or MSVC doesn't like variable length array. */
4017 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
4018 #endif
4019 struct dp_packet **packets = packets_->packets;
4020 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
4021 struct dp_netdev *dp = pmd->dp;
4022 struct emc_cache *flow_cache = &pmd->flow_cache;
4023 int miss_cnt = 0, lost_cnt = 0;
4024 bool any_miss;
4025 size_t i;
4026
4027 for (i = 0; i < cnt; i++) {
4028 /* Key length is needed in all the cases, hash computed on demand. */
4029 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
4030 }
4031 any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
4032 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
4033 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
4034 struct ofpbuf actions, put_actions;
4035
4036 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
4037 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
4038
4039 for (i = 0; i < cnt; i++) {
4040 struct dp_netdev_flow *netdev_flow;
4041
4042 if (OVS_LIKELY(rules[i])) {
4043 continue;
4044 }
4045
4046 /* It's possible that an earlier slow path execution installed
4047 * a rule covering this flow. In this case, it's a lot cheaper
4048 * to catch it here than to execute a miss. */
4049 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
4050 if (netdev_flow) {
4051 rules[i] = &netdev_flow->cr;
4052 continue;
4053 }
4054
4055 miss_cnt++;
4056 handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
4057 &put_actions, &lost_cnt, now);
4058 }
4059
4060 ofpbuf_uninit(&actions);
4061 ofpbuf_uninit(&put_actions);
4062 fat_rwlock_unlock(&dp->upcall_rwlock);
4063 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
4064 } else if (OVS_UNLIKELY(any_miss)) {
4065 for (i = 0; i < cnt; i++) {
4066 if (OVS_UNLIKELY(!rules[i])) {
4067 dp_packet_delete(packets[i]);
4068 lost_cnt++;
4069 miss_cnt++;
4070 }
4071 }
4072 }
4073
4074 for (i = 0; i < cnt; i++) {
4075 struct dp_packet *packet = packets[i];
4076 struct dp_netdev_flow *flow;
4077
4078 if (OVS_UNLIKELY(!rules[i])) {
4079 continue;
4080 }
4081
4082 flow = dp_netdev_flow_cast(rules[i]);
4083
4084 emc_insert(flow_cache, &keys[i], flow);
4085 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
4086 }
4087
4088 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
4089 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
4090 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
4091 }
4092
4093 /* Packets enter the datapath from a port (or from recirculation) here.
4094 *
4095 * For performance reasons a caller may choose not to initialize the metadata
4096 * in 'packets': in this case 'md_is_valid' is false and this function needs to
4097 * initialize it using 'port_no'. If the metadata in 'packets' is already
4098 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
4099 static void
4100 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
4101 struct dp_packet_batch *packets,
4102 bool md_is_valid, odp_port_t port_no)
4103 {
4104 int cnt = packets->count;
4105 #if !defined(__CHECKER__) && !defined(_WIN32)
4106 const size_t PKT_ARRAY_SIZE = cnt;
4107 #else
4108 /* Sparse or MSVC doesn't like variable length array. */
4109 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
4110 #endif
4111 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
4112 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
4113 long long now = time_msec();
4114 size_t newcnt, n_batches, i;
4115
4116 n_batches = 0;
4117 newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
4118 md_is_valid, port_no);
4119 if (OVS_UNLIKELY(newcnt)) {
4120 packets->count = newcnt;
4121 fast_path_processing(pmd, packets, keys, batches, &n_batches, now);
4122 }
4123
4124 for (i = 0; i < n_batches; i++) {
4125 batches[i].flow->batch = NULL;
4126 }
4127
4128 for (i = 0; i < n_batches; i++) {
4129 packet_batch_per_flow_execute(&batches[i], pmd, now);
4130 }
4131 }
4132
4133 static void
4134 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
4135 struct dp_packet_batch *packets,
4136 odp_port_t port_no)
4137 {
4138 dp_netdev_input__(pmd, packets, false, port_no);
4139 }
4140
4141 static void
4142 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
4143 struct dp_packet_batch *packets)
4144 {
4145 dp_netdev_input__(pmd, packets, true, 0);
4146 }
4147
4148 struct dp_netdev_execute_aux {
4149 struct dp_netdev_pmd_thread *pmd;
4150 long long now;
4151 };
4152
4153 static void
4154 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
4155 void *aux)
4156 {
4157 struct dp_netdev *dp = get_dp_netdev(dpif);
4158 dp->dp_purge_aux = aux;
4159 dp->dp_purge_cb = cb;
4160 }
4161
4162 static void
4163 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
4164 void *aux)
4165 {
4166 struct dp_netdev *dp = get_dp_netdev(dpif);
4167 dp->upcall_aux = aux;
4168 dp->upcall_cb = cb;
4169 }
4170
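/* Releases the tx queue ids that were dynamically assigned (XPS) to this PMD
 * thread but have not been used for at least XPS_TIMEOUT_MS, or releases all
 * of them if 'purge' is true, so that the corresponding port tx queues can be
 * reused by other threads. */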
4171 static void
4172 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
4173 long long now, bool purge)
4174 {
4175 struct tx_port *tx;
4176 struct dp_netdev_port *port;
4177 long long interval;
4178
4179 HMAP_FOR_EACH (tx, node, &pmd->port_cache) {
4180 if (!tx->port->dynamic_txqs) {
4181 continue;
4182 }
4183 interval = now - tx->last_used;
4184 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
4185 port = tx->port;
4186 ovs_mutex_lock(&port->txq_used_mutex);
4187 port->txq_used[tx->qid]--;
4188 ovs_mutex_unlock(&port->txq_used_mutex);
4189 tx->qid = -1;
4190 }
4191 }
4192 }
4193
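/* Returns the tx queue id that this PMD thread should use to send on 'tx'.
 * A previously assigned id is reused if it was used within XPS_TIMEOUT_MS;
 * otherwise the least used tx queue of the port is picked under
 * 'txq_used_mutex', recorded in 'tx->qid' and accounted in 'txq_used'. */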
4194 static int
4195 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
4196 struct tx_port *tx, long long now)
4197 {
4198 struct dp_netdev_port *port;
4199 long long interval;
4200 int i, min_cnt, min_qid;
4201
4202 if (OVS_UNLIKELY(!now)) {
4203 now = time_msec();
4204 }
4205
4206 interval = now - tx->last_used;
4207 tx->last_used = now;
4208
4209 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
4210 return tx->qid;
4211 }
4212
4213 port = tx->port;
4214
4215 ovs_mutex_lock(&port->txq_used_mutex);
4216 if (tx->qid >= 0) {
4217 port->txq_used[tx->qid]--;
4218 tx->qid = -1;
4219 }
4220
4221 min_cnt = -1;
4222 min_qid = 0;
4223 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
4224 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
4225 min_cnt = port->txq_used[i];
4226 min_qid = i;
4227 }
4228 }
4229
4230 port->txq_used[min_qid]++;
4231 tx->qid = min_qid;
4232
4233 ovs_mutex_unlock(&port->txq_used_mutex);
4234
4235 dpif_netdev_xps_revalidate_pmd(pmd, now, false);
4236
4237 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
4238 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
4239 return min_qid;
4240 }
4241
4242 static struct tx_port *
4243 pmd_tx_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
4244 odp_port_t port_no)
4245 {
4246 return tx_port_lookup(&pmd->port_cache, port_no);
4247 }
4248
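/* Pushes the tunnel header described by the OVS_ACTION_ATTR_TUNNEL_PUSH
 * attribute 'attr' onto every packet in 'batch' via the tunnel port's netdev.
 * On failure (unknown tunnel port or a push error) the whole batch is freed
 * and a nonzero value is returned. */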
4249 static int
4250 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
4251 const struct nlattr *attr,
4252 struct dp_packet_batch *batch)
4253 {
4254 struct tx_port *tun_port;
4255 const struct ovs_action_push_tnl *data;
4256 int err;
4257
4258 data = nl_attr_get(attr);
4259
4260 tun_port = pmd_tx_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
4261 if (!tun_port) {
4262 err = -EINVAL;
4263 goto error;
4264 }
4265 err = netdev_push_header(tun_port->port->netdev, batch, data);
4266 if (!err) {
4267 return 0;
4268 }
4269 error:
4270 dp_packet_delete_batch(batch, true);
4271 return err;
4272 }
4273
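/* Performs a DPIF_UC_ACTION upcall for 'packet' with the given 'userdata' and
 * executes whatever actions the upcall handler returns on it.  If the upcall
 * fails with anything other than ENOSPC, the packet is freed when 'may_steal'
 * allows it. */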
4274 static void
4275 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
4276 struct dp_packet *packet, bool may_steal,
4277 struct flow *flow, ovs_u128 *ufid,
4278 struct ofpbuf *actions,
4279 const struct nlattr *userdata, long long now)
4280 {
4281 struct dp_packet_batch b;
4282 int error;
4283
4284 ofpbuf_clear(actions);
4285
4286 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
4287 DPIF_UC_ACTION, userdata, actions,
4288 NULL);
4289 if (!error || error == ENOSPC) {
4290 packet_batch_init_packet(&b, packet);
4291 dp_netdev_execute_actions(pmd, &b, may_steal,
4292 actions->data, actions->size, now);
4293 } else if (may_steal) {
4294 dp_packet_delete(packet);
4295 }
4296 }
4297
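/* Callback passed to odp_execute_actions(): executes the datapath-specific
 * actions (output, tunnel push/pop, userspace, recirculation, ...) for the
 * packets in 'packets_'.  When an action cannot be completed (e.g. unknown
 * output port or exceeded recirculation depth) the batch is freed if
 * 'may_steal' permits it. */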
4298 static void
4299 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
4300 const struct nlattr *a, bool may_steal)
4301 {
4302 struct dp_netdev_execute_aux *aux = aux_;
4303 uint32_t *depth = recirc_depth_get();
4304 struct dp_netdev_pmd_thread *pmd = aux->pmd;
4305 struct dp_netdev *dp = pmd->dp;
4306 int type = nl_attr_type(a);
4307 long long now = aux->now;
4308 struct tx_port *p;
4309
4310 switch ((enum ovs_action_attr)type) {
4311 case OVS_ACTION_ATTR_OUTPUT:
4312 p = pmd_tx_port_cache_lookup(pmd, u32_to_odp(nl_attr_get_u32(a)));
4313 if (OVS_LIKELY(p)) {
4314 int tx_qid;
4315 bool dynamic_txqs;
4316
4317 dynamic_txqs = p->port->dynamic_txqs;
4318 if (dynamic_txqs) {
4319 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
4320 } else {
4321 atomic_read_relaxed(&pmd->static_tx_qid, &tx_qid);
4322 }
4323
4324 netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
4325 dynamic_txqs);
4326 return;
4327 }
4328 break;
4329
4330 case OVS_ACTION_ATTR_TUNNEL_PUSH:
4331 if (*depth < MAX_RECIRC_DEPTH) {
4332 struct dp_packet_batch tnl_pkt;
4333 struct dp_packet_batch *orig_packets_ = packets_;
4334 int err;
4335
4336 if (!may_steal) {
4337 dp_packet_batch_clone(&tnl_pkt, packets_);
4338 packets_ = &tnl_pkt;
4339 dp_packet_batch_reset_cutlen(orig_packets_);
4340 }
4341
4342 dp_packet_batch_apply_cutlen(packets_);
4343
4344 err = push_tnl_action(pmd, a, packets_);
4345 if (!err) {
4346 (*depth)++;
4347 dp_netdev_recirculate(pmd, packets_);
4348 (*depth)--;
4349 }
4350 return;
4351 }
4352 break;
4353
4354 case OVS_ACTION_ATTR_TUNNEL_POP:
4355 if (*depth < MAX_RECIRC_DEPTH) {
4356 struct dp_packet_batch *orig_packets_ = packets_;
4357 odp_port_t portno = u32_to_odp(nl_attr_get_u32(a));
4358
4359 p = pmd_tx_port_cache_lookup(pmd, portno);
4360 if (p) {
4361 struct dp_packet_batch tnl_pkt;
4362 int i;
4363
4364 if (!may_steal) {
4365 dp_packet_batch_clone(&tnl_pkt, packets_);
4366 packets_ = &tnl_pkt;
4367 dp_packet_batch_reset_cutlen(orig_packets_);
4368 }
4369
4370 dp_packet_batch_apply_cutlen(packets_);
4371
4372 netdev_pop_header(p->port->netdev, packets_);
4373 if (!packets_->count) {
4374 return;
4375 }
4376
4377 for (i = 0; i < packets_->count; i++) {
4378 packets_->packets[i]->md.in_port.odp_port = portno;
4379 }
4380
4381 (*depth)++;
4382 dp_netdev_recirculate(pmd, packets_);
4383 (*depth)--;
4384 return;
4385 }
4386 }
4387 break;
4388
4389 case OVS_ACTION_ATTR_USERSPACE:
4390 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
4391 struct dp_packet_batch *orig_packets_ = packets_;
4392 struct dp_packet **packets = packets_->packets;
4393 const struct nlattr *userdata;
4394 struct dp_packet_batch usr_pkt;
4395 struct ofpbuf actions;
4396 struct flow flow;
4397 ovs_u128 ufid;
4398 bool clone = false;
4399 int i;
4400
4401 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
4402 ofpbuf_init(&actions, 0);
4403
4404 if (packets_->trunc) {
4405 if (!may_steal) {
4406 dp_packet_batch_clone(&usr_pkt, packets_);
4407 packets_ = &usr_pkt;
4408 packets = packets_->packets;
4409 clone = true;
4410 dp_packet_batch_reset_cutlen(orig_packets_);
4411 }
4412
4413 dp_packet_batch_apply_cutlen(packets_);
4414 }
4415
4416 for (i = 0; i < packets_->count; i++) {
4417 flow_extract(packets[i], &flow);
4418 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
4419 dp_execute_userspace_action(pmd, packets[i], may_steal, &flow,
4420 &ufid, &actions, userdata, now);
4421 }
4422
4423 if (clone) {
4424 dp_packet_delete_batch(packets_, true);
4425 }
4426
4427 ofpbuf_uninit(&actions);
4428 fat_rwlock_unlock(&dp->upcall_rwlock);
4429
4430 return;
4431 }
4432 break;
4433
4434 case OVS_ACTION_ATTR_RECIRC:
4435 if (*depth < MAX_RECIRC_DEPTH) {
4436 struct dp_packet_batch recirc_pkts;
4437 int i;
4438
4439 if (!may_steal) {
4440 dp_packet_batch_clone(&recirc_pkts, packets_);
4441 packets_ = &recirc_pkts;
4442 }
4443
4444 for (i = 0; i < packets_->count; i++) {
4445 packets_->packets[i]->md.recirc_id = nl_attr_get_u32(a);
4446 }
4447
4448 (*depth)++;
4449 dp_netdev_recirculate(pmd, packets_);
4450 (*depth)--;
4451
4452 return;
4453 }
4454
4455 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
4456 break;
4457
4458 case OVS_ACTION_ATTR_CT:
4459 /* If a flow with this action is slow-pathed, datapath assistance is
4460 * required to implement it. However, we don't support this action
4461 * in the userspace datapath. */
4462 VLOG_WARN("Cannot execute conntrack action in userspace.");
4463 break;
4464
4465 case OVS_ACTION_ATTR_PUSH_VLAN:
4466 case OVS_ACTION_ATTR_POP_VLAN:
4467 case OVS_ACTION_ATTR_PUSH_MPLS:
4468 case OVS_ACTION_ATTR_POP_MPLS:
4469 case OVS_ACTION_ATTR_SET:
4470 case OVS_ACTION_ATTR_SET_MASKED:
4471 case OVS_ACTION_ATTR_SAMPLE:
4472 case OVS_ACTION_ATTR_HASH:
4473 case OVS_ACTION_ATTR_UNSPEC:
4474 case OVS_ACTION_ATTR_TRUNC:
4475 case __OVS_ACTION_ATTR_MAX:
4476 OVS_NOT_REACHED();
4477 }
4478
4479 dp_packet_delete_batch(packets_, may_steal);
4480 }
4481
4482 static void
4483 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
4484 struct dp_packet_batch *packets,
4485 bool may_steal,
4486 const struct nlattr *actions, size_t actions_len,
4487 long long now)
4488 {
4489 struct dp_netdev_execute_aux aux = { pmd, now };
4490
4491 odp_execute_actions(&aux, packets, may_steal, actions,
4492 actions_len, dp_execute_cb);
4493 }
4494
4495 const struct dpif_class dpif_netdev_class = {
4496 "netdev",
4497 dpif_netdev_init,
4498 dpif_netdev_enumerate,
4499 dpif_netdev_port_open_type,
4500 dpif_netdev_open,
4501 dpif_netdev_close,
4502 dpif_netdev_destroy,
4503 dpif_netdev_run,
4504 dpif_netdev_wait,
4505 dpif_netdev_get_stats,
4506 dpif_netdev_port_add,
4507 dpif_netdev_port_del,
4508 dpif_netdev_port_set_config,
4509 dpif_netdev_port_query_by_number,
4510 dpif_netdev_port_query_by_name,
4511 NULL, /* port_get_pid */
4512 dpif_netdev_port_dump_start,
4513 dpif_netdev_port_dump_next,
4514 dpif_netdev_port_dump_done,
4515 dpif_netdev_port_poll,
4516 dpif_netdev_port_poll_wait,
4517 dpif_netdev_flow_flush,
4518 dpif_netdev_flow_dump_create,
4519 dpif_netdev_flow_dump_destroy,
4520 dpif_netdev_flow_dump_thread_create,
4521 dpif_netdev_flow_dump_thread_destroy,
4522 dpif_netdev_flow_dump_next,
4523 dpif_netdev_operate,
4524 NULL, /* recv_set */
4525 NULL, /* handlers_set */
4526 dpif_netdev_pmd_set,
4527 dpif_netdev_queue_to_priority,
4528 NULL, /* recv */
4529 NULL, /* recv_wait */
4530 NULL, /* recv_purge */
4531 dpif_netdev_register_dp_purge_cb,
4532 dpif_netdev_register_upcall_cb,
4533 dpif_netdev_enable_upcall,
4534 dpif_netdev_disable_upcall,
4535 dpif_netdev_get_datapath_version,
4536 NULL, /* ct_dump_start */
4537 NULL, /* ct_dump_next */
4538 NULL, /* ct_dump_done */
4539 NULL, /* ct_flush */
4540 };
4541
4542 static void
4543 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
4544 const char *argv[], void *aux OVS_UNUSED)
4545 {
4546 struct dp_netdev_port *port;
4547 struct dp_netdev *dp;
4548 odp_port_t port_no;
4549
4550 ovs_mutex_lock(&dp_netdev_mutex);
4551 dp = shash_find_data(&dp_netdevs, argv[1]);
4552 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
4553 ovs_mutex_unlock(&dp_netdev_mutex);
4554 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
4555 return;
4556 }
4557 ovs_refcount_ref(&dp->ref_cnt);
4558 ovs_mutex_unlock(&dp_netdev_mutex);
4559
4560 ovs_mutex_lock(&dp->port_mutex);
4561 if (get_port_by_name(dp, argv[2], &port)) {
4562 unixctl_command_reply_error(conn, "unknown port");
4563 goto exit;
4564 }
4565
4566 port_no = u32_to_odp(atoi(argv[3]));
4567 if (!port_no || port_no == ODPP_NONE) {
4568 unixctl_command_reply_error(conn, "bad port number");
4569 goto exit;
4570 }
4571 if (dp_netdev_lookup_port(dp, port_no)) {
4572 unixctl_command_reply_error(conn, "port number already in use");
4573 goto exit;
4574 }
4575
4576 /* Remove port. */
4577 hmap_remove(&dp->ports, &port->node);
4578 dp_netdev_del_port_from_all_pmds(dp, port);
4579
4580 /* Reinsert with new port number. */
4581 port->port_no = port_no;
4582 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
4583 dp_netdev_add_port_to_pmds(dp, port);
4584
4585 seq_change(dp->port_seq);
4586 unixctl_command_reply(conn, NULL);
4587
4588 exit:
4589 ovs_mutex_unlock(&dp->port_mutex);
4590 dp_netdev_unref(dp);
4591 }
4592
4593 static void
4594 dpif_dummy_register__(const char *type)
4595 {
4596 struct dpif_class *class;
4597
4598 class = xmalloc(sizeof *class);
4599 *class = dpif_netdev_class;
4600 class->type = xstrdup(type);
4601 dp_register_provider(class);
4602 }
4603
4604 static void
4605 dpif_dummy_override(const char *type)
4606 {
4607 int error;
4608
4609 /*
4610 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
4611 * a userland-only build. It's useful for the testsuite.
4612 */
4613 error = dp_unregister_provider(type);
4614 if (error == 0 || error == EAFNOSUPPORT) {
4615 dpif_dummy_register__(type);
4616 }
4617 }
4618
4619 void
4620 dpif_dummy_register(enum dummy_level level)
4621 {
4622 if (level == DUMMY_OVERRIDE_ALL) {
4623 struct sset types;
4624 const char *type;
4625
4626 sset_init(&types);
4627 dp_enumerate_types(&types);
4628 SSET_FOR_EACH (type, &types) {
4629 dpif_dummy_override(type);
4630 }
4631 sset_destroy(&types);
4632 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
4633 dpif_dummy_override("system");
4634 }
4635
4636 dpif_dummy_register__("dummy");
4637
4638 unixctl_command_register("dpif-dummy/change-port-number",
4639 "dp port new-number",
4640 3, 3, dpif_dummy_change_port_number, NULL);
4641 }
4642 \f
4643 /* Datapath Classifier. */
4644
4645 /* A set of rules that all have the same fields wildcarded. */
4646 struct dpcls_subtable {
4647 /* This field is only used by writers. */
4648 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
4649
4650 /* These fields are accessed by readers. */
4651 struct cmap rules; /* Contains "struct dpcls_rule"s. */
4652 struct netdev_flow_key mask; /* Wildcards for fields (const). */
4653 /* 'mask' must be the last field, additional space is allocated here. */
4654 };
4655
4656 /* Initializes 'cls' as a classifier that initially contains no classification
4657 * rules. */
4658 static void
4659 dpcls_init(struct dpcls *cls)
4660 {
4661 cmap_init(&cls->subtables_map);
4662 pvector_init(&cls->subtables);
4663 }
4664
4665 static void
4666 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
4667 {
4668 pvector_remove(&cls->subtables, subtable);
4669 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
4670 subtable->mask.hash);
4671 cmap_destroy(&subtable->rules);
4672 ovsrcu_postpone(free, subtable);
4673 }
4674
4675 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
4676 * caller's responsibility.
4677 * May only be called after all the readers have been terminated. */
4678 static void
4679 dpcls_destroy(struct dpcls *cls)
4680 {
4681 if (cls) {
4682 struct dpcls_subtable *subtable;
4683
4684 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
4685 ovs_assert(cmap_count(&subtable->rules) == 0);
4686 dpcls_destroy_subtable(cls, subtable);
4687 }
4688 cmap_destroy(&cls->subtables_map);
4689 pvector_destroy(&cls->subtables);
4690 }
4691 }
4692
4693 static struct dpcls_subtable *
4694 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4695 {
4696 struct dpcls_subtable *subtable;
4697
4698 /* Need to add one. */
4699 subtable = xmalloc(sizeof *subtable
4700 - sizeof subtable->mask.mf + mask->len);
4701 cmap_init(&subtable->rules);
4702 netdev_flow_key_clone(&subtable->mask, mask);
4703 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
4704 pvector_insert(&cls->subtables, subtable, 0);
4705 pvector_publish(&cls->subtables);
4706
4707 return subtable;
4708 }
4709
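/* Returns the subtable of 'cls' whose mask equals 'mask', creating it first if
 * no such subtable exists yet. */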
4710 static inline struct dpcls_subtable *
4711 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4712 {
4713 struct dpcls_subtable *subtable;
4714
4715 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
4716 &cls->subtables_map) {
4717 if (netdev_flow_key_equal(&subtable->mask, mask)) {
4718 return subtable;
4719 }
4720 }
4721 return dpcls_create_subtable(cls, mask);
4722 }
4723
4724 /* Insert 'rule' into 'cls'. */
4725 static void
4726 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
4727 const struct netdev_flow_key *mask)
4728 {
4729 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
4730
4731 rule->mask = &subtable->mask;
4732 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
4733 }
4734
4735 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
4736 static void
4737 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
4738 {
4739 struct dpcls_subtable *subtable;
4740
4741 ovs_assert(rule->mask);
4742
4743 INIT_CONTAINER(subtable, rule->mask, mask);
4744
4745 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
4746 == 0) {
4747 dpcls_destroy_subtable(cls, subtable);
4748 pvector_publish(&cls->subtables);
4749 }
4750 }
4751
4752 /* Returns true if 'target' satisfies 'rule': for each 1-bit in the rule's
4753 * mask, the corresponding values in the rule's flow and in 'target' match. */
4754 static inline bool
4755 dpcls_rule_matches_key(const struct dpcls_rule *rule,
4756 const struct netdev_flow_key *target)
4757 {
4758 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
4759 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
4760 uint64_t value;
4761
4762 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
4763 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
4764 return false;
4765 }
4766 }
4767 return true;
4768 }
4769
4770 /* For each miniflow in 'keys' performs a classifier lookup, writing the result
4771 * into the corresponding slot in 'rules'. If no rule matches a particular key,
4772 * the corresponding slot in 'rules' is left NULL.
4773 *
4774 * This function is optimized for use in the userspace datapath and therefore
4775 * does not implement a lot of features available in the standard
4776 * classifier_lookup() function. Specifically, it does not implement
4777 * priorities, instead returning any rule which matches the flow.
4778 *
4779 * Returns true if all flows found a corresponding rule. */
4780 static bool
4781 dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
4782 struct dpcls_rule **rules, const size_t cnt)
4783 {
4784 /* The batch size 16 was experimentally found faster than 8 or 32. */
4785 typedef uint16_t map_type;
4786 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
4787
4788 #if !defined(__CHECKER__) && !defined(_WIN32)
4789 const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
4790 #else
4791 enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
4792 #endif
4793 map_type maps[N_MAPS];
4794 struct dpcls_subtable *subtable;
4795
4796 memset(maps, 0xff, sizeof maps);
4797 if (cnt % MAP_BITS) {
4798 maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
4799 }
4800 memset(rules, 0, cnt * sizeof *rules);
4801
4802 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
4803 const struct netdev_flow_key *mkeys = keys;
4804 struct dpcls_rule **mrules = rules;
4805 map_type remains = 0;
4806 int m;
4807
4808 BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
4809
4810 for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
4811 uint32_t hashes[MAP_BITS];
4812 const struct cmap_node *nodes[MAP_BITS];
4813 unsigned long map = maps[m];
4814 int i;
4815
4816 if (!map) {
4817 continue; /* Skip empty maps. */
4818 }
4819
4820 /* Compute hashes for the remaining keys. */
4821 ULLONG_FOR_EACH_1(i, map) {
4822 hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
4823 &subtable->mask);
4824 }
4825 /* Lookup. */
4826 map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
4827 /* Check results. */
4828 ULLONG_FOR_EACH_1(i, map) {
4829 struct dpcls_rule *rule;
4830
4831 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
4832 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
4833 mrules[i] = rule;
4834 goto next;
4835 }
4836 }
4837 ULLONG_SET0(map, i); /* Did not match. */
4838 next:
4839 ; /* Keep Sparse happy. */
4840 }
4841 maps[m] &= ~map; /* Clear the found rules. */
4842 remains |= maps[m];
4843 }
4844 if (!remains) {
4845 return true; /* All found. */
4846 }
4847 }
4848 return false; /* Some misses. */
4849 }