lib/dpif-netdev.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <netinet/in.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/socket.h>
31 #include <sys/stat.h>
32 #include <unistd.h>
33
34 #ifdef DPDK_NETDEV
35 #include <rte_cycles.h>
36 #endif
37
38 #include "bitmap.h"
39 #include "cmap.h"
40 #include "conntrack.h"
41 #include "coverage.h"
42 #include "ct-dpif.h"
43 #include "csum.h"
44 #include "dp-packet.h"
45 #include "dpif.h"
46 #include "dpif-provider.h"
47 #include "dummy.h"
48 #include "fat-rwlock.h"
49 #include "flow.h"
50 #include "hmapx.h"
51 #include "latch.h"
52 #include "netdev.h"
53 #include "netdev-vport.h"
54 #include "netlink.h"
55 #include "odp-execute.h"
56 #include "odp-util.h"
57 #include "openvswitch/dynamic-string.h"
58 #include "openvswitch/list.h"
59 #include "openvswitch/match.h"
60 #include "openvswitch/ofp-print.h"
61 #include "openvswitch/ofp-util.h"
62 #include "openvswitch/ofpbuf.h"
63 #include "openvswitch/shash.h"
64 #include "openvswitch/vlog.h"
65 #include "ovs-numa.h"
66 #include "ovs-rcu.h"
67 #include "packets.h"
68 #include "poll-loop.h"
69 #include "pvector.h"
70 #include "random.h"
71 #include "seq.h"
72 #include "smap.h"
73 #include "sset.h"
74 #include "timeval.h"
75 #include "tnl-neigh-cache.h"
76 #include "tnl-ports.h"
77 #include "unixctl.h"
78 #include "util.h"
79
80 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
81
82 #define FLOW_DUMP_MAX_BATCH 50
83 /* Use per-thread recirc_depth to prevent recirculation loops. */
84 #define MAX_RECIRC_DEPTH 5
85 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
86
87 /* Configuration parameters. */
88 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
89
90 /* Protects against changes to 'dp_netdevs'. */
91 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
92
93 /* Contains all 'struct dp_netdev's. */
94 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
95 = SHASH_INITIALIZER(&dp_netdevs);
96
97 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
98
99 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
100 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED)
101 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
102
103 static struct odp_support dp_netdev_support = {
104 .max_mpls_depth = SIZE_MAX,
105 .recirc = true,
106 .ct_state = true,
107 .ct_zone = true,
108 .ct_mark = true,
109 .ct_label = true,
110 };
111
112 /* Stores a miniflow with inline values */
113
114 struct netdev_flow_key {
115 uint32_t hash; /* Hash function differs for different users. */
116 uint32_t len; /* Length of the following miniflow (incl. map). */
117 struct miniflow mf;
118 uint64_t buf[FLOW_MAX_PACKET_U64S];
119 };
120
121 /* Exact match cache for frequently used flows
122 *
123 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
124 * search its entries for a miniflow that matches exactly the miniflow of the
125 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
126 *
127 * A cache entry holds a reference to its 'dp_netdev_flow'.
128 *
129 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
130 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
131 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
132 * value is the index of a cache entry where the miniflow could be.
133 *
134 *
135 * Thread-safety
136 * =============
137 *
138 * Each pmd_thread has its own private exact match cache.
139 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
140 */
141
142 #define EM_FLOW_HASH_SHIFT 13
143 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
144 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
145 #define EM_FLOW_HASH_SEGS 2
146
147 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
148 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
149 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
150 DEFAULT_EM_FLOW_INSERT_INV_PROB)
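
/* Editorial sketch (not part of the upstream file): the threshold above is
 * intended to be compared against a uniformly distributed 32-bit random
 * value, so that on average only 1 packet in DEFAULT_EM_FLOW_INSERT_INV_PROB
 * results in an EMC insertion:
 *
 *     uint32_t min;
 *     atomic_read_relaxed(&dp->emc_insert_min, &min);
 *     if (min && random_uint32() <= min) {
 *         emc_insert(&pmd->flow_cache, key, flow);
 *     }
 *
 * A threshold of 0 disables EMC insertion altogether. */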
151
152 struct emc_entry {
153 struct dp_netdev_flow *flow;
154 struct netdev_flow_key key; /* key.hash used for emc hash value. */
155 };
156
157 struct emc_cache {
158 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
159 int sweep_idx; /* For emc_cache_slow_sweep(). */
160 };
161
162 /* Iterate in the exact match cache through every entry that might contain a
163 * miniflow with hash 'HASH'. */
164 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
165 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
166 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
167 i__ < EM_FLOW_HASH_SEGS; \
168 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
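
/* Editorial worked example: with EM_FLOW_HASH_SHIFT 13 (mask 0x1fff) and
 * EM_FLOW_HASH_SEGS 2, a packet hash of 0x00c0ffee probes two slots:
 *
 *     seg 0:  0x00c0ffee        & 0x1fff == 0x1fee
 *     seg 1: (0x00c0ffee >> 13) & 0x1fff == 0x0607
 *
 * so EMC_FOR_EACH_POS_WITH_HASH() visits entries[0x1fee] and then
 * entries[0x0607]; the remaining 6 bits of the hash are unused. */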
169 \f
170 /* Simple non-wildcarding single-priority classifier. */
171
172 /* Time in ms between successive optimizations of the dpcls subtable vector */
173 #define DPCLS_OPTIMIZATION_INTERVAL 1000
174
175 struct dpcls {
176 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
177 odp_port_t in_port;
178 struct cmap subtables_map;
179 struct pvector subtables;
180 };
181
182 /* A rule to be inserted to the classifier. */
183 struct dpcls_rule {
184 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
185 struct netdev_flow_key *mask; /* Subtable's mask. */
186 struct netdev_flow_key flow; /* Matching key. */
187 /* 'flow' must be the last field, additional space is allocated here. */
188 };
189
190 static void dpcls_init(struct dpcls *);
191 static void dpcls_destroy(struct dpcls *);
192 static void dpcls_sort_subtable_vector(struct dpcls *);
193 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
194 const struct netdev_flow_key *mask);
195 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
196 static bool dpcls_lookup(struct dpcls *cls,
197 const struct netdev_flow_key keys[],
198 struct dpcls_rule **rules, size_t cnt,
199 int *num_lookups_p);
200 \f
201 /* Datapath based on the network device interface from netdev.h.
202 *
203 *
204 * Thread-safety
205 * =============
206 *
207 * Some members, marked 'const', are immutable. Accessing other members
208 * requires synchronization, as noted in more detail below.
209 *
210 * Acquisition order is, from outermost to innermost:
211 *
212 * dp_netdev_mutex (global)
213 * port_mutex
214 * non_pmd_mutex
215 */
216 struct dp_netdev {
217 const struct dpif_class *const class;
218 const char *const name;
219 struct dpif *dpif;
220 struct ovs_refcount ref_cnt;
221 atomic_flag destroyed;
222
223 /* Ports.
224 *
225 * Any lookup into 'ports' or any access to the dp_netdev_ports found
226 * through 'ports' requires taking 'port_mutex'. */
227 struct ovs_mutex port_mutex;
228 struct hmap ports;
229 struct seq *port_seq; /* Incremented whenever a port changes. */
230
231 /* Protects access to ofproto-dpif-upcall interface during revalidator
232 * thread synchronization. */
233 struct fat_rwlock upcall_rwlock;
234 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
235 void *upcall_aux;
236
237 /* Callback function for notifying about the purging of dp flows (during
238 * pmd reset or deletion). */
239 dp_purge_callback *dp_purge_cb;
240 void *dp_purge_aux;
241
242 /* Stores all 'struct dp_netdev_pmd_thread's. */
243 struct cmap poll_threads;
244
245 /* Protects access to the 'struct dp_netdev_pmd_thread'
246 * instance for the non-pmd thread. */
247 struct ovs_mutex non_pmd_mutex;
248
249 /* Each pmd thread will store its pointer to
250 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
251 ovsthread_key_t per_pmd_key;
252
253 struct seq *reconfigure_seq;
254 uint64_t last_reconfigure_seq;
255
256 /* CPU mask for pinning pmd threads. */
257 char *pmd_cmask;
258
259 uint64_t last_tnl_conf_seq;
260
261 struct conntrack conntrack;
262
263 /* The probability of EMC insertion is derived from 'emc_insert_min'. */
264 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
265 };
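
/* Editorial sketch of the acquisition order documented above (outermost to
 * innermost); only the relative ordering of the mutexes matters:
 *
 *     ovs_mutex_lock(&dp_netdev_mutex);
 *     ovs_mutex_lock(&dp->port_mutex);
 *     ovs_mutex_lock(&dp->non_pmd_mutex);
 *     ...
 *     ovs_mutex_unlock(&dp->non_pmd_mutex);
 *     ovs_mutex_unlock(&dp->port_mutex);
 *     ovs_mutex_unlock(&dp_netdev_mutex);
 *
 * Code that needs only an inner lock may take it alone, but code taking
 * several of them must follow this order to avoid deadlock. */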
266
267 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
268 odp_port_t)
269 OVS_REQUIRES(dp->port_mutex);
270
271 enum dp_stat_type {
272 DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
273 DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
274 DP_STAT_MISS, /* Packets that did not match. */
275 DP_STAT_LOST, /* Packets not passed up to the client. */
276 DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
277 hits */
278 DP_N_STATS
279 };
280
281 enum pmd_cycles_counter_type {
282 PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */
283 PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */
284 PMD_N_CYCLES
285 };
286
287 #define XPS_TIMEOUT_MS 500LL
288
289 /* Contained by struct dp_netdev_port's 'rxqs' member. */
290 struct dp_netdev_rxq {
291 struct dp_netdev_port *port;
292 struct netdev_rxq *rx;
293 unsigned core_id; /* Core to which this queue should be
294 pinned. OVS_CORE_UNSPEC if the
295 queue doesn't need to be pinned to a
296 particular core. */
297 struct dp_netdev_pmd_thread *pmd; /* pmd thread that will poll this queue. */
298 };
299
300 /* A port in a netdev-based datapath. */
301 struct dp_netdev_port {
302 odp_port_t port_no;
303 struct netdev *netdev;
304 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
305 struct netdev_saved_flags *sf;
306 struct dp_netdev_rxq *rxqs;
307 unsigned n_rxq; /* Number of elements in 'rxqs'. */
308 bool dynamic_txqs; /* If true, XPS will be used. */
309 unsigned *txq_used; /* Number of threads that use each tx queue. */
310 struct ovs_mutex txq_used_mutex;
311 char *type; /* Port type as requested by user. */
312 char *rxq_affinity_list; /* Requested affinity of rx queues. */
313 bool need_reconfigure; /* True if we should reconfigure netdev. */
314 };
315
316 /* Contained by struct dp_netdev_flow's 'stats' member. */
317 struct dp_netdev_flow_stats {
318 atomic_llong used; /* Last used time, in monotonic msecs. */
319 atomic_ullong packet_count; /* Number of packets matched. */
320 atomic_ullong byte_count; /* Number of bytes matched. */
321 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
322 };
323
324 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
325 *
326 *
327 * Thread-safety
328 * =============
329 *
330 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
331 * its pmd thread's classifier. The text below calls this classifier 'cls'.
332 *
333 * Motivation
334 * ----------
335 *
336 * The thread safety rules described here for "struct dp_netdev_flow" are
337 * motivated by two goals:
338 *
339 * - Prevent threads that read members of "struct dp_netdev_flow" from
340 * reading bad data due to changes by some thread concurrently modifying
341 * those members.
342 *
343 * - Prevent two threads making changes to members of a given "struct
344 * dp_netdev_flow" from interfering with each other.
345 *
346 *
347 * Rules
348 * -----
349 *
350 * A flow 'flow' may be accessed without a risk of being freed during an RCU
351 * grace period. Code that needs to hold onto a flow for a while
352 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
353 *
354 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
355 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
356 * from modification.
357 *
358 * Some members, marked 'const', are immutable. Accessing other members
359 * requires synchronization, as noted in more detail below.
360 */
361 struct dp_netdev_flow {
362 const struct flow flow; /* Unmasked flow that created this entry. */
363 /* Hash table index by unmasked flow. */
364 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
365 /* 'flow_table'. */
366 const ovs_u128 ufid; /* Unique flow identifier. */
367 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
368 /* flow. */
369
370 /* Number of references.
371 * The classifier owns one reference.
372 * Any thread trying to keep a rule from being freed should hold its own
373 * reference. */
374 struct ovs_refcount ref_cnt;
375
376 bool dead;
377
378 /* Statistics. */
379 struct dp_netdev_flow_stats stats;
380
381 /* Actions. */
382 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
383
384 /* While processing a group of input packets, the datapath uses the next
385 * member to store a pointer to the output batch for the flow. It is
386 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
387 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
388 struct packet_batch_per_flow *batch;
389
390 /* Packet classification. */
391 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
392 /* 'cr' must be the last member. */
393 };
394
395 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
396 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
397 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
398 struct flow *);
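
/* Editorial sketch of the rules above: a thread that wants to keep using a
 * flow beyond the current RCU grace period takes its own reference and drops
 * it when done (the classifier holds a separate reference of its own):
 *
 *     if (dp_netdev_flow_ref(flow)) {
 *         ...                           // 'flow' cannot be freed here.
 *         dp_netdev_flow_unref(flow);
 *     }
 *
 * dp_netdev_flow_ref() may fail once the last reference is gone, so its
 * return value must be checked. */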
399
400 /* A set of datapath actions within a "struct dp_netdev_flow".
401 *
402 *
403 * Thread-safety
404 * =============
405 *
406 * A struct dp_netdev_actions 'actions' is protected with RCU. */
407 struct dp_netdev_actions {
408 /* These members are immutable: they do not change during the struct's
409 * lifetime. */
410 unsigned int size; /* Size of 'actions', in bytes. */
411 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
412 };
413
414 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
415 size_t);
416 struct dp_netdev_actions *dp_netdev_flow_get_actions(
417 const struct dp_netdev_flow *);
418 static void dp_netdev_actions_free(struct dp_netdev_actions *);
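
/* Editorial sketch: because a flow's actions are RCU-protected, the read
 * side just fetches the current version and uses it without locking:
 *
 *     struct dp_netdev_actions *a = dp_netdev_flow_get_actions(flow);
 *     // use a->actions / a->size; valid until the next quiescent state.
 *
 * Writers install a replacement built with dp_netdev_actions_create() and
 * defer freeing the old copy (see dp_netdev_actions_free()). */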
419
420 /* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
421 struct dp_netdev_pmd_stats {
422 /* Indexed by DP_STAT_*. */
423 atomic_ullong n[DP_N_STATS];
424 };
425
426 /* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
427 struct dp_netdev_pmd_cycles {
428 /* Indexed by PMD_CYCLES_*. */
429 atomic_ullong n[PMD_N_CYCLES];
430 };
431
432 struct polled_queue {
433 struct netdev_rxq *rx;
434 odp_port_t port_no;
435 };
436
437 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
438 struct rxq_poll {
439 struct dp_netdev_rxq *rxq;
440 struct hmap_node node;
441 };
442
443 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
444 * 'tnl_port_cache' or 'tx_ports'. */
445 struct tx_port {
446 struct dp_netdev_port *port;
447 int qid;
448 long long last_used;
449 struct hmap_node node;
450 };
451
452 /* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
453 * the performance overhead of interrupt processing. Therefore netdev
454 * cannot implement rx-wait for these devices. dpif-netdev needs to poll
455 * these devices to check their receive buffers. A pmd thread does the
456 * polling for the devices assigned to it.
457 *
458 * DPDK uses PMDs for accessing NICs.
459 *
460 * Note: the instance with cpu core id NON_PMD_CORE_ID is reserved for
461 * I/O of all non-pmd threads. No actual thread is created for this
462 * instance.
463 *
464 * Each struct has its own flow table and classifier. Packets received
465 * from managed ports are looked up in the corresponding pmd thread's
466 * flow table, and are executed with the found actions.
467 * */
468 struct dp_netdev_pmd_thread {
469 struct dp_netdev *dp;
470 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
471 struct cmap_node node; /* In 'dp->poll_threads'. */
472
473 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
474 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
475
476 /* Per-thread exact-match cache. Note: the instance for cpu core
477 * NON_PMD_CORE_ID can be accessed by multiple threads and thus
478 * needs to be protected by 'non_pmd_mutex'. Every other instance
479 * will only be accessed by its own pmd thread. */
480 struct emc_cache flow_cache;
481
482 /* Flow-Table and classifiers
483 *
484 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
485 * changes to 'classifiers' must be made while still holding the
486 * 'flow_mutex'.
487 */
488 struct ovs_mutex flow_mutex;
489 struct cmap flow_table OVS_GUARDED; /* Flow table. */
490
491 /* One classifier per in_port polled by the pmd */
492 struct cmap classifiers;
493 /* Periodically sort subtable vectors according to hit frequencies */
494 long long int next_optimization;
495
496 /* Statistics. */
497 struct dp_netdev_pmd_stats stats;
498
499 /* Cycles counters */
500 struct dp_netdev_pmd_cycles cycles;
501
502 /* Used to count cycles. See 'cycles_counter_end()'. */
503 unsigned long long last_cycles;
504
505 struct latch exit_latch; /* For terminating the pmd thread. */
506 struct seq *reload_seq;
507 uint64_t last_reload_seq;
508 atomic_bool reload; /* Do we need to reload ports? */
509 pthread_t thread;
510 unsigned core_id; /* CPU core id of this pmd thread. */
511 int numa_id; /* numa node id of this pmd thread. */
512 bool isolated;
513
514 /* Queue id used by this pmd thread to send packets on all netdevs for
515 * which XPS is disabled. All static_tx_qid's are unique and less
516 * than 'cmap_count(dp->poll_threads)'. */
517 const int static_tx_qid;
518
519 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
520 /* List of rx queues to poll. */
521 struct hmap poll_list OVS_GUARDED;
522 /* Map of 'tx_port's used for transmission. Written by the main thread,
523 * read by the pmd thread. */
524 struct hmap tx_ports OVS_GUARDED;
525
526 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
527 * ports (that support push_tunnel/pop_tunnel), the other contains ports
528 * with at least one txq (that support send). A port can be in both.
529 *
530 * There are two separate maps to make sure that we don't try to execute
531 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
532 *
533 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
534 * threads and thus need to be protected by 'non_pmd_mutex'. Every
535 * other instance will only be accessed by its own pmd thread. */
536 struct hmap tnl_port_cache;
537 struct hmap send_port_cache;
538
539 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
540 * The main thread keeps 'stats_zero' and 'cycles_zero' as base
541 * values and subtracts them from 'stats' and 'cycles' before
542 * reporting to the user */
543 unsigned long long stats_zero[DP_N_STATS];
544 uint64_t cycles_zero[PMD_N_CYCLES];
545
546 /* Set to true if the pmd thread needs to be reloaded. */
547 bool need_reload;
548 };
549
550 /* Interface to netdev-based datapath. */
551 struct dpif_netdev {
552 struct dpif dpif;
553 struct dp_netdev *dp;
554 uint64_t last_port_seq;
555 };
556
557 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
558 struct dp_netdev_port **portp)
559 OVS_REQUIRES(dp->port_mutex);
560 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
561 struct dp_netdev_port **portp)
562 OVS_REQUIRES(dp->port_mutex);
563 static void dp_netdev_free(struct dp_netdev *)
564 OVS_REQUIRES(dp_netdev_mutex);
565 static int do_add_port(struct dp_netdev *dp, const char *devname,
566 const char *type, odp_port_t port_no)
567 OVS_REQUIRES(dp->port_mutex);
568 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
569 OVS_REQUIRES(dp->port_mutex);
570 static int dpif_netdev_open(const struct dpif_class *, const char *name,
571 bool create, struct dpif **);
572 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
573 struct dp_packet_batch *,
574 bool may_steal, const struct flow *flow,
575 const struct nlattr *actions,
576 size_t actions_len,
577 long long now);
578 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
579 struct dp_packet_batch *, odp_port_t port_no);
580 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
581 struct dp_packet_batch *);
582
583 static void dp_netdev_disable_upcall(struct dp_netdev *);
584 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
585 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
586 struct dp_netdev *dp, unsigned core_id,
587 int numa_id);
588 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
589 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
590 OVS_REQUIRES(dp->port_mutex);
591
592 static void *pmd_thread_main(void *);
593 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
594 unsigned core_id);
595 static struct dp_netdev_pmd_thread *
596 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
597 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
598 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
599 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
600 struct dp_netdev_port *port)
601 OVS_REQUIRES(pmd->port_mutex);
602 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
603 struct tx_port *tx)
604 OVS_REQUIRES(pmd->port_mutex);
605 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
606 struct dp_netdev_rxq *rxq)
607 OVS_REQUIRES(pmd->port_mutex);
608 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
609 struct rxq_poll *poll)
610 OVS_REQUIRES(pmd->port_mutex);
611 static void reconfigure_datapath(struct dp_netdev *dp)
612 OVS_REQUIRES(dp->port_mutex);
613 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
614 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
615 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
616 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
617 OVS_REQUIRES(pmd->port_mutex);
618 static inline void
619 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
620
621 static void
622 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
623 long long now, bool purge);
624 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
625 struct tx_port *tx, long long now);
626
627 static inline bool emc_entry_alive(struct emc_entry *ce);
628 static void emc_clear_entry(struct emc_entry *ce);
629
630 static void
631 emc_cache_init(struct emc_cache *flow_cache)
632 {
633 int i;
634
635 flow_cache->sweep_idx = 0;
636 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
637 flow_cache->entries[i].flow = NULL;
638 flow_cache->entries[i].key.hash = 0;
639 flow_cache->entries[i].key.len = sizeof(struct miniflow);
640 flowmap_init(&flow_cache->entries[i].key.mf.map);
641 }
642 }
643
644 static void
645 emc_cache_uninit(struct emc_cache *flow_cache)
646 {
647 int i;
648
649 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
650 emc_clear_entry(&flow_cache->entries[i]);
651 }
652 }
653
654 /* Check and clear dead flow references slowly (one entry at each
655 * invocation). */
656 static void
657 emc_cache_slow_sweep(struct emc_cache *flow_cache)
658 {
659 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
660
661 if (!emc_entry_alive(entry)) {
662 emc_clear_entry(entry);
663 }
664 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
665 }
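
/* Editorial usage sketch: the sweep is cheap enough to run from the pmd main
 * loop every so often (the loop counter below is illustrative):
 *
 *     if (lc++ > 1024) {
 *         lc = 0;
 *         emc_cache_slow_sweep(&pmd->flow_cache);
 *     }
 *
 * Each call inspects a single slot, so a full pass over the cache takes
 * EM_FLOW_HASH_ENTRIES invocations. */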
666
667 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
668 bool
669 dpif_is_netdev(const struct dpif *dpif)
670 {
671 return dpif->dpif_class->open == dpif_netdev_open;
672 }
673
674 static struct dpif_netdev *
675 dpif_netdev_cast(const struct dpif *dpif)
676 {
677 ovs_assert(dpif_is_netdev(dpif));
678 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
679 }
680
681 static struct dp_netdev *
682 get_dp_netdev(const struct dpif *dpif)
683 {
684 return dpif_netdev_cast(dpif)->dp;
685 }
686 \f
687 enum pmd_info_type {
688 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
689 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
690 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
691 };
692
693 static void
694 pmd_info_show_stats(struct ds *reply,
695 struct dp_netdev_pmd_thread *pmd,
696 unsigned long long stats[DP_N_STATS],
697 uint64_t cycles[PMD_N_CYCLES])
698 {
699 unsigned long long total_packets = 0;
700 uint64_t total_cycles = 0;
701 int i;
702
703 /* These loops subtract the reference values ('*_zero') from the counters.
704 * Since loads and stores are relaxed, it might be possible for a '*_zero'
705 * value to be more recent than the current value we're reading from the
706 * counter. This is not a big problem, since these numbers are not
707 * supposed to be too accurate, but we should at least make sure that
708 * the result is not negative. */
709 for (i = 0; i < DP_N_STATS; i++) {
710 if (stats[i] > pmd->stats_zero[i]) {
711 stats[i] -= pmd->stats_zero[i];
712 } else {
713 stats[i] = 0;
714 }
715
716 if (i != DP_STAT_LOST) {
717 /* Lost packets are already included in DP_STAT_MISS */
718 total_packets += stats[i];
719 }
720 }
721
722 for (i = 0; i < PMD_N_CYCLES; i++) {
723 if (cycles[i] > pmd->cycles_zero[i]) {
724 cycles[i] -= pmd->cycles_zero[i];
725 } else {
726 cycles[i] = 0;
727 }
728
729 total_cycles += cycles[i];
730 }
731
732 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
733 ? "main thread" : "pmd thread");
734
735 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
736 ds_put_format(reply, " numa_id %d", pmd->numa_id);
737 }
738 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
739 ds_put_format(reply, " core_id %u", pmd->core_id);
740 }
741 ds_put_cstr(reply, ":\n");
742
743 ds_put_format(reply,
744 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
745 "\tavg. subtable lookups per hit:%.2f\n"
746 "\tmiss:%llu\n\tlost:%llu\n",
747 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
748 stats[DP_STAT_MASKED_HIT] > 0
749 ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
750 : 0,
751 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
752
753 if (total_cycles == 0) {
754 return;
755 }
756
757 ds_put_format(reply,
758 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
759 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
760 cycles[PMD_CYCLES_POLLING],
761 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
762 cycles[PMD_CYCLES_PROCESSING],
763 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
764
765 if (total_packets == 0) {
766 return;
767 }
768
769 ds_put_format(reply,
770 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
771 total_cycles / (double)total_packets,
772 total_cycles, total_packets);
773
774 ds_put_format(reply,
775 "\tavg processing cycles per packet: "
776 "%.02f (%"PRIu64"/%llu)\n",
777 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
778 cycles[PMD_CYCLES_PROCESSING], total_packets);
779 }
780
781 static void
782 pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
783 struct dp_netdev_pmd_thread *pmd,
784 unsigned long long stats[DP_N_STATS],
785 uint64_t cycles[PMD_N_CYCLES])
786 {
787 int i;
788
789 /* We cannot write 'stats' and 'cycles' (because they're written by other
790 * threads) and we shouldn't change 'stats' (because they're used to count
791 * datapath stats, which must not be cleared here). Instead, we save the
792 * current values and subtract them from the values to be displayed in the
793 * future */
794 for (i = 0; i < DP_N_STATS; i++) {
795 pmd->stats_zero[i] = stats[i];
796 }
797 for (i = 0; i < PMD_N_CYCLES; i++) {
798 pmd->cycles_zero[i] = cycles[i];
799 }
800 }
801
802 static int
803 compare_poll_list(const void *a_, const void *b_)
804 {
805 const struct rxq_poll *a = a_;
806 const struct rxq_poll *b = b_;
807
808 const char *namea = netdev_rxq_get_name(a->rxq->rx);
809 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
810
811 int cmp = strcmp(namea, nameb);
812 if (!cmp) {
813 return netdev_rxq_get_queue_id(a->rxq->rx)
814 - netdev_rxq_get_queue_id(b->rxq->rx);
815 } else {
816 return cmp;
817 }
818 }
819
820 static void
821 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
822 size_t *n)
823 {
824 struct rxq_poll *ret, *poll;
825 size_t i;
826
827 *n = hmap_count(&pmd->poll_list);
828 if (!*n) {
829 ret = NULL;
830 } else {
831 ret = xcalloc(*n, sizeof *ret);
832 i = 0;
833 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
834 ret[i] = *poll;
835 i++;
836 }
837 ovs_assert(i == *n);
838 }
839
840 qsort(ret, *n, sizeof *ret, compare_poll_list);
841
842 *list = ret;
843 }
844
845 static void
846 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
847 {
848 if (pmd->core_id != NON_PMD_CORE_ID) {
849 const char *prev_name = NULL;
850 struct rxq_poll *list;
851 size_t i, n;
852
853 ds_put_format(reply,
854 "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
855 pmd->numa_id, pmd->core_id, (pmd->isolated)
856 ? "true" : "false");
857
858 ovs_mutex_lock(&pmd->port_mutex);
859 sorted_poll_list(pmd, &list, &n);
860 for (i = 0; i < n; i++) {
861 const char *name = netdev_rxq_get_name(list[i].rxq->rx);
862
863 if (!prev_name || strcmp(name, prev_name)) {
864 if (prev_name) {
865 ds_put_cstr(reply, "\n");
866 }
867 ds_put_format(reply, "\tport: %s\tqueue-id:", name);
868 }
869 ds_put_format(reply, " %d",
870 netdev_rxq_get_queue_id(list[i].rxq->rx));
871 prev_name = name;
872 }
873 ovs_mutex_unlock(&pmd->port_mutex);
874 ds_put_cstr(reply, "\n");
875 free(list);
876 }
877 }
878
879 static void
880 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
881 void *aux)
882 {
883 struct ds reply = DS_EMPTY_INITIALIZER;
884 struct dp_netdev_pmd_thread *pmd;
885 struct dp_netdev *dp = NULL;
886 enum pmd_info_type type = *(enum pmd_info_type *) aux;
887
888 ovs_mutex_lock(&dp_netdev_mutex);
889
890 if (argc == 2) {
891 dp = shash_find_data(&dp_netdevs, argv[1]);
892 } else if (shash_count(&dp_netdevs) == 1) {
893 /* There's only one datapath */
894 dp = shash_first(&dp_netdevs)->data;
895 }
896
897 if (!dp) {
898 ovs_mutex_unlock(&dp_netdev_mutex);
899 unixctl_command_reply_error(conn,
900 "please specify an existing datapath");
901 return;
902 }
903
904 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
905 if (type == PMD_INFO_SHOW_RXQ) {
906 pmd_info_show_rxq(&reply, pmd);
907 } else {
908 unsigned long long stats[DP_N_STATS];
909 uint64_t cycles[PMD_N_CYCLES];
910 int i;
911
912 /* Read current stats and cycle counters */
913 for (i = 0; i < ARRAY_SIZE(stats); i++) {
914 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
915 }
916 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
917 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
918 }
919
920 if (type == PMD_INFO_CLEAR_STATS) {
921 pmd_info_clear_stats(&reply, pmd, stats, cycles);
922 } else if (type == PMD_INFO_SHOW_STATS) {
923 pmd_info_show_stats(&reply, pmd, stats, cycles);
924 }
925 }
926 }
927
928 ovs_mutex_unlock(&dp_netdev_mutex);
929
930 unixctl_command_reply(conn, ds_cstr(&reply));
931 ds_destroy(&reply);
932 }
933 \f
934 static int
935 dpif_netdev_init(void)
936 {
937 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
938 clear_aux = PMD_INFO_CLEAR_STATS,
939 poll_aux = PMD_INFO_SHOW_RXQ;
940
941 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
942 0, 1, dpif_netdev_pmd_info,
943 (void *)&show_aux);
944 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
945 0, 1, dpif_netdev_pmd_info,
946 (void *)&clear_aux);
947 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
948 0, 1, dpif_netdev_pmd_info,
949 (void *)&poll_aux);
950 return 0;
951 }
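
/* The commands registered above are reachable through ovs-appctl, e.g.:
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show
 *     ovs-appctl dpif-netdev/pmd-stats-clear
 *
 * The optional [dp] argument selects a datapath when more than one exists
 * (see dpif_netdev_pmd_info() above). */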
952
953 static int
954 dpif_netdev_enumerate(struct sset *all_dps,
955 const struct dpif_class *dpif_class)
956 {
957 struct shash_node *node;
958
959 ovs_mutex_lock(&dp_netdev_mutex);
960 SHASH_FOR_EACH(node, &dp_netdevs) {
961 struct dp_netdev *dp = node->data;
962 if (dpif_class != dp->class) {
963 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
964 * If the class doesn't match, skip this dpif. */
965 continue;
966 }
967 sset_add(all_dps, node->name);
968 }
969 ovs_mutex_unlock(&dp_netdev_mutex);
970
971 return 0;
972 }
973
974 static bool
975 dpif_netdev_class_is_dummy(const struct dpif_class *class)
976 {
977 return class != &dpif_netdev_class;
978 }
979
980 static const char *
981 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
982 {
983 return strcmp(type, "internal") ? type
984 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
985 : "tap";
986 }
987
988 static struct dpif *
989 create_dpif_netdev(struct dp_netdev *dp)
990 {
991 uint16_t netflow_id = hash_string(dp->name, 0);
992 struct dpif_netdev *dpif;
993
994 ovs_refcount_ref(&dp->ref_cnt);
995
996 dpif = xmalloc(sizeof *dpif);
997 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
998 dpif->dp = dp;
999 dpif->last_port_seq = seq_read(dp->port_seq);
1000
1001 return &dpif->dpif;
1002 }
1003
1004 /* Choose an unused, non-zero port number and return it on success.
1005 * Return ODPP_NONE on failure. */
1006 static odp_port_t
1007 choose_port(struct dp_netdev *dp, const char *name)
1008 OVS_REQUIRES(dp->port_mutex)
1009 {
1010 uint32_t port_no;
1011
1012 if (dp->class != &dpif_netdev_class) {
1013 const char *p;
1014 int start_no = 0;
1015
1016 /* If the port name begins with "br", start the number search at
1017 * 100 to make writing tests easier. */
1018 if (!strncmp(name, "br", 2)) {
1019 start_no = 100;
1020 }
1021
1022 /* If the port name contains a number, try to assign that port number.
1023 * This can make writing unit tests easier because port numbers are
1024 * predictable. */
1025 for (p = name; *p != '\0'; p++) {
1026 if (isdigit((unsigned char) *p)) {
1027 port_no = start_no + strtol(p, NULL, 10);
1028 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1029 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1030 return u32_to_odp(port_no);
1031 }
1032 break;
1033 }
1034 }
1035 }
1036
1037 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1038 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1039 return u32_to_odp(port_no);
1040 }
1041 }
1042
1043 return ODPP_NONE;
1044 }
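
/* Editorial worked examples of the heuristic above (it only applies to
 * non-"netdev" classes, i.e. the dummy datapaths used in the testsuite):
 *
 *     "eth2" -> tries port 2     (first digit run, start_no == 0)
 *     "br17" -> tries port 117   (name starts with "br", so 100 + 17)
 *
 * If that candidate is 0, equal to ODPP_NONE, or already in use, the
 * function falls back to the sequential scan starting at port 1. */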
1045
1046 static int
1047 create_dp_netdev(const char *name, const struct dpif_class *class,
1048 struct dp_netdev **dpp)
1049 OVS_REQUIRES(dp_netdev_mutex)
1050 {
1051 struct dp_netdev *dp;
1052 int error;
1053
1054 dp = xzalloc(sizeof *dp);
1055 shash_add(&dp_netdevs, name, dp);
1056
1057 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1058 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1059 ovs_refcount_init(&dp->ref_cnt);
1060 atomic_flag_clear(&dp->destroyed);
1061
1062 ovs_mutex_init(&dp->port_mutex);
1063 hmap_init(&dp->ports);
1064 dp->port_seq = seq_create();
1065 fat_rwlock_init(&dp->upcall_rwlock);
1066
1067 dp->reconfigure_seq = seq_create();
1068 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1069
1070 /* Disable upcalls by default. */
1071 dp_netdev_disable_upcall(dp);
1072 dp->upcall_aux = NULL;
1073 dp->upcall_cb = NULL;
1074
1075 conntrack_init(&dp->conntrack);
1076
1077 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1078
1079 cmap_init(&dp->poll_threads);
1080 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1081 ovsthread_key_create(&dp->per_pmd_key, NULL);
1082
1083 ovs_mutex_lock(&dp->port_mutex);
1084 dp_netdev_set_nonpmd(dp);
1085
1086 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1087 "internal"),
1088 ODPP_LOCAL);
1089 ovs_mutex_unlock(&dp->port_mutex);
1090 if (error) {
1091 dp_netdev_free(dp);
1092 return error;
1093 }
1094
1095 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1096 *dpp = dp;
1097 return 0;
1098 }
1099
1100 static void
1101 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1102 {
1103 seq_change(dp->reconfigure_seq);
1104 }
1105
1106 static bool
1107 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1108 {
1109 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1110 }
1111
1112 static int
1113 dpif_netdev_open(const struct dpif_class *class, const char *name,
1114 bool create, struct dpif **dpifp)
1115 {
1116 struct dp_netdev *dp;
1117 int error;
1118
1119 ovs_mutex_lock(&dp_netdev_mutex);
1120 dp = shash_find_data(&dp_netdevs, name);
1121 if (!dp) {
1122 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1123 } else {
1124 error = (dp->class != class ? EINVAL
1125 : create ? EEXIST
1126 : 0);
1127 }
1128 if (!error) {
1129 *dpifp = create_dpif_netdev(dp);
1130 dp->dpif = *dpifp;
1131 }
1132 ovs_mutex_unlock(&dp_netdev_mutex);
1133
1134 return error;
1135 }
1136
1137 static void
1138 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1139 OVS_NO_THREAD_SAFETY_ANALYSIS
1140 {
1141 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1142 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1143
1144 /* Before freeing a lock we should release it */
1145 fat_rwlock_unlock(&dp->upcall_rwlock);
1146 fat_rwlock_destroy(&dp->upcall_rwlock);
1147 }
1148
1149 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1150 * through the 'dp_netdevs' shash while freeing 'dp'. */
1151 static void
1152 dp_netdev_free(struct dp_netdev *dp)
1153 OVS_REQUIRES(dp_netdev_mutex)
1154 {
1155 struct dp_netdev_port *port, *next;
1156
1157 shash_find_and_delete(&dp_netdevs, dp->name);
1158
1159 ovs_mutex_lock(&dp->port_mutex);
1160 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1161 do_del_port(dp, port);
1162 }
1163 ovs_mutex_unlock(&dp->port_mutex);
1164 dp_netdev_destroy_all_pmds(dp, true);
1165 cmap_destroy(&dp->poll_threads);
1166
1167 ovs_mutex_destroy(&dp->non_pmd_mutex);
1168 ovsthread_key_delete(dp->per_pmd_key);
1169
1170 conntrack_destroy(&dp->conntrack);
1171
1172
1173 seq_destroy(dp->reconfigure_seq);
1174
1175 seq_destroy(dp->port_seq);
1176 hmap_destroy(&dp->ports);
1177 ovs_mutex_destroy(&dp->port_mutex);
1178
1179 /* Upcalls must be disabled at this point */
1180 dp_netdev_destroy_upcall_lock(dp);
1181
1182 free(dp->pmd_cmask);
1183 free(CONST_CAST(char *, dp->name));
1184 free(dp);
1185 }
1186
1187 static void
1188 dp_netdev_unref(struct dp_netdev *dp)
1189 {
1190 if (dp) {
1191 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1192 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1193 ovs_mutex_lock(&dp_netdev_mutex);
1194 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1195 dp_netdev_free(dp);
1196 }
1197 ovs_mutex_unlock(&dp_netdev_mutex);
1198 }
1199 }
1200
1201 static void
1202 dpif_netdev_close(struct dpif *dpif)
1203 {
1204 struct dp_netdev *dp = get_dp_netdev(dpif);
1205
1206 dp_netdev_unref(dp);
1207 free(dpif);
1208 }
1209
1210 static int
1211 dpif_netdev_destroy(struct dpif *dpif)
1212 {
1213 struct dp_netdev *dp = get_dp_netdev(dpif);
1214
1215 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1216 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1217 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1218 OVS_NOT_REACHED();
1219 }
1220 }
1221
1222 return 0;
1223 }
1224
1225 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1226 * load/store semantics. While the increment is not atomic, the load and
1227 * store operations are, making it impossible to read inconsistent values.
1228 *
1229 * This is used to update thread local stats counters. */
1230 static void
1231 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1232 {
1233 unsigned long long tmp;
1234
1235 atomic_read_relaxed(var, &tmp);
1236 tmp += n;
1237 atomic_store_relaxed(var, tmp);
1238 }
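
/* Editorial usage sketch: this is how a pmd thread bumps one of its own
 * statistics counters, e.g. after a classifier miss:
 *
 *     non_atomic_ullong_add(&pmd->stats.n[DP_STAT_MISS], 1);
 *
 * Only the owning thread writes these counters; other threads read them
 * with atomic_read_relaxed(), so the non-atomic read-modify-write is safe. */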
1239
1240 static int
1241 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1242 {
1243 struct dp_netdev *dp = get_dp_netdev(dpif);
1244 struct dp_netdev_pmd_thread *pmd;
1245
1246 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1247 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1248 unsigned long long n;
1249 stats->n_flows += cmap_count(&pmd->flow_table);
1250
1251 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1252 stats->n_hit += n;
1253 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
1254 stats->n_hit += n;
1255 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1256 stats->n_missed += n;
1257 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1258 stats->n_lost += n;
1259 }
1260 stats->n_masks = UINT32_MAX;
1261 stats->n_mask_hit = UINT64_MAX;
1262
1263 return 0;
1264 }
1265
1266 static void
1267 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1268 {
1269 if (pmd->core_id == NON_PMD_CORE_ID) {
1270 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1271 ovs_mutex_lock(&pmd->port_mutex);
1272 pmd_load_cached_ports(pmd);
1273 ovs_mutex_unlock(&pmd->port_mutex);
1274 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1275 return;
1276 }
1277
1278 ovs_mutex_lock(&pmd->cond_mutex);
1279 seq_change(pmd->reload_seq);
1280 atomic_store_relaxed(&pmd->reload, true);
1281 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1282 ovs_mutex_unlock(&pmd->cond_mutex);
1283 }
1284
1285 static uint32_t
1286 hash_port_no(odp_port_t port_no)
1287 {
1288 return hash_int(odp_to_u32(port_no), 0);
1289 }
1290
1291 static int
1292 port_create(const char *devname, const char *type,
1293 odp_port_t port_no, struct dp_netdev_port **portp)
1294 {
1295 struct netdev_saved_flags *sf;
1296 struct dp_netdev_port *port;
1297 enum netdev_flags flags;
1298 struct netdev *netdev;
1299 int error;
1300
1301 *portp = NULL;
1302
1303 /* Open and validate network device. */
1304 error = netdev_open(devname, type, &netdev);
1305 if (error) {
1306 return error;
1307 }
1308 /* XXX reject non-Ethernet devices */
1309
1310 netdev_get_flags(netdev, &flags);
1311 if (flags & NETDEV_LOOPBACK) {
1312 VLOG_ERR("%s: cannot add a loopback device", devname);
1313 error = EINVAL;
1314 goto out;
1315 }
1316
1317 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1318 if (error) {
1319 VLOG_ERR("%s: cannot set promisc flag", devname);
1320 goto out;
1321 }
1322
1323 port = xzalloc(sizeof *port);
1324 port->port_no = port_no;
1325 port->netdev = netdev;
1326 port->type = xstrdup(type);
1327 port->sf = sf;
1328 port->need_reconfigure = true;
1329 ovs_mutex_init(&port->txq_used_mutex);
1330
1331 *portp = port;
1332
1333 return 0;
1334
1335 out:
1336 netdev_close(netdev);
1337 return error;
1338 }
1339
1340 static int
1341 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1342 odp_port_t port_no)
1343 OVS_REQUIRES(dp->port_mutex)
1344 {
1345 struct dp_netdev_port *port;
1346 int error;
1347
1348 /* Reject devices already in 'dp'. */
1349 if (!get_port_by_name(dp, devname, &port)) {
1350 return EEXIST;
1351 }
1352
1353 error = port_create(devname, type, port_no, &port);
1354 if (error) {
1355 return error;
1356 }
1357
1358 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1359 seq_change(dp->port_seq);
1360
1361 reconfigure_datapath(dp);
1362
1363 return 0;
1364 }
1365
1366 static int
1367 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1368 odp_port_t *port_nop)
1369 {
1370 struct dp_netdev *dp = get_dp_netdev(dpif);
1371 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1372 const char *dpif_port;
1373 odp_port_t port_no;
1374 int error;
1375
1376 ovs_mutex_lock(&dp->port_mutex);
1377 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1378 if (*port_nop != ODPP_NONE) {
1379 port_no = *port_nop;
1380 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1381 } else {
1382 port_no = choose_port(dp, dpif_port);
1383 error = port_no == ODPP_NONE ? EFBIG : 0;
1384 }
1385 if (!error) {
1386 *port_nop = port_no;
1387 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1388 }
1389 ovs_mutex_unlock(&dp->port_mutex);
1390
1391 return error;
1392 }
1393
1394 static int
1395 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1396 {
1397 struct dp_netdev *dp = get_dp_netdev(dpif);
1398 int error;
1399
1400 ovs_mutex_lock(&dp->port_mutex);
1401 if (port_no == ODPP_LOCAL) {
1402 error = EINVAL;
1403 } else {
1404 struct dp_netdev_port *port;
1405
1406 error = get_port_by_number(dp, port_no, &port);
1407 if (!error) {
1408 do_del_port(dp, port);
1409 }
1410 }
1411 ovs_mutex_unlock(&dp->port_mutex);
1412
1413 return error;
1414 }
1415
1416 static bool
1417 is_valid_port_number(odp_port_t port_no)
1418 {
1419 return port_no != ODPP_NONE;
1420 }
1421
1422 static struct dp_netdev_port *
1423 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1424 OVS_REQUIRES(dp->port_mutex)
1425 {
1426 struct dp_netdev_port *port;
1427
1428 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1429 if (port->port_no == port_no) {
1430 return port;
1431 }
1432 }
1433 return NULL;
1434 }
1435
1436 static int
1437 get_port_by_number(struct dp_netdev *dp,
1438 odp_port_t port_no, struct dp_netdev_port **portp)
1439 OVS_REQUIRES(dp->port_mutex)
1440 {
1441 if (!is_valid_port_number(port_no)) {
1442 *portp = NULL;
1443 return EINVAL;
1444 } else {
1445 *portp = dp_netdev_lookup_port(dp, port_no);
1446 return *portp ? 0 : ENODEV;
1447 }
1448 }
1449
1450 static void
1451 port_destroy(struct dp_netdev_port *port)
1452 {
1453 if (!port) {
1454 return;
1455 }
1456
1457 netdev_close(port->netdev);
1458 netdev_restore_flags(port->sf);
1459
1460 for (unsigned i = 0; i < port->n_rxq; i++) {
1461 netdev_rxq_close(port->rxqs[i].rx);
1462 }
1463 ovs_mutex_destroy(&port->txq_used_mutex);
1464 free(port->rxq_affinity_list);
1465 free(port->txq_used);
1466 free(port->rxqs);
1467 free(port->type);
1468 free(port);
1469 }
1470
1471 static int
1472 get_port_by_name(struct dp_netdev *dp,
1473 const char *devname, struct dp_netdev_port **portp)
1474 OVS_REQUIRES(dp->port_mutex)
1475 {
1476 struct dp_netdev_port *port;
1477
1478 HMAP_FOR_EACH (port, node, &dp->ports) {
1479 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1480 *portp = port;
1481 return 0;
1482 }
1483 }
1484
1485 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
1486 * nonexistent port. */
1487 return ENODEV;
1488 }
1489
1490 /* Returns 'true' if there is a port with a pmd netdev. */
1491 static bool
1492 has_pmd_port(struct dp_netdev *dp)
1493 OVS_REQUIRES(dp->port_mutex)
1494 {
1495 struct dp_netdev_port *port;
1496
1497 HMAP_FOR_EACH (port, node, &dp->ports) {
1498 if (netdev_is_pmd(port->netdev)) {
1499 return true;
1500 }
1501 }
1502
1503 return false;
1504 }
1505
1506 static void
1507 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1508 OVS_REQUIRES(dp->port_mutex)
1509 {
1510 hmap_remove(&dp->ports, &port->node);
1511 seq_change(dp->port_seq);
1512
1513 reconfigure_datapath(dp);
1514
1515 port_destroy(port);
1516 }
1517
1518 static void
1519 answer_port_query(const struct dp_netdev_port *port,
1520 struct dpif_port *dpif_port)
1521 {
1522 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1523 dpif_port->type = xstrdup(port->type);
1524 dpif_port->port_no = port->port_no;
1525 }
1526
1527 static int
1528 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1529 struct dpif_port *dpif_port)
1530 {
1531 struct dp_netdev *dp = get_dp_netdev(dpif);
1532 struct dp_netdev_port *port;
1533 int error;
1534
1535 ovs_mutex_lock(&dp->port_mutex);
1536 error = get_port_by_number(dp, port_no, &port);
1537 if (!error && dpif_port) {
1538 answer_port_query(port, dpif_port);
1539 }
1540 ovs_mutex_unlock(&dp->port_mutex);
1541
1542 return error;
1543 }
1544
1545 static int
1546 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1547 struct dpif_port *dpif_port)
1548 {
1549 struct dp_netdev *dp = get_dp_netdev(dpif);
1550 struct dp_netdev_port *port;
1551 int error;
1552
1553 ovs_mutex_lock(&dp->port_mutex);
1554 error = get_port_by_name(dp, devname, &port);
1555 if (!error && dpif_port) {
1556 answer_port_query(port, dpif_port);
1557 }
1558 ovs_mutex_unlock(&dp->port_mutex);
1559
1560 return error;
1561 }
1562
1563 static void
1564 dp_netdev_flow_free(struct dp_netdev_flow *flow)
1565 {
1566 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1567 free(flow);
1568 }
1569
1570 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1571 {
1572 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1573 ovsrcu_postpone(dp_netdev_flow_free, flow);
1574 }
1575 }
1576
1577 static uint32_t
1578 dp_netdev_flow_hash(const ovs_u128 *ufid)
1579 {
1580 return ufid->u32[0];
1581 }
1582
1583 static inline struct dpcls *
1584 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1585 odp_port_t in_port)
1586 {
1587 struct dpcls *cls;
1588 uint32_t hash = hash_port_no(in_port);
1589 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1590 if (cls->in_port == in_port) {
1591 /* Port classifier exists already */
1592 return cls;
1593 }
1594 }
1595 return NULL;
1596 }
1597
1598 static inline struct dpcls *
1599 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1600 odp_port_t in_port)
1601 OVS_REQUIRES(pmd->flow_mutex)
1602 {
1603 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1604 uint32_t hash = hash_port_no(in_port);
1605
1606 if (!cls) {
1607 /* Create new classifier for in_port */
1608 cls = xmalloc(sizeof(*cls));
1609 dpcls_init(cls);
1610 cls->in_port = in_port;
1611 cmap_insert(&pmd->classifiers, &cls->node, hash);
1612 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1613 }
1614 return cls;
1615 }
1616
1617 static void
1618 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1619 struct dp_netdev_flow *flow)
1620 OVS_REQUIRES(pmd->flow_mutex)
1621 {
1622 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
1623 struct dpcls *cls;
1624 odp_port_t in_port = flow->flow.in_port.odp_port;
1625
1626 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1627 ovs_assert(cls != NULL);
1628 dpcls_remove(cls, &flow->cr);
1629 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
1630 flow->dead = true;
1631
1632 dp_netdev_flow_unref(flow);
1633 }
1634
1635 static void
1636 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
1637 {
1638 struct dp_netdev_flow *netdev_flow;
1639
1640 ovs_mutex_lock(&pmd->flow_mutex);
1641 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1642 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
1643 }
1644 ovs_mutex_unlock(&pmd->flow_mutex);
1645 }
1646
1647 static int
1648 dpif_netdev_flow_flush(struct dpif *dpif)
1649 {
1650 struct dp_netdev *dp = get_dp_netdev(dpif);
1651 struct dp_netdev_pmd_thread *pmd;
1652
1653 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1654 dp_netdev_pmd_flow_flush(pmd);
1655 }
1656
1657 return 0;
1658 }
1659
1660 struct dp_netdev_port_state {
1661 struct hmap_position position;
1662 char *name;
1663 };
1664
1665 static int
1666 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1667 {
1668 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1669 return 0;
1670 }
1671
1672 static int
1673 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
1674 struct dpif_port *dpif_port)
1675 {
1676 struct dp_netdev_port_state *state = state_;
1677 struct dp_netdev *dp = get_dp_netdev(dpif);
1678 struct hmap_node *node;
1679 int retval;
1680
1681 ovs_mutex_lock(&dp->port_mutex);
1682 node = hmap_at_position(&dp->ports, &state->position);
1683 if (node) {
1684 struct dp_netdev_port *port;
1685
1686 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1687
1688 free(state->name);
1689 state->name = xstrdup(netdev_get_name(port->netdev));
1690 dpif_port->name = state->name;
1691 dpif_port->type = port->type;
1692 dpif_port->port_no = port->port_no;
1693
1694 retval = 0;
1695 } else {
1696 retval = EOF;
1697 }
1698 ovs_mutex_unlock(&dp->port_mutex);
1699
1700 return retval;
1701 }
1702
1703 static int
1704 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
1705 {
1706 struct dp_netdev_port_state *state = state_;
1707 free(state->name);
1708 free(state);
1709 return 0;
1710 }
1711
1712 static int
1713 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
1714 {
1715 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1716 uint64_t new_port_seq;
1717 int error;
1718
1719 new_port_seq = seq_read(dpif->dp->port_seq);
1720 if (dpif->last_port_seq != new_port_seq) {
1721 dpif->last_port_seq = new_port_seq;
1722 error = ENOBUFS;
1723 } else {
1724 error = EAGAIN;
1725 }
1726
1727 return error;
1728 }
1729
1730 static void
1731 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1732 {
1733 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1734
1735 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
1736 }
1737
1738 static struct dp_netdev_flow *
1739 dp_netdev_flow_cast(const struct dpcls_rule *cr)
1740 {
1741 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
1742 }
1743
1744 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1745 {
1746 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1747 }
1748
1749 /* netdev_flow_key utilities.
1750 *
1751 * netdev_flow_key is basically a miniflow. We use these functions
1752 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1753 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1754 *
1755 * - Since we are dealing exclusively with miniflows created by
1756 * miniflow_extract(), if the map is different the miniflow is different.
1757 * Therefore we can be faster by comparing the map and the miniflow in a
1758 * single memcmp().
1759 * - These functions can be inlined by the compiler. */
1760
1761 /* Given the number of bits set in miniflow's maps, returns the size of the
1762 * 'netdev_flow_key.mf' */
1763 static inline size_t
1764 netdev_flow_key_size(size_t flow_u64s)
1765 {
1766 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
1767 }
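
/* Editorial example: a miniflow carrying 3 inline u64 values occupies
 * sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(3) bytes, i.e. the maps
 * plus 3 * 8 bytes of data.  Because the maps are part of that prefix, a
 * single memcmp() over 'len' bytes (as in netdev_flow_key_equal_mf() below)
 * compares both the maps and the values at once. */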
1768
1769 static inline bool
1770 netdev_flow_key_equal(const struct netdev_flow_key *a,
1771 const struct netdev_flow_key *b)
1772 {
1773 /* 'b->len' may not be set yet. */
1774 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
1775 }
1776
1777 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
1778 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
1779 * generated by miniflow_extract. */
1780 static inline bool
1781 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1782 const struct miniflow *mf)
1783 {
1784 return !memcmp(&key->mf, mf, key->len);
1785 }
1786
1787 static inline void
1788 netdev_flow_key_clone(struct netdev_flow_key *dst,
1789 const struct netdev_flow_key *src)
1790 {
1791 memcpy(dst, src,
1792 offsetof(struct netdev_flow_key, mf) + src->len);
1793 }
1794
1795 /* Slow. */
1796 static void
1797 netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1798 const struct flow *src)
1799 {
1800 struct dp_packet packet;
1801 uint64_t buf_stub[512 / 8];
1802
1803 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1804 pkt_metadata_from_flow(&packet.md, src);
1805 flow_compose(&packet, src);
1806 miniflow_extract(&packet, &dst->mf);
1807 dp_packet_uninit(&packet);
1808
1809 dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
1810 dst->hash = 0; /* Not computed yet. */
1811 }
1812
1813 /* Initialize a netdev_flow_key 'mask' from 'match'. */
1814 static inline void
1815 netdev_flow_mask_init(struct netdev_flow_key *mask,
1816 const struct match *match)
1817 {
1818 uint64_t *dst = miniflow_values(&mask->mf);
1819 struct flowmap fmap;
1820 uint32_t hash = 0;
1821 size_t idx;
1822
1823 /* Only check masks that make sense for the flow. */
1824 flow_wc_map(&match->flow, &fmap);
1825 flowmap_init(&mask->mf.map);
1826
1827 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1828 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
1829
1830 if (mask_u64) {
1831 flowmap_set(&mask->mf.map, idx, 1);
1832 *dst++ = mask_u64;
1833 hash = hash_add64(hash, mask_u64);
1834 }
1835 }
1836
1837 map_t map;
1838
1839 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1840 hash = hash_add64(hash, map);
1841 }
1842
1843 size_t n = dst - miniflow_get_values(&mask->mf);
1844
1845 mask->hash = hash_finish(hash, n * 8);
1846 mask->len = netdev_flow_key_size(n);
1847 }
1848
1849 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
1850 static inline void
1851 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1852 const struct flow *flow,
1853 const struct netdev_flow_key *mask)
1854 {
1855 uint64_t *dst_u64 = miniflow_values(&dst->mf);
1856 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
1857 uint32_t hash = 0;
1858 uint64_t value;
1859
1860 dst->len = mask->len;
1861 dst->mf = mask->mf; /* Copy maps. */
1862
1863 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
1864 *dst_u64 = value & *mask_u64++;
1865 hash = hash_add64(hash, *dst_u64++);
1866 }
1867 dst->hash = hash_finish(hash,
1868 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
1869 }
1870
1871 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
1872 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
1873 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
1874
1875 /* Returns a hash value for the bits of 'key' where there are 1-bits in
1876 * 'mask'. */
1877 static inline uint32_t
1878 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1879 const struct netdev_flow_key *mask)
1880 {
1881 const uint64_t *p = miniflow_get_values(&mask->mf);
1882 uint32_t hash = 0;
1883 uint64_t value;
1884
1885 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
1886 hash = hash_add64(hash, value & *p++);
1887 }
1888
1889 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
1890 }
1891
1892 static inline bool
1893 emc_entry_alive(struct emc_entry *ce)
1894 {
1895 return ce->flow && !ce->flow->dead;
1896 }
1897
1898 static void
1899 emc_clear_entry(struct emc_entry *ce)
1900 {
1901 if (ce->flow) {
1902 dp_netdev_flow_unref(ce->flow);
1903 ce->flow = NULL;
1904 }
1905 }
1906
1907 static inline void
1908 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
1909 const struct netdev_flow_key *key)
1910 {
1911 if (ce->flow != flow) {
1912 if (ce->flow) {
1913 dp_netdev_flow_unref(ce->flow);
1914 }
1915
1916 if (dp_netdev_flow_ref(flow)) {
1917 ce->flow = flow;
1918 } else {
1919 ce->flow = NULL;
1920 }
1921 }
1922 if (key) {
1923 netdev_flow_key_clone(&ce->key, key);
1924 }
1925 }
1926
1927 static inline void
1928 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
1929 struct dp_netdev_flow *flow)
1930 {
1931 struct emc_entry *to_be_replaced = NULL;
1932 struct emc_entry *current_entry;
1933
1934 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1935 if (netdev_flow_key_equal(&current_entry->key, key)) {
1936 /* We found the entry with the 'mf' miniflow */
1937 emc_change_entry(current_entry, flow, NULL);
1938 return;
1939 }
1940
1941 /* Replacement policy: put the flow in an empty (not alive) entry, or
1942 * in the first entry where it can be */
1943 if (!to_be_replaced
1944 || (emc_entry_alive(to_be_replaced)
1945 && !emc_entry_alive(current_entry))
1946 || current_entry->key.hash < to_be_replaced->key.hash) {
1947 to_be_replaced = current_entry;
1948 }
1949 }
1950 /* We didn't find the miniflow in the cache.
1951 * The 'to_be_replaced' entry is where the new flow will be stored */
1952
1953 emc_change_entry(to_be_replaced, flow, key);
1954 }
1955
1956 static inline void
1957 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
1958 const struct netdev_flow_key *key,
1959 struct dp_netdev_flow *flow)
1960 {
1961 /* Insert an entry into the EMC based on the probability value 'min'. By
1962 * default the value is UINT32_MAX / 100, which yields an insertion
1963 * probability of 1/100, i.e. 1%. */
1964
1965 uint32_t min;
1966 atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
1967
1968 #ifdef DPDK_NETDEV
1969 if (min && (key->hash ^ (uint32_t) pmd->last_cycles) <= min) {
1970 #else
1971 if (min && (key->hash ^ random_uint32()) <= min) {
1972 #endif
1973 emc_insert(&pmd->flow_cache, key, flow);
1974 }
1975 }
1976
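/* Looks up 'key' in the exact match cache.  Returns the cached flow on a hit
 * (hash match, live entry and full miniflow match), otherwise NULL. */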
1977 static inline struct dp_netdev_flow *
1978 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
1979 {
1980 struct emc_entry *current_entry;
1981
1982 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1983 if (current_entry->key.hash == key->hash
1984 && emc_entry_alive(current_entry)
1985 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
1986
1987 /* We found the entry with the 'key->mf' miniflow */
1988 return current_entry->flow;
1989 }
1990 }
1991
1992 return NULL;
1993 }
1994
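/* Looks up 'key' in the classifier that 'pmd' keeps for the key's in_port.
 * Returns the matching flow, or NULL if there is no match. */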
1995 static struct dp_netdev_flow *
1996 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
1997 const struct netdev_flow_key *key,
1998 int *lookup_num_p)
1999 {
2000 struct dpcls *cls;
2001 struct dpcls_rule *rule;
2002 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, in_port));
2003 struct dp_netdev_flow *netdev_flow = NULL;
2004
2005 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2006 if (OVS_LIKELY(cls)) {
2007 dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
2008 netdev_flow = dp_netdev_flow_cast(rule);
2009 }
2010 return netdev_flow;
2011 }
2012
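/* Finds a flow on 'pmd' by UFID, computing the UFID from the netlink 'key'
 * if 'ufidp' is null.  Returns NULL if no flow matches. */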
2013 static struct dp_netdev_flow *
2014 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2015 const ovs_u128 *ufidp, const struct nlattr *key,
2016 size_t key_len)
2017 {
2018 struct dp_netdev_flow *netdev_flow;
2019 struct flow flow;
2020 ovs_u128 ufid;
2021
2022 /* If a UFID is not provided, determine one based on the key. */
2023 if (!ufidp && key && key_len
2024 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
2025 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2026 ufidp = &ufid;
2027 }
2028
2029 if (ufidp) {
2030 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2031 &pmd->flow_table) {
2032 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2033 return netdev_flow;
2034 }
2035 }
2036 }
2037
2038 return NULL;
2039 }
2040
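/* Reads the statistics of 'netdev_flow_' into 'stats'. */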
2041 static void
2042 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
2043 struct dpif_flow_stats *stats)
2044 {
2045 struct dp_netdev_flow *netdev_flow;
2046 unsigned long long n;
2047 long long used;
2048 uint16_t flags;
2049
2050 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2051
2052 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2053 stats->n_packets = n;
2054 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2055 stats->n_bytes = n;
2056 atomic_read_relaxed(&netdev_flow->stats.used, &used);
2057 stats->used = used;
2058 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2059 stats->tcp_flags = flags;
2060 }
2061
2062 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2063 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2064 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2065 * protect them. */
2066 static void
2067 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
2068 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2069 struct dpif_flow *flow, bool terse)
2070 {
2071 if (terse) {
2072 memset(flow, 0, sizeof *flow);
2073 } else {
2074 struct flow_wildcards wc;
2075 struct dp_netdev_actions *actions;
2076 size_t offset;
2077 struct odp_flow_key_parms odp_parms = {
2078 .flow = &netdev_flow->flow,
2079 .mask = &wc.masks,
2080 .support = dp_netdev_support,
2081 };
2082
2083 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2084 /* in_port is exact-matched, but we have left it out of the mask for
2085 * optimization reasons. Add in_port back to the mask. */
2086 wc.masks.in_port.odp_port = ODPP_NONE;
2087
2088 /* Key */
2089 offset = key_buf->size;
2090 flow->key = ofpbuf_tail(key_buf);
2091 odp_flow_key_from_flow(&odp_parms, key_buf);
2092 flow->key_len = key_buf->size - offset;
2093
2094 /* Mask */
2095 offset = mask_buf->size;
2096 flow->mask = ofpbuf_tail(mask_buf);
2097 odp_parms.key_buf = key_buf;
2098 odp_flow_key_from_mask(&odp_parms, mask_buf);
2099 flow->mask_len = mask_buf->size - offset;
2100
2101 /* Actions */
2102 actions = dp_netdev_flow_get_actions(netdev_flow);
2103 flow->actions = actions->actions;
2104 flow->actions_len = actions->size;
2105 }
2106
2107 flow->ufid = netdev_flow->ufid;
2108 flow->ufid_present = true;
2109 flow->pmd_id = netdev_flow->pmd_id;
2110 get_dpif_flow_stats(netdev_flow, &flow->stats);
2111 }
2112
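/* Parses the netlink-formatted mask in 'mask_key' into 'wc', using 'flow' for
 * context.  Returns 0 on success, EINVAL if the mask cannot be parsed. */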
2113 static int
2114 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2115 const struct nlattr *mask_key,
2116 uint32_t mask_key_len, const struct flow *flow,
2117 struct flow_wildcards *wc)
2118 {
2119 enum odp_key_fitness fitness;
2120
2121 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
2122 if (fitness) {
2123 /* This should not happen: it indicates that
2124 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2125 * disagree on the acceptable form of a mask. Log the problem
2126 * as an error, with enough details to enable debugging. */
2127 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2128
2129 if (!VLOG_DROP_ERR(&rl)) {
2130 struct ds s;
2131
2132 ds_init(&s);
2133 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2134 true);
2135 VLOG_ERR("internal error parsing flow mask %s (%s)",
2136 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2137 ds_destroy(&s);
2138 }
2139
2140 return EINVAL;
2141 }
2142
2143 return 0;
2144 }
2145
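/* Parses the netlink-formatted key in 'key' into 'flow'.  Returns 0 on
 * success, EINVAL on a malformed key, an invalid in_port, or unsupported
 * ct_state bits. */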
2146 static int
2147 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2148 struct flow *flow)
2149 {
2150 odp_port_t in_port;
2151
2152 if (odp_flow_key_to_flow(key, key_len, flow)) {
2153 /* This should not happen: it indicates that odp_flow_key_from_flow()
2154 * and odp_flow_key_to_flow() disagree on the acceptable form of a
2155 * flow. Log the problem as an error, with enough details to enable
2156 * debugging. */
2157 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2158
2159 if (!VLOG_DROP_ERR(&rl)) {
2160 struct ds s;
2161
2162 ds_init(&s);
2163 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2164 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2165 ds_destroy(&s);
2166 }
2167
2168 return EINVAL;
2169 }
2170
2171 in_port = flow->in_port.odp_port;
2172 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
2173 return EINVAL;
2174 }
2175
2176 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2177 return EINVAL;
2178 }
2179
2180 return 0;
2181 }
2182
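/* dpif 'flow_get' implementation: searches the pmd threads (all of them if
 * get->pmd_id is PMD_ID_NULL) for the requested flow and converts it to the
 * dpif_flow format. */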
2183 static int
2184 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2185 {
2186 struct dp_netdev *dp = get_dp_netdev(dpif);
2187 struct dp_netdev_flow *netdev_flow;
2188 struct dp_netdev_pmd_thread *pmd;
2189 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2190 struct hmapx_node *node;
2191 int error = EINVAL;
2192
2193 if (get->pmd_id == PMD_ID_NULL) {
2194 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2195 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2196 dp_netdev_pmd_unref(pmd);
2197 }
2198 }
2199 } else {
2200 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2201 if (!pmd) {
2202 goto out;
2203 }
2204 hmapx_add(&to_find, pmd);
2205 }
2206
2207 if (!hmapx_count(&to_find)) {
2208 goto out;
2209 }
2210
2211 HMAPX_FOR_EACH (node, &to_find) {
2212 pmd = (struct dp_netdev_pmd_thread *) node->data;
2213 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2214 get->key_len);
2215 if (netdev_flow) {
2216 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2217 get->flow, false);
2218 error = 0;
2219 break;
2220 } else {
2221 error = ENOENT;
2222 }
2223 }
2224
2225 HMAPX_FOR_EACH (node, &to_find) {
2226 pmd = (struct dp_netdev_pmd_thread *) node->data;
2227 dp_netdev_pmd_unref(pmd);
2228 }
2229 out:
2230 hmapx_destroy(&to_find);
2231 return error;
2232 }
2233
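/* Adds a new flow to 'pmd', built from 'match', 'ufid' and 'actions', and
 * inserts it into both the per-in_port classifier and the pmd's flow table. */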
2234 static struct dp_netdev_flow *
2235 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2236 struct match *match, const ovs_u128 *ufid,
2237 const struct nlattr *actions, size_t actions_len)
2238 OVS_REQUIRES(pmd->flow_mutex)
2239 {
2240 struct dp_netdev_flow *flow;
2241 struct netdev_flow_key mask;
2242 struct dpcls *cls;
2243
2244 /* Make sure in_port is exact matched before we read it. */
2245 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
2246 odp_port_t in_port = match->flow.in_port.odp_port;
2247
2248 /* As we select the dpcls based on the port number, each netdev flow
2249 * belonging to the same dpcls will have the same odp_port value.
2250 * For performance reasons we wildcard odp_port here in the mask. In the
2251 * typical case dp_hash is also wildcarded, and the resulting 8-byte
2252 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
2253 * will not be part of the subtable mask.
2254 * This will speed up the hash computation during dpcls_lookup() because
2255 * there is one less call to hash_add64() in this case. */
2256 match->wc.masks.in_port.odp_port = 0;
2257 netdev_flow_mask_init(&mask, match);
2258 match->wc.masks.in_port.odp_port = ODPP_NONE;
2259
2260 /* Make sure wc does not have metadata. */
2261 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2262 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2263
2264 /* Do not allocate extra space. */
2265 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2266 memset(&flow->stats, 0, sizeof flow->stats);
2267 flow->dead = false;
2268 flow->batch = NULL;
2269 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2270 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2271 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2272 ovs_refcount_init(&flow->ref_cnt);
2273 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2274
2275 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2276
2277 /* Select dpcls for in_port. Relies on in_port to be exact match. */
2278 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
2279 dpcls_insert(cls, &flow->cr, &mask);
2280
2281 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2282 dp_netdev_flow_hash(&flow->ufid));
2283
2284 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
2285 struct ds ds = DS_EMPTY_INITIALIZER;
2286 struct ofpbuf key_buf, mask_buf;
2287 struct odp_flow_key_parms odp_parms = {
2288 .flow = &match->flow,
2289 .mask = &match->wc.masks,
2290 .support = dp_netdev_support,
2291 };
2292
2293 ofpbuf_init(&key_buf, 0);
2294 ofpbuf_init(&mask_buf, 0);
2295
2296 odp_flow_key_from_flow(&odp_parms, &key_buf);
2297 odp_parms.key_buf = &key_buf;
2298 odp_flow_key_from_mask(&odp_parms, &mask_buf);
2299
2300 ds_put_cstr(&ds, "flow_add: ");
2301 odp_format_ufid(ufid, &ds);
2302 ds_put_cstr(&ds, " ");
2303 odp_flow_format(key_buf.data, key_buf.size,
2304 mask_buf.data, mask_buf.size,
2305 NULL, &ds, false);
2306 ds_put_cstr(&ds, ", actions:");
2307 format_odp_actions(&ds, actions, actions_len);
2308
2309 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2310
2311 ofpbuf_uninit(&key_buf);
2312 ofpbuf_uninit(&mask_buf);
2313 ds_destroy(&ds);
2314 }
2315
2316 return flow;
2317 }
2318
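/* Creates or modifies a flow on a single pmd thread according to 'put',
 * filling in 'stats' for a modified flow if requested. */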
2319 static int
2320 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
2321 struct netdev_flow_key *key,
2322 struct match *match,
2323 ovs_u128 *ufid,
2324 const struct dpif_flow_put *put,
2325 struct dpif_flow_stats *stats)
2326 {
2327 struct dp_netdev_flow *netdev_flow;
2328 int error = 0;
2329
2330 if (stats) {
2331 memset(stats, 0, sizeof *stats);
2332 }
2333
2334 ovs_mutex_lock(&pmd->flow_mutex);
2335 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
2336 if (!netdev_flow) {
2337 if (put->flags & DPIF_FP_CREATE) {
2338 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2339 dp_netdev_flow_add(pmd, match, ufid, put->actions,
2340 put->actions_len);
2341 error = 0;
2342 } else {
2343 error = EFBIG;
2344 }
2345 } else {
2346 error = ENOENT;
2347 }
2348 } else {
2349 if (put->flags & DPIF_FP_MODIFY
2350 && flow_equal(&match->flow, &netdev_flow->flow)) {
2351 struct dp_netdev_actions *new_actions;
2352 struct dp_netdev_actions *old_actions;
2353
2354 new_actions = dp_netdev_actions_create(put->actions,
2355 put->actions_len);
2356
2357 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2358 ovsrcu_set(&netdev_flow->actions, new_actions);
2359
2360 if (stats) {
2361 get_dpif_flow_stats(netdev_flow, stats);
2362 }
2363 if (put->flags & DPIF_FP_ZERO_STATS) {
2364 /* XXX: The userspace datapath uses thread-local statistics
2365 * (for flows), which should be updated only by the owning
2366 * thread. Since we cannot write to the stats memory here,
2367 * we choose not to support this flag. Please note:
2368 * - This feature is currently used only by dpctl commands with
2369 * option --clear.
2370 * - Should the need arise, this operation can be implemented
2371 * by keeping a base value (to be updated here) for each
2372 * counter, and subtracting it before outputting the stats. */
2373 error = EOPNOTSUPP;
2374 }
2375
2376 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2377 } else if (put->flags & DPIF_FP_CREATE) {
2378 error = EEXIST;
2379 } else {
2380 /* Overlapping flow. */
2381 error = EINVAL;
2382 }
2383 }
2384 ovs_mutex_unlock(&pmd->flow_mutex);
2385 return error;
2386 }
2387
2388 static int
2389 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2390 {
2391 struct dp_netdev *dp = get_dp_netdev(dpif);
2392 struct netdev_flow_key key;
2393 struct dp_netdev_pmd_thread *pmd;
2394 struct match match;
2395 ovs_u128 ufid;
2396 int error;
2397
2398 if (put->stats) {
2399 memset(put->stats, 0, sizeof *put->stats);
2400 }
2401 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
2402 if (error) {
2403 return error;
2404 }
2405 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2406 put->mask, put->mask_len,
2407 &match.flow, &match.wc);
2408 if (error) {
2409 return error;
2410 }
2411
2412 if (put->ufid) {
2413 ufid = *put->ufid;
2414 } else {
2415 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2416 }
2417
2418 /* Must produce a netdev_flow_key for lookup.
2419 * This interface is no longer performance critical, since it is not used
2420 * for upcall processing any more. */
2421 netdev_flow_key_from_flow(&key, &match.flow);
2422
2423 if (put->pmd_id == PMD_ID_NULL) {
2424 if (cmap_count(&dp->poll_threads) == 0) {
2425 return EINVAL;
2426 }
2427 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2428 struct dpif_flow_stats pmd_stats;
2429 int pmd_error;
2430
2431 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
2432 &pmd_stats);
2433 if (pmd_error) {
2434 error = pmd_error;
2435 } else if (put->stats) {
2436 put->stats->n_packets += pmd_stats.n_packets;
2437 put->stats->n_bytes += pmd_stats.n_bytes;
2438 put->stats->used = MAX(put->stats->used, pmd_stats.used);
2439 put->stats->tcp_flags |= pmd_stats.tcp_flags;
2440 }
2441 }
2442 } else {
2443 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
2444 if (!pmd) {
2445 return EINVAL;
2446 }
2447 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
2448 dp_netdev_pmd_unref(pmd);
2449 }
2450
2451 return error;
2452 }
2453
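/* Deletes the flow identified by 'del' from a single pmd thread, returning
 * its final statistics in 'stats' if nonnull. */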
2454 static int
2455 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
2456 struct dpif_flow_stats *stats,
2457 const struct dpif_flow_del *del)
2458 {
2459 struct dp_netdev_flow *netdev_flow;
2460 int error = 0;
2461
2462 ovs_mutex_lock(&pmd->flow_mutex);
2463 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2464 del->key_len);
2465 if (netdev_flow) {
2466 if (stats) {
2467 get_dpif_flow_stats(netdev_flow, stats);
2468 }
2469 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2470 } else {
2471 error = ENOENT;
2472 }
2473 ovs_mutex_unlock(&pmd->flow_mutex);
2474
2475 return error;
2476 }
2477
2478 static int
2479 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
2480 {
2481 struct dp_netdev *dp = get_dp_netdev(dpif);
2482 struct dp_netdev_pmd_thread *pmd;
2483 int error = 0;
2484
2485 if (del->stats) {
2486 memset(del->stats, 0, sizeof *del->stats);
2487 }
2488
2489 if (del->pmd_id == PMD_ID_NULL) {
2490 if (cmap_count(&dp->poll_threads) == 0) {
2491 return EINVAL;
2492 }
2493 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2494 struct dpif_flow_stats pmd_stats;
2495 int pmd_error;
2496
2497 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
2498 if (pmd_error) {
2499 error = pmd_error;
2500 } else if (del->stats) {
2501 del->stats->n_packets += pmd_stats.n_packets;
2502 del->stats->n_bytes += pmd_stats.n_bytes;
2503 del->stats->used = MAX(del->stats->used, pmd_stats.used);
2504 del->stats->tcp_flags |= pmd_stats.tcp_flags;
2505 }
2506 }
2507 } else {
2508 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
2509 if (!pmd) {
2510 return EINVAL;
2511 }
2512 error = flow_del_on_pmd(pmd, del->stats, del);
2513 dp_netdev_pmd_unref(pmd);
2514 }
2515
2516
2517 return error;
2518 }
2519
2520 struct dpif_netdev_flow_dump {
2521 struct dpif_flow_dump up;
2522 struct cmap_position poll_thread_pos;
2523 struct cmap_position flow_pos;
2524 struct dp_netdev_pmd_thread *cur_pmd;
2525 int status;
2526 struct ovs_mutex mutex;
2527 };
2528
2529 static struct dpif_netdev_flow_dump *
2530 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
2531 {
2532 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
2533 }
2534
2535 static struct dpif_flow_dump *
2536 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
2537 {
2538 struct dpif_netdev_flow_dump *dump;
2539
2540 dump = xzalloc(sizeof *dump);
2541 dpif_flow_dump_init(&dump->up, dpif_);
2542 dump->up.terse = terse;
2543 ovs_mutex_init(&dump->mutex);
2544
2545 return &dump->up;
2546 }
2547
2548 static int
2549 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
2550 {
2551 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2552
2553 ovs_mutex_destroy(&dump->mutex);
2554 free(dump);
2555 return 0;
2556 }
2557
2558 struct dpif_netdev_flow_dump_thread {
2559 struct dpif_flow_dump_thread up;
2560 struct dpif_netdev_flow_dump *dump;
2561 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2562 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
2563 };
2564
2565 static struct dpif_netdev_flow_dump_thread *
2566 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2567 {
2568 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2569 }
2570
2571 static struct dpif_flow_dump_thread *
2572 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2573 {
2574 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2575 struct dpif_netdev_flow_dump_thread *thread;
2576
2577 thread = xmalloc(sizeof *thread);
2578 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2579 thread->dump = dump;
2580 return &thread->up;
2581 }
2582
2583 static void
2584 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2585 {
2586 struct dpif_netdev_flow_dump_thread *thread
2587 = dpif_netdev_flow_dump_thread_cast(thread_);
2588
2589 free(thread);
2590 }
2591
2592 static int
2593 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
2594 struct dpif_flow *flows, int max_flows)
2595 {
2596 struct dpif_netdev_flow_dump_thread *thread
2597 = dpif_netdev_flow_dump_thread_cast(thread_);
2598 struct dpif_netdev_flow_dump *dump = thread->dump;
2599 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
2600 int n_flows = 0;
2601 int i;
2602
2603 ovs_mutex_lock(&dump->mutex);
2604 if (!dump->status) {
2605 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2606 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2607 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2608 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2609
2610 /* The first call to dump_next() extracts the first pmd thread.
2611 * If there is no pmd thread, returns immediately. */
2612 if (!pmd) {
2613 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2614 if (!pmd) {
2615 ovs_mutex_unlock(&dump->mutex);
2616 return n_flows;
2617
2618 }
2619 }
2620
2621 do {
2622 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2623 struct cmap_node *node;
2624
2625 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2626 if (!node) {
2627 break;
2628 }
2629 netdev_flows[n_flows] = CONTAINER_OF(node,
2630 struct dp_netdev_flow,
2631 node);
2632 }
2633 /* When finished dumping the current pmd thread, moves on to
2634 * the next one. */
2635 if (n_flows < flow_limit) {
2636 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2637 dp_netdev_pmd_unref(pmd);
2638 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2639 if (!pmd) {
2640 dump->status = EOF;
2641 break;
2642 }
2643 }
2644 /* Keeps the reference for the next caller. */
2645 dump->cur_pmd = pmd;
2646
2647 /* If the current dump is empty, do not exit the loop, since the
2648 * remaining pmds could have flows to be dumped. Just dump again
2649 * on the new 'pmd'. */
2650 } while (!n_flows);
2651 }
2652 ovs_mutex_unlock(&dump->mutex);
2653
2654 for (i = 0; i < n_flows; i++) {
2655 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2656 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2657 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2658 struct dpif_flow *f = &flows[i];
2659 struct ofpbuf key, mask;
2660
2661 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2662 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
2663 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2664 dump->up.terse);
2665 }
2666
2667 return n_flows;
2668 }
2669
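/* dpif 'execute' implementation: runs 'execute->actions' on
 * 'execute->packet' in the context of the appropriate pmd thread. */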
2670 static int
2671 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
2672 OVS_NO_THREAD_SAFETY_ANALYSIS
2673 {
2674 struct dp_netdev *dp = get_dp_netdev(dpif);
2675 struct dp_netdev_pmd_thread *pmd;
2676 struct dp_packet_batch pp;
2677
2678 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2679 dp_packet_size(execute->packet) > UINT16_MAX) {
2680 return EINVAL;
2681 }
2682
2683 /* Tries finding the 'pmd'. If NULL is returned, that means
2684 * the current thread is a non-pmd thread and should use
2685 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
2686 pmd = ovsthread_getspecific(dp->per_pmd_key);
2687 if (!pmd) {
2688 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
2689 if (!pmd) {
2690 return EBUSY;
2691 }
2692 }
2693
2694 if (execute->probe) {
2695 /* If this is part of a probe, drop the packet, since executing
2696 * the actions may actually cause spurious packets to be sent into
2697 * the network. */
2698 return 0;
2699 }
2700
2701 /* If the current thread is a non-pmd thread, acquires
2702 * the 'non_pmd_mutex'. */
2703 if (pmd->core_id == NON_PMD_CORE_ID) {
2704 ovs_mutex_lock(&dp->non_pmd_mutex);
2705 }
2706
2707 /* The action processing expects the RSS hash to be valid, because
2708 * it's always initialized at the beginning of datapath processing.
2709 * In this case, though, 'execute->packet' may not have gone through
2710 * the datapath at all, it may have been generated by the upper layer
2711 * (OpenFlow packet-out, BFD frame, ...). */
2712 if (!dp_packet_rss_valid(execute->packet)) {
2713 dp_packet_set_rss_hash(execute->packet,
2714 flow_hash_5tuple(execute->flow, 0));
2715 }
2716
2717 dp_packet_batch_init_packet(&pp, execute->packet);
2718 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
2719 execute->actions, execute->actions_len,
2720 time_msec());
2721
2722 if (pmd->core_id == NON_PMD_CORE_ID) {
2723 ovs_mutex_unlock(&dp->non_pmd_mutex);
2724 dp_netdev_pmd_unref(pmd);
2725 }
2726
2727 return 0;
2728 }
2729
2730 static void
2731 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2732 {
2733 size_t i;
2734
2735 for (i = 0; i < n_ops; i++) {
2736 struct dpif_op *op = ops[i];
2737
2738 switch (op->type) {
2739 case DPIF_OP_FLOW_PUT:
2740 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2741 break;
2742
2743 case DPIF_OP_FLOW_DEL:
2744 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2745 break;
2746
2747 case DPIF_OP_EXECUTE:
2748 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2749 break;
2750
2751 case DPIF_OP_FLOW_GET:
2752 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2753 break;
2754 }
2755 }
2756 }
2757
2758 /* Applies datapath configuration from the database. Some of the changes are
2759 * actually applied in dpif_netdev_run(). */
2760 static int
2761 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
2762 {
2763 struct dp_netdev *dp = get_dp_netdev(dpif);
2764 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
2765 unsigned long long insert_prob =
2766 smap_get_ullong(other_config, "emc-insert-inv-prob",
2767 DEFAULT_EM_FLOW_INSERT_INV_PROB);
2768 uint32_t insert_min, cur_min;
2769
2770 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
2771 free(dp->pmd_cmask);
2772 dp->pmd_cmask = nullable_xstrdup(cmask);
2773 dp_netdev_request_reconfigure(dp);
2774 }
2775
2776 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
2777 if (insert_prob <= UINT32_MAX) {
2778 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
2779 } else {
2780 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
2781 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
2782 }
2783
2784 if (insert_min != cur_min) {
2785 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
2786 if (insert_min == 0) {
2787 VLOG_INFO("EMC has been disabled");
2788 } else {
2789 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
2790 insert_prob, (100 / (float)insert_prob));
2791 }
2792 }
2793
2794 return 0;
2795 }
2796
2797 /* Parses affinity list and returns result in 'core_ids'. */
2798 static int
2799 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
2800 {
2801 unsigned i;
2802 char *list, *copy, *key, *value;
2803 int error = 0;
2804
2805 for (i = 0; i < n_rxq; i++) {
2806 core_ids[i] = OVS_CORE_UNSPEC;
2807 }
2808
2809 if (!affinity_list) {
2810 return 0;
2811 }
2812
2813 list = copy = xstrdup(affinity_list);
2814
2815 while (ofputil_parse_key_value(&list, &key, &value)) {
2816 int rxq_id, core_id;
2817
2818 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
2819 || !str_to_int(value, 0, &core_id) || core_id < 0) {
2820 error = EINVAL;
2821 break;
2822 }
2823
2824 if (rxq_id < n_rxq) {
2825 core_ids[rxq_id] = core_id;
2826 }
2827 }
2828
2829 free(copy);
2830 return error;
2831 }
2832
2833 /* Parses 'affinity_list' and applies configuration if it is valid. */
2834 static int
2835 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
2836 const char *affinity_list)
2837 {
2838 unsigned *core_ids, i;
2839 int error = 0;
2840
2841 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
2842 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
2843 error = EINVAL;
2844 goto exit;
2845 }
2846
2847 for (i = 0; i < port->n_rxq; i++) {
2848 port->rxqs[i].core_id = core_ids[i];
2849 }
2850
2851 exit:
2852 free(core_ids);
2853 return error;
2854 }
2855
2856 /* Changes the affinity of port's rx queues. The changes are actually applied
2857 * in dpif_netdev_run(). */
2858 static int
2859 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
2860 const struct smap *cfg)
2861 {
2862 struct dp_netdev *dp = get_dp_netdev(dpif);
2863 struct dp_netdev_port *port;
2864 int error = 0;
2865 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
2866
2867 ovs_mutex_lock(&dp->port_mutex);
2868 error = get_port_by_number(dp, port_no, &port);
2869 if (error || !netdev_is_pmd(port->netdev)
2870 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
2871 goto unlock;
2872 }
2873
2874 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
2875 if (error) {
2876 goto unlock;
2877 }
2878 free(port->rxq_affinity_list);
2879 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
2880
2881 dp_netdev_request_reconfigure(dp);
2882 unlock:
2883 ovs_mutex_unlock(&dp->port_mutex);
2884 return error;
2885 }
2886
2887 static int
2888 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2889 uint32_t queue_id, uint32_t *priority)
2890 {
2891 *priority = queue_id;
2892 return 0;
2893 }
2894
2895 \f
2896 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2897 * a copy of the 'size' bytes of 'actions'. */
2898 struct dp_netdev_actions *
2899 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2900 {
2901 struct dp_netdev_actions *netdev_actions;
2902
2903 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2904 memcpy(netdev_actions->actions, actions, size);
2905 netdev_actions->size = size;
2906
2907 return netdev_actions;
2908 }
2909
2910 struct dp_netdev_actions *
2911 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
2912 {
2913 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
2914 }
2915
2916 static void
2917 dp_netdev_actions_free(struct dp_netdev_actions *actions)
2918 {
2919 free(actions);
2920 }
2921 \f
2922 static inline unsigned long long
2923 cycles_counter(void)
2924 {
2925 #ifdef DPDK_NETDEV
2926 return rte_get_tsc_cycles();
2927 #else
2928 return 0;
2929 #endif
2930 }
2931
2932 /* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2933 extern struct ovs_mutex cycles_counter_fake_mutex;
2934
2935 /* Start counting cycles. Must be followed by 'cycles_count_end()' */
2936 static inline void
2937 cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2938 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2939 OVS_NO_THREAD_SAFETY_ANALYSIS
2940 {
2941 pmd->last_cycles = cycles_counter();
2942 }
2943
2944 /* Stop counting cycles and add them to the counter 'type' */
2945 static inline void
2946 cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2947 enum pmd_cycles_counter_type type)
2948 OVS_RELEASES(&cycles_counter_fake_mutex)
2949 OVS_NO_THREAD_SAFETY_ANALYSIS
2950 {
2951 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2952
2953 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2954 }
2955
2956 static void
2957 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
2958 struct netdev_rxq *rx,
2959 odp_port_t port_no)
2960 {
2961 struct dp_packet_batch batch;
2962 int error;
2963
2964 dp_packet_batch_init(&batch);
2965 cycles_count_start(pmd);
2966 error = netdev_rxq_recv(rx, &batch);
2967 cycles_count_end(pmd, PMD_CYCLES_POLLING);
2968 if (!error) {
2969 *recirc_depth_get() = 0;
2970
2971 cycles_count_start(pmd);
2972 dp_netdev_input(pmd, &batch, port_no);
2973 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
2974 } else if (error != EAGAIN && error != EOPNOTSUPP) {
2975 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2976
2977 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
2978 netdev_rxq_get_name(rx), ovs_strerror(error));
2979 }
2980 }
2981
2982 static struct tx_port *
2983 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
2984 {
2985 struct tx_port *tx;
2986
2987 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
2988 if (tx->port->port_no == port_no) {
2989 return tx;
2990 }
2991 }
2992
2993 return NULL;
2994 }
2995
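/* Applies any pending netdev configuration changes to 'port' and reopens its
 * rx queues. */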
2996 static int
2997 port_reconfigure(struct dp_netdev_port *port)
2998 {
2999 struct netdev *netdev = port->netdev;
3000 int i, err;
3001
3002 port->need_reconfigure = false;
3003
3004 /* Closes the existing 'rxq's. */
3005 for (i = 0; i < port->n_rxq; i++) {
3006 netdev_rxq_close(port->rxqs[i].rx);
3007 port->rxqs[i].rx = NULL;
3008 }
3009 port->n_rxq = 0;
3010
3011 /* Allows 'netdev' to apply the pending configuration changes. */
3012 if (netdev_is_reconf_required(netdev)) {
3013 err = netdev_reconfigure(netdev);
3014 if (err && (err != EOPNOTSUPP)) {
3015 VLOG_ERR("Failed to set interface %s new configuration",
3016 netdev_get_name(netdev));
3017 return err;
3018 }
3019 }
3020 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3021 port->rxqs = xrealloc(port->rxqs,
3022 sizeof *port->rxqs * netdev_n_rxq(netdev));
3023 /* Realloc 'used' counters for tx queues. */
3024 free(port->txq_used);
3025 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
3026
3027 for (i = 0; i < netdev_n_rxq(netdev); i++) {
3028 port->rxqs[i].port = port;
3029 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
3030 if (err) {
3031 return err;
3032 }
3033 port->n_rxq++;
3034 }
3035
3036 /* Parse affinity list to apply configuration for new queues. */
3037 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
3038
3039 return 0;
3040 }
3041
3042 struct rr_numa_list {
3043 struct hmap numas; /* Contains 'struct rr_numa' */
3044 };
3045
3046 struct rr_numa {
3047 struct hmap_node node;
3048
3049 int numa_id;
3050
3051 /* Non-isolated pmds on numa node 'numa_id'. */
3052 struct dp_netdev_pmd_thread **pmds;
3053 int n_pmds;
3054
3055 int cur_index;
3056 };
3057
3058 static struct rr_numa *
3059 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
3060 {
3061 struct rr_numa *numa;
3062
3063 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
3064 if (numa->numa_id == numa_id) {
3065 return numa;
3066 }
3067 }
3068
3069 return NULL;
3070 }
3071
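/* Builds in 'rr' a per-numa-node list of the non-isolated pmd threads in
 * 'dp', for round-robin rxq assignment. */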
3072 static void
3073 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
3074 {
3075 struct dp_netdev_pmd_thread *pmd;
3076 struct rr_numa *numa;
3077
3078 hmap_init(&rr->numas);
3079
3080 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3081 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3082 continue;
3083 }
3084
3085 numa = rr_numa_list_lookup(rr, pmd->numa_id);
3086 if (!numa) {
3087 numa = xzalloc(sizeof *numa);
3088 numa->numa_id = pmd->numa_id;
3089 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
3090 }
3091 numa->n_pmds++;
3092 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
3093 numa->pmds[numa->n_pmds - 1] = pmd;
3094 }
3095 }
3096
3097 static struct dp_netdev_pmd_thread *
3098 rr_numa_get_pmd(struct rr_numa *numa)
3099 {
3100 return numa->pmds[numa->cur_index++ % numa->n_pmds];
3101 }
3102
3103 static void
3104 rr_numa_list_destroy(struct rr_numa_list *rr)
3105 {
3106 struct rr_numa *numa;
3107
3108 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
3109 free(numa->pmds);
3110 free(numa);
3111 }
3112 hmap_destroy(&rr->numas);
3113 }
3114
3115 /* Assigns pmds to queues. If 'pinned' is true, assigns pmds to pinned
3116 * queues and marks the pmds as isolated. Otherwise, assigns non-isolated
3117 * pmds to unpinned queues.
3118 *
3119 * The function doesn't touch the pmd threads, it just stores the assignment
3120 * in the 'pmd' member of each rxq. */
3121 static void
3122 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
3123 {
3124 struct dp_netdev_port *port;
3125 struct rr_numa_list rr;
3126
3127 rr_numa_list_populate(dp, &rr);
3128
3129 HMAP_FOR_EACH (port, node, &dp->ports) {
3130 struct rr_numa *numa;
3131 int numa_id;
3132
3133 if (!netdev_is_pmd(port->netdev)) {
3134 continue;
3135 }
3136
3137 numa_id = netdev_get_numa_id(port->netdev);
3138 numa = rr_numa_list_lookup(&rr, numa_id);
3139
3140 for (int qid = 0; qid < port->n_rxq; qid++) {
3141 struct dp_netdev_rxq *q = &port->rxqs[qid];
3142
3143 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
3144 struct dp_netdev_pmd_thread *pmd;
3145
3146 pmd = dp_netdev_get_pmd(dp, q->core_id);
3147 if (!pmd) {
3148 VLOG_WARN("There is no PMD thread on core %d. Queue "
3149 "%d on port \'%s\' will not be polled.",
3150 q->core_id, qid, netdev_get_name(port->netdev));
3151 } else {
3152 q->pmd = pmd;
3153 pmd->isolated = true;
3154 dp_netdev_pmd_unref(pmd);
3155 }
3156 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
3157 if (!numa) {
3158 VLOG_WARN("There's no available (non isolated) pmd thread "
3159 "on numa node %d. Queue %d on port \'%s\' will "
3160 "not be polled.",
3161 numa_id, qid, netdev_get_name(port->netdev));
3162 } else {
3163 q->pmd = rr_numa_get_pmd(numa);
3164 }
3165 }
3166 }
3167 }
3168
3169 rr_numa_list_destroy(&rr);
3170 }
3171
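/* Creates or destroys pmd threads so that the set of running threads matches
 * the datapath ports and the configured pmd-cpu-mask. */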
3172 static void
3173 reconfigure_pmd_threads(struct dp_netdev *dp)
3174 OVS_REQUIRES(dp->port_mutex)
3175 {
3176 struct dp_netdev_pmd_thread *pmd;
3177 struct ovs_numa_dump *pmd_cores;
3178 bool changed = false;
3179
3180 /* The pmd threads should be started only if there's a pmd port in the
3181 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
3182 * NR_PMD_THREADS per numa node. */
3183 if (!has_pmd_port(dp)) {
3184 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
3185 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
3186 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
3187 } else {
3188 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
3189 }
3190
3191 /* Check for changed configuration */
3192 if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
3193 changed = true;
3194 } else {
3195 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3196 if (pmd->core_id != NON_PMD_CORE_ID
3197 && !ovs_numa_dump_contains_core(pmd_cores,
3198 pmd->numa_id,
3199 pmd->core_id)) {
3200 changed = true;
3201 break;
3202 }
3203 }
3204 }
3205
3206 /* Destroy the old and recreate the new pmd threads. We don't perform an
3207 * incremental update because we would have to adjust 'static_tx_qid'. */
3208 if (changed) {
3209 struct ovs_numa_info_core *core;
3210 struct ovs_numa_info_numa *numa;
3211
3212 /* Do not destroy the non pmd thread. */
3213 dp_netdev_destroy_all_pmds(dp, false);
3214 FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
3215 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
3216
3217 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
3218
3219 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
3220 }
3221
3222 /* Log the number of pmd threads per numa node. */
3223 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
3224 VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
3225 numa->n_cores, numa->numa_id);
3226 }
3227 }
3228
3229 ovs_numa_dump_destroy(pmd_cores);
3230 }
3231
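/* Reloads every pmd thread that has its 'need_reload' flag set, then clears
 * the flag. */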
3232 static void
3233 reload_affected_pmds(struct dp_netdev *dp)
3234 {
3235 struct dp_netdev_pmd_thread *pmd;
3236
3237 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3238 if (pmd->need_reload) {
3239 dp_netdev_reload_pmd__(pmd);
3240 pmd->need_reload = false;
3241 }
3242 }
3243 }
3244
3245 static void
3246 pmd_remove_stale_ports(struct dp_netdev *dp,
3247 struct dp_netdev_pmd_thread *pmd)
3248 OVS_EXCLUDED(pmd->port_mutex)
3249 OVS_REQUIRES(dp->port_mutex)
3250 {
3251 struct rxq_poll *poll, *poll_next;
3252 struct tx_port *tx, *tx_next;
3253
3254 ovs_mutex_lock(&pmd->port_mutex);
3255 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
3256 struct dp_netdev_port *port = poll->rxq->port;
3257
3258 if (port->need_reconfigure
3259 || !hmap_contains(&dp->ports, &port->node)) {
3260 dp_netdev_del_rxq_from_pmd(pmd, poll);
3261 }
3262 }
3263 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
3264 struct dp_netdev_port *port = tx->port;
3265
3266 if (port->need_reconfigure
3267 || !hmap_contains(&dp->ports, &port->node)) {
3268 dp_netdev_del_port_tx_from_pmd(pmd, tx);
3269 }
3270 }
3271 ovs_mutex_unlock(&pmd->port_mutex);
3272 }
3273
3274 /* Must be called each time a port is added/removed or the cmask changes.
3275 * This creates and destroys pmd threads, reconfigures ports, opens their
3276 * rxqs and assigns all rxqs/txqs to pmd threads. */
3277 static void
3278 reconfigure_datapath(struct dp_netdev *dp)
3279 OVS_REQUIRES(dp->port_mutex)
3280 {
3281 struct dp_netdev_pmd_thread *pmd;
3282 struct dp_netdev_port *port;
3283 int wanted_txqs;
3284
3285 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
3286
3287 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
3288 * on the system and the user configuration. */
3289 reconfigure_pmd_threads(dp);
3290
3291 wanted_txqs = cmap_count(&dp->poll_threads);
3292
3293 /* The number of pmd threads might have changed, or a port can be new:
3294 * adjust the txqs. */
3295 HMAP_FOR_EACH (port, node, &dp->ports) {
3296 netdev_set_tx_multiq(port->netdev, wanted_txqs);
3297 }
3298
3299 /* Step 2: Remove from the pmd threads ports that have been removed or
3300 * need reconfiguration. */
3301
3302 /* Check for all the ports that need reconfiguration. We cache this in
3303 * 'port->reconfigure', because netdev_is_reconf_required() can change at
3304 * any time. */
3305 HMAP_FOR_EACH (port, node, &dp->ports) {
3306 if (netdev_is_reconf_required(port->netdev)) {
3307 port->need_reconfigure = true;
3308 }
3309 }
3310
3311 /* Remove from the pmd threads all the ports that have been deleted or
3312 * need reconfiguration. */
3313 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3314 pmd_remove_stale_ports(dp, pmd);
3315 }
3316
3317 /* Reload affected pmd threads. We must wait for the pmd threads before
3318 * reconfiguring the ports, because a port cannot be reconfigured while
3319 * it's being used. */
3320 reload_affected_pmds(dp);
3321
3322 /* Step 3: Reconfigure ports. */
3323
3324 /* We only reconfigure the ports that we determined above, because they're
3325 * not being used by any pmd thread at the moment. If a port fails to
3326 * reconfigure we remove it from the datapath. */
3327 HMAP_FOR_EACH (port, node, &dp->ports) {
3328 int err;
3329
3330 if (!port->need_reconfigure) {
3331 continue;
3332 }
3333
3334 err = port_reconfigure(port);
3335 if (err) {
3336 hmap_remove(&dp->ports, &port->node);
3337 seq_change(dp->port_seq);
3338 port_destroy(port);
3339 } else {
3340 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
3341 }
3342 }
3343
3344 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
3345 * for now, we just update the 'pmd' pointer in each rxq to point to the
3346 * wanted thread according to the scheduling policy. */
3347
3348 /* Reset all the pmd threads to non isolated. */
3349 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3350 pmd->isolated = false;
3351 }
3352
3353 /* Reset all the queues to unassigned */
3354 HMAP_FOR_EACH (port, node, &dp->ports) {
3355 for (int i = 0; i < port->n_rxq; i++) {
3356 port->rxqs[i].pmd = NULL;
3357 }
3358 }
3359
3360 /* Add pinned queues and mark pmd threads isolated. */
3361 rxq_scheduling(dp, true);
3362
3363 /* Add non-pinned queues. */
3364 rxq_scheduling(dp, false);
3365
3366 /* Step 5: Remove queues not compliant with new scheduling. */
3367 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3368 struct rxq_poll *poll, *poll_next;
3369
3370 ovs_mutex_lock(&pmd->port_mutex);
3371 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
3372 if (poll->rxq->pmd != pmd) {
3373 dp_netdev_del_rxq_from_pmd(pmd, poll);
3374 }
3375 }
3376 ovs_mutex_unlock(&pmd->port_mutex);
3377 }
3378
3379 /* Reload affected pmd threads. We must wait for the pmd threads to remove
3380 * the old queues before readding them, otherwise a queue can be polled by
3381 * two threads at the same time. */
3382 reload_affected_pmds(dp);
3383
3384 /* Step 6: Add queues from scheduling, if they're not there already. */
3385 HMAP_FOR_EACH (port, node, &dp->ports) {
3386 if (!netdev_is_pmd(port->netdev)) {
3387 continue;
3388 }
3389
3390 for (int qid = 0; qid < port->n_rxq; qid++) {
3391 struct dp_netdev_rxq *q = &port->rxqs[qid];
3392
3393 if (q->pmd) {
3394 ovs_mutex_lock(&q->pmd->port_mutex);
3395 dp_netdev_add_rxq_to_pmd(q->pmd, q);
3396 ovs_mutex_unlock(&q->pmd->port_mutex);
3397 }
3398 }
3399 }
3400
3401 /* Add every port to the tx cache of every pmd thread, if it's not
3402 * there already and if this pmd has at least one rxq to poll. */
3403 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3404 ovs_mutex_lock(&pmd->port_mutex);
3405 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
3406 HMAP_FOR_EACH (port, node, &dp->ports) {
3407 dp_netdev_add_port_tx_to_pmd(pmd, port);
3408 }
3409 }
3410 ovs_mutex_unlock(&pmd->port_mutex);
3411 }
3412
3413 /* Reload affected pmd threads. */
3414 reload_affected_pmds(dp);
3415 }
3416
3417 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
3418 static bool
3419 ports_require_restart(const struct dp_netdev *dp)
3420 OVS_REQUIRES(dp->port_mutex)
3421 {
3422 struct dp_netdev_port *port;
3423
3424 HMAP_FOR_EACH (port, node, &dp->ports) {
3425 if (netdev_is_reconf_required(port->netdev)) {
3426 return true;
3427 }
3428 }
3429
3430 return false;
3431 }
3432
3433 /* Returns true if datapath flows need to be revalidated. */
3434 static bool
3435 dpif_netdev_run(struct dpif *dpif)
3436 {
3437 struct dp_netdev_port *port;
3438 struct dp_netdev *dp = get_dp_netdev(dpif);
3439 struct dp_netdev_pmd_thread *non_pmd;
3440 uint64_t new_tnl_seq;
3441
3442 ovs_mutex_lock(&dp->port_mutex);
3443 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3444 if (non_pmd) {
3445 ovs_mutex_lock(&dp->non_pmd_mutex);
3446 HMAP_FOR_EACH (port, node, &dp->ports) {
3447 if (!netdev_is_pmd(port->netdev)) {
3448 int i;
3449
3450 for (i = 0; i < port->n_rxq; i++) {
3451 dp_netdev_process_rxq_port(non_pmd, port->rxqs[i].rx,
3452 port->port_no);
3453 }
3454 }
3455 }
3456 dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
3457 ovs_mutex_unlock(&dp->non_pmd_mutex);
3458
3459 dp_netdev_pmd_unref(non_pmd);
3460 }
3461
3462 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
3463 reconfigure_datapath(dp);
3464 }
3465 ovs_mutex_unlock(&dp->port_mutex);
3466
3467 tnl_neigh_cache_run();
3468 tnl_port_map_run();
3469 new_tnl_seq = seq_read(tnl_conf_seq);
3470
3471 if (dp->last_tnl_conf_seq != new_tnl_seq) {
3472 dp->last_tnl_conf_seq = new_tnl_seq;
3473 return true;
3474 }
3475 return false;
3476 }
3477
3478 static void
3479 dpif_netdev_wait(struct dpif *dpif)
3480 {
3481 struct dp_netdev_port *port;
3482 struct dp_netdev *dp = get_dp_netdev(dpif);
3483
3484 ovs_mutex_lock(&dp_netdev_mutex);
3485 ovs_mutex_lock(&dp->port_mutex);
3486 HMAP_FOR_EACH (port, node, &dp->ports) {
3487 netdev_wait_reconf_required(port->netdev);
3488 if (!netdev_is_pmd(port->netdev)) {
3489 int i;
3490
3491 for (i = 0; i < port->n_rxq; i++) {
3492 netdev_rxq_wait(port->rxqs[i].rx);
3493 }
3494 }
3495 }
3496 ovs_mutex_unlock(&dp->port_mutex);
3497 ovs_mutex_unlock(&dp_netdev_mutex);
3498 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
3499 }
3500
3501 static void
3502 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
3503 {
3504 struct tx_port *tx_port_cached;
3505
3506 /* Free all used tx queue ids. */
3507 dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
3508
3509 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
3510 free(tx_port_cached);
3511 }
3512 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
3513 free(tx_port_cached);
3514 }
3515 }
3516
3517 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to the
3518 * thread-local 'pmd->tnl_port_cache' and 'pmd->send_port_cache'. */
3519 static void
3520 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
3521 OVS_REQUIRES(pmd->port_mutex)
3522 {
3523 struct tx_port *tx_port, *tx_port_cached;
3524
3525 pmd_free_cached_ports(pmd);
3526 hmap_shrink(&pmd->send_port_cache);
3527 hmap_shrink(&pmd->tnl_port_cache);
3528
3529 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
3530 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
3531 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
3532 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
3533 hash_port_no(tx_port_cached->port->port_no));
3534 }
3535
3536 if (netdev_n_txq(tx_port->port->netdev)) {
3537 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
3538 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
3539 hash_port_no(tx_port_cached->port->port_no));
3540 }
3541 }
3542 }
3543
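/* Copies the pmd's current rxq assignments into '*ppoll_list' and refreshes
 * its port caches.  Returns the number of polled queues. */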
3544 static int
3545 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
3546 struct polled_queue **ppoll_list)
3547 {
3548 struct polled_queue *poll_list = *ppoll_list;
3549 struct rxq_poll *poll;
3550 int i;
3551
3552 ovs_mutex_lock(&pmd->port_mutex);
3553 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
3554 * sizeof *poll_list);
3555
3556 i = 0;
3557 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
3558 poll_list[i].rx = poll->rxq->rx;
3559 poll_list[i].port_no = poll->rxq->port->port_no;
3560 i++;
3561 }
3562
3563 pmd_load_cached_ports(pmd);
3564
3565 ovs_mutex_unlock(&pmd->port_mutex);
3566
3567 *ppoll_list = poll_list;
3568 return i;
3569 }
3570
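/* Main loop of a pmd thread: polls the assigned rx queues until asked to
 * reload its configuration or to exit. */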
3571 static void *
3572 pmd_thread_main(void *f_)
3573 {
3574 struct dp_netdev_pmd_thread *pmd = f_;
3575 unsigned int lc = 0;
3576 struct polled_queue *poll_list;
3577 bool exiting;
3578 int poll_cnt;
3579 int i;
3580
3581 poll_list = NULL;
3582
3583 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
3584 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
3585 ovs_numa_thread_setaffinity_core(pmd->core_id);
3586 dpdk_set_lcore_id(pmd->core_id);
3587 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
3588 reload:
3589 emc_cache_init(&pmd->flow_cache);
3590
3591 /* List port/core affinity */
3592 for (i = 0; i < poll_cnt; i++) {
3593 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
3594 pmd->core_id, netdev_rxq_get_name(poll_list[i].rx),
3595 netdev_rxq_get_queue_id(poll_list[i].rx));
3596 }
3597
3598 if (!poll_cnt) {
3599 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
3600 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
3601 poll_block();
3602 }
3603 lc = UINT_MAX;
3604 }
3605
3606 for (;;) {
3607 for (i = 0; i < poll_cnt; i++) {
3608 dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
3609 poll_list[i].port_no);
3610 }
3611
3612 if (lc++ > 1024) {
3613 bool reload;
3614
3615 lc = 0;
3616
3617 coverage_try_clear();
3618 dp_netdev_pmd_try_optimize(pmd);
3619 if (!ovsrcu_try_quiesce()) {
3620 emc_cache_slow_sweep(&pmd->flow_cache);
3621 }
3622
3623 atomic_read_relaxed(&pmd->reload, &reload);
3624 if (reload) {
3625 break;
3626 }
3627 }
3628 }
3629
3630 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
3631 exiting = latch_is_set(&pmd->exit_latch);
3632 /* Signal here to make sure the pmd finishes
3633 * reloading the updated configuration. */
3634 dp_netdev_pmd_reload_done(pmd);
3635
3636 emc_cache_uninit(&pmd->flow_cache);
3637
3638 if (!exiting) {
3639 goto reload;
3640 }
3641
3642 free(poll_list);
3643 pmd_free_cached_ports(pmd);
3644 return NULL;
3645 }
3646
3647 static void
3648 dp_netdev_disable_upcall(struct dp_netdev *dp)
3649 OVS_ACQUIRES(dp->upcall_rwlock)
3650 {
3651 fat_rwlock_wrlock(&dp->upcall_rwlock);
3652 }
3653
3654 \f
3655 /* Meters */
3656 static void
3657 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
3658 struct ofputil_meter_features *features)
3659 {
3660 features->max_meters = 0;
3661 features->band_types = 0;
3662 features->capabilities = 0;
3663 features->max_bands = 0;
3664 features->max_color = 0;
3665 }
3666
3667 static int
3668 dpif_netdev_meter_set(struct dpif *dpif OVS_UNUSED,
3669 ofproto_meter_id *meter_id OVS_UNUSED,
3670 struct ofputil_meter_config *config OVS_UNUSED)
3671 {
3672 return EFBIG; /* meter_id out of range */
3673 }
3674
3675 static int
3676 dpif_netdev_meter_get(const struct dpif *dpif OVS_UNUSED,
3677 ofproto_meter_id meter_id OVS_UNUSED,
3678 struct ofputil_meter_stats *stats OVS_UNUSED,
3679 uint16_t n_bands OVS_UNUSED)
3680 {
3681 return EFBIG; /* meter_id out of range */
3682 }
3683
3684 static int
3685 dpif_netdev_meter_del(struct dpif *dpif OVS_UNUSED,
3686 ofproto_meter_id meter_id OVS_UNUSED,
3687 struct ofputil_meter_stats *stats OVS_UNUSED,
3688 uint16_t n_bands OVS_UNUSED)
3689 {
3690 return EFBIG; /* meter_id out of range */
3691 }
3692
3693 \f
3694 static void
3695 dpif_netdev_disable_upcall(struct dpif *dpif)
3696 OVS_NO_THREAD_SAFETY_ANALYSIS
3697 {
3698 struct dp_netdev *dp = get_dp_netdev(dpif);
3699 dp_netdev_disable_upcall(dp);
3700 }
3701
3702 static void
3703 dp_netdev_enable_upcall(struct dp_netdev *dp)
3704 OVS_RELEASES(dp->upcall_rwlock)
3705 {
3706 fat_rwlock_unlock(&dp->upcall_rwlock);
3707 }
3708
3709 static void
3710 dpif_netdev_enable_upcall(struct dpif *dpif)
3711 OVS_NO_THREAD_SAFETY_ANALYSIS
3712 {
3713 struct dp_netdev *dp = get_dp_netdev(dpif);
3714 dp_netdev_enable_upcall(dp);
3715 }
3716
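/* Signals that this pmd thread has finished reloading its configuration. */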
3717 static void
3718 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
3719 {
3720 ovs_mutex_lock(&pmd->cond_mutex);
3721 atomic_store_relaxed(&pmd->reload, false);
3722 pmd->last_reload_seq = seq_read(pmd->reload_seq);
3723 xpthread_cond_signal(&pmd->cond);
3724 ovs_mutex_unlock(&pmd->cond_mutex);
3725 }
3726
3727 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
3728 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
3729 * 'core_id' is NON_PMD_CORE_ID).
3730 *
3731 * Caller must unref the returned reference. */
3732 static struct dp_netdev_pmd_thread *
3733 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
3734 {
3735 struct dp_netdev_pmd_thread *pmd;
3736 const struct cmap_node *pnode;
3737
3738 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
3739 if (!pnode) {
3740 return NULL;
3741 }
3742 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
3743
3744 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
3745 }
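
/* Illustrative sketch, not part of the datapath itself: a typical caller
 * pairs dp_netdev_get_pmd() with dp_netdev_pmd_unref() once it is done with
 * the thread, e.g.:
 *
 *     struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);
 *     if (pmd) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */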
3746
3747 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
3748 static void
3749 dp_netdev_set_nonpmd(struct dp_netdev *dp)
3750 OVS_REQUIRES(dp->port_mutex)
3751 {
3752 struct dp_netdev_pmd_thread *non_pmd;
3753
3754 non_pmd = xzalloc(sizeof *non_pmd);
3755 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
3756 }
3757
3758 /* Caller must have valid pointer to 'pmd'. */
3759 static bool
3760 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
3761 {
3762 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
3763 }
3764
3765 static void
3766 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
3767 {
3768 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
3769 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
3770 }
3771 }
3772
3773 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
3774  * fails, keeps checking subsequent nodes until reaching the end of the cmap.
3775  *
3776  * The caller must unref the returned reference. */
3777 static struct dp_netdev_pmd_thread *
3778 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
3779 {
3780 struct dp_netdev_pmd_thread *next;
3781
3782 do {
3783 struct cmap_node *node;
3784
3785 node = cmap_next_position(&dp->poll_threads, pos);
3786 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
3787 : NULL;
3788 } while (next && !dp_netdev_pmd_try_ref(next));
3789
3790 return next;
3791 }
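
/* Illustrative sketch of walking every pmd thread with the helper above;
 * each reference returned by dp_netdev_pmd_get_next() must be released by
 * the caller:
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */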
3792
3793 /* Configures the 'pmd' based on the input arguments. */
3794 static void
3795 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
3796 unsigned core_id, int numa_id)
3797 {
3798 pmd->dp = dp;
3799 pmd->core_id = core_id;
3800 pmd->numa_id = numa_id;
3801 pmd->need_reload = false;
3802
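    /* Use the number of pmd threads configured so far as this thread's static
     * tx queue id; the field is declared const, hence the CONST_CAST. */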
3803 *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
3804
3805 ovs_refcount_init(&pmd->ref_cnt);
3806 latch_init(&pmd->exit_latch);
3807 pmd->reload_seq = seq_create();
3808 pmd->last_reload_seq = seq_read(pmd->reload_seq);
3809 atomic_init(&pmd->reload, false);
3810 xpthread_cond_init(&pmd->cond, NULL);
3811 ovs_mutex_init(&pmd->cond_mutex);
3812 ovs_mutex_init(&pmd->flow_mutex);
3813 ovs_mutex_init(&pmd->port_mutex);
3814 cmap_init(&pmd->flow_table);
3815 cmap_init(&pmd->classifiers);
3816 pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
3817 hmap_init(&pmd->poll_list);
3818 hmap_init(&pmd->tx_ports);
3819 hmap_init(&pmd->tnl_port_cache);
3820 hmap_init(&pmd->send_port_cache);
3821     /* Initialize 'flow_cache' here because there is no
3822      * actual thread created for NON_PMD_CORE_ID. */
3823 if (core_id == NON_PMD_CORE_ID) {
3824 emc_cache_init(&pmd->flow_cache);
3825 }
3826 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
3827 hash_int(core_id, 0));
3828 }
3829
3830 static void
3831 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
3832 {
3833 struct dpcls *cls;
3834
3835 dp_netdev_pmd_flow_flush(pmd);
3836 hmap_destroy(&pmd->send_port_cache);
3837 hmap_destroy(&pmd->tnl_port_cache);
3838 hmap_destroy(&pmd->tx_ports);
3839 hmap_destroy(&pmd->poll_list);
3840     /* All flows (including their dpcls_rules) have already been deleted. */
3841 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
3842 dpcls_destroy(cls);
3843 ovsrcu_postpone(free, cls);
3844 }
3845 cmap_destroy(&pmd->classifiers);
3846 cmap_destroy(&pmd->flow_table);
3847 ovs_mutex_destroy(&pmd->flow_mutex);
3848 latch_destroy(&pmd->exit_latch);
3849 seq_destroy(pmd->reload_seq);
3850 xpthread_cond_destroy(&pmd->cond);
3851 ovs_mutex_destroy(&pmd->cond_mutex);
3852 ovs_mutex_destroy(&pmd->port_mutex);
3853 free(pmd);
3854 }
3855
3856 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
3857 * and unrefs the struct. */
3858 static void
3859 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
3860 {
3861 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
3862      * but extra cleanup is necessary. */
3863 if (pmd->core_id == NON_PMD_CORE_ID) {
3864 ovs_mutex_lock(&dp->non_pmd_mutex);
3865 emc_cache_uninit(&pmd->flow_cache);
3866 pmd_free_cached_ports(pmd);
3867 ovs_mutex_unlock(&dp->non_pmd_mutex);
3868 } else {
3869 latch_set(&pmd->exit_latch);
3870 dp_netdev_reload_pmd__(pmd);
3871 xpthread_join(pmd->thread, NULL);
3872 }
3873
3874 dp_netdev_pmd_clear_ports(pmd);
3875
3876 /* Purges the 'pmd''s flows after stopping the thread, but before
3877 * destroying the flows, so that the flow stats can be collected. */
3878 if (dp->dp_purge_cb) {
3879 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
3880 }
3881 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
3882 dp_netdev_pmd_unref(pmd);
3883 }
3884
3885 /* Destroys all pmd threads. If 'non_pmd' is true, it also destroys the
3886  * non-pmd thread. */
3887 static void
3888 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
3889 {
3890 struct dp_netdev_pmd_thread *pmd;
3891 struct dp_netdev_pmd_thread **pmd_list;
3892 size_t k = 0, n_pmds;
3893
3894 n_pmds = cmap_count(&dp->poll_threads);
3895 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
3896
3897 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3898 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
3899 continue;
3900 }
3901 /* We cannot call dp_netdev_del_pmd(), since it alters
3902 * 'dp->poll_threads' (while we're iterating it) and it
3903 * might quiesce. */
3904 ovs_assert(k < n_pmds);
3905 pmd_list[k++] = pmd;
3906 }
3907
3908 for (size_t i = 0; i < k; i++) {
3909 dp_netdev_del_pmd(dp, pmd_list[i]);
3910 }
3911 free(pmd_list);
3912 }
3913
3914 /* Deletes all rx queues from pmd->poll_list and all the ports from
3915 * pmd->tx_ports. */
3916 static void
3917 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
3918 {
3919 struct rxq_poll *poll;
3920 struct tx_port *port;
3921
3922 ovs_mutex_lock(&pmd->port_mutex);
3923 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
3924 free(poll);
3925 }
3926 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
3927 free(port);
3928 }
3929 ovs_mutex_unlock(&pmd->port_mutex);
3930 }
3931
3932 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
3933 static void
3934 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
3935 struct dp_netdev_rxq *rxq)
3936 OVS_REQUIRES(pmd->port_mutex)
3937 {
3938 int qid = netdev_rxq_get_queue_id(rxq->rx);
3939 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
3940 struct rxq_poll *poll;
3941
3942 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
3943 if (poll->rxq == rxq) {
3944 /* 'rxq' is already polled by this thread. Do nothing. */
3945 return;
3946 }
3947 }
3948
3949 poll = xmalloc(sizeof *poll);
3950 poll->rxq = rxq;
3951 hmap_insert(&pmd->poll_list, &poll->node, hash);
3952
3953 pmd->need_reload = true;
3954 }
3955
3956 /* Delete 'poll' from poll_list of PMD thread. */
3957 static void
3958 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
3959 struct rxq_poll *poll)
3960 OVS_REQUIRES(pmd->port_mutex)
3961 {
3962 hmap_remove(&pmd->poll_list, &poll->node);
3963 free(poll);
3964
3965 pmd->need_reload = true;
3966 }
3967
3968 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
3969 * changes to take effect. */
3970 static void
3971 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
3972 struct dp_netdev_port *port)
3973 OVS_REQUIRES(pmd->port_mutex)
3974 {
3975 struct tx_port *tx;
3976
3977 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
3978 if (tx) {
3979         /* 'port' is already in this thread's tx cache. Do nothing. */
3980 return;
3981 }
3982
3983 tx = xzalloc(sizeof *tx);
3984
3985 tx->port = port;
3986 tx->qid = -1;
3987
3988 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
3989 pmd->need_reload = true;
3990 }
3991
3992 /* Deletes 'tx' from the tx port cache of 'pmd', which must be reloaded for
3993  * the changes to take effect. */
3994 static void
3995 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
3996 struct tx_port *tx)
3997 OVS_REQUIRES(pmd->port_mutex)
3998 {
3999 hmap_remove(&pmd->tx_ports, &tx->node);
4000 free(tx);
4001 pmd->need_reload = true;
4002 }
4003 \f
4004 static char *
4005 dpif_netdev_get_datapath_version(void)
4006 {
4007 return xstrdup("<built-in>");
4008 }
4009
4010 static void
4011 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
4012 uint16_t tcp_flags, long long now)
4013 {
4014 uint16_t flags;
4015
4016 atomic_store_relaxed(&netdev_flow->stats.used, now);
4017 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
4018 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
4019 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
4020 flags |= tcp_flags;
4021 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
4022 }
4023
4024 static void
4025 dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
4026 enum dp_stat_type type, int cnt)
4027 {
4028 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
4029 }
4030
4031 static int
4032 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
4033 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
4034 enum dpif_upcall_type type, const struct nlattr *userdata,
4035 struct ofpbuf *actions, struct ofpbuf *put_actions)
4036 {
4037 struct dp_netdev *dp = pmd->dp;
4038
4039 if (OVS_UNLIKELY(!dp->upcall_cb)) {
4040 return ENODEV;
4041 }
4042
4043 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
4044 struct ds ds = DS_EMPTY_INITIALIZER;
4045 char *packet_str;
4046 struct ofpbuf key;
4047 struct odp_flow_key_parms odp_parms = {
4048 .flow = flow,
4049 .mask = wc ? &wc->masks : NULL,
4050 .support = dp_netdev_support,
4051 };
4052
4053 ofpbuf_init(&key, 0);
4054 odp_flow_key_from_flow(&odp_parms, &key);
4055 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
4056 dp_packet_size(packet_));
4057
4058 odp_flow_key_format(key.data, key.size, &ds);
4059
4060 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
4061 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
4062
4063 ofpbuf_uninit(&key);
4064 free(packet_str);
4065
4066 ds_destroy(&ds);
4067 }
4068
4069 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
4070 actions, wc, put_actions, dp->upcall_aux);
4071 }
4072
4073 static inline uint32_t
4074 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
4075 const struct miniflow *mf)
4076 {
4077 uint32_t hash, recirc_depth;
4078
4079 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4080 hash = dp_packet_get_rss_hash(packet);
4081 } else {
4082 hash = miniflow_hash_5tuple(mf, 0);
4083 dp_packet_set_rss_hash(packet, hash);
4084 }
4085
4086 /* The RSS hash must account for the recirculation depth to avoid
4087 * collisions in the exact match cache */
4088 recirc_depth = *recirc_depth_get_unsafe();
4089 if (OVS_UNLIKELY(recirc_depth)) {
4090 hash = hash_finish(hash, recirc_depth);
4091 dp_packet_set_rss_hash(packet, hash);
4092 }
4093 return hash;
4094 }
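
/* Illustrative sketch of the effect of the recirculation mixing above: for a
 * packet whose NIC-provided RSS hash is 'h0', the first pass (recirc depth 0)
 * uses 'h0' as the EMC hash, while after one recirculation (depth 1) the EMC
 * hash becomes hash_finish(h0, 1), so the two passes hash to different EMC
 * buckets (with overwhelming probability) even though the NIC hash is
 * unchanged. */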
4095
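/* A batch of packets that matched the same flow within one input burst.  The
 * flow's actions are executed once for the whole 'array' instead of once per
 * packet. */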
4096 struct packet_batch_per_flow {
4097 unsigned int byte_count;
4098 uint16_t tcp_flags;
4099 struct dp_netdev_flow *flow;
4100
4101 struct dp_packet_batch array;
4102 };
4103
4104 static inline void
4105 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
4106 struct dp_packet *packet,
4107 const struct miniflow *mf)
4108 {
4109 batch->byte_count += dp_packet_size(packet);
4110 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
4111 batch->array.packets[batch->array.count++] = packet;
4112 }
4113
4114 static inline void
4115 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
4116 struct dp_netdev_flow *flow)
4117 {
4118 flow->batch = batch;
4119
4120 batch->flow = flow;
4121 dp_packet_batch_init(&batch->array);
4122 batch->byte_count = 0;
4123 batch->tcp_flags = 0;
4124 }
4125
4126 static inline void
4127 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
4128 struct dp_netdev_pmd_thread *pmd,
4129 long long now)
4130 {
4131 struct dp_netdev_actions *actions;
4132 struct dp_netdev_flow *flow = batch->flow;
4133
4134 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
4135 batch->tcp_flags, now);
4136
4137 actions = dp_netdev_flow_get_actions(flow);
4138
4139 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
4140 actions->actions, actions->size, now);
4141 }
4142
4143 static inline void
4144 dp_netdev_queue_batches(struct dp_packet *pkt,
4145 struct dp_netdev_flow *flow, const struct miniflow *mf,
4146 struct packet_batch_per_flow *batches, size_t *n_batches)
4147 {
4148 struct packet_batch_per_flow *batch = flow->batch;
4149
4150 if (OVS_UNLIKELY(!batch)) {
4151 batch = &batches[(*n_batches)++];
4152 packet_batch_per_flow_init(batch, flow);
4153 }
4154
4155 packet_batch_per_flow_update(batch, pkt, mf);
4156 }
4157
4158 /* Tries to process all the packets in 'packets_' using only the exact match
4159  * cache 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]',
4160  * its miniflow is copied into 'keys' and the packet pointer is moved to the
4161  * beginning of the 'packets' array.
4162  *
4163  * The function returns the number of packets that still need to be processed
4164  * in the 'packets' array (they have been moved to the beginning of the vector).
4165  *
4166  * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must
4167  * be initialized by this function using 'port_no'.
4168  */
4169 static inline size_t
4170 emc_processing(struct dp_netdev_pmd_thread *pmd,
4171 struct dp_packet_batch *packets_,
4172 struct netdev_flow_key *keys,
4173 struct packet_batch_per_flow batches[], size_t *n_batches,
4174 bool md_is_valid, odp_port_t port_no)
4175 {
4176 struct emc_cache *flow_cache = &pmd->flow_cache;
4177 struct netdev_flow_key *key = &keys[0];
4178 size_t n_missed = 0, n_dropped = 0;
4179 struct dp_packet *packet;
4180 const size_t size = dp_packet_batch_size(packets_);
4181 int i;
4182
4183 DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets_) {
4184 struct dp_netdev_flow *flow;
4185
4186 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
4187 dp_packet_delete(packet);
4188 n_dropped++;
4189 continue;
4190 }
4191
4192 if (i != size - 1) {
4193 struct dp_packet **packets = packets_->packets;
4194 /* Prefetch next packet data and metadata. */
4195 OVS_PREFETCH(dp_packet_data(packets[i+1]));
4196 pkt_metadata_prefetch_init(&packets[i+1]->md);
4197 }
4198
4199 if (!md_is_valid) {
4200 pkt_metadata_init(&packet->md, port_no);
4201 }
4202 miniflow_extract(packet, &key->mf);
4203 key->len = 0; /* Not computed yet. */
4204 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
4205
4206 flow = emc_lookup(flow_cache, key);
4207 if (OVS_LIKELY(flow)) {
4208 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
4209 n_batches);
4210 } else {
4211 /* Exact match cache missed. Group missed packets together at
4212 * the beginning of the 'packets' array. */
4213 dp_packet_batch_refill(packets_, packet, i);
4214             /* 'keys[n_missed]' contains the key of the current packet and it
4215 * must be returned to the caller. The next key should be extracted
4216 * to 'keys[n_missed + 1]'. */
4217 key = &keys[++n_missed];
4218 }
4219 }
4220
4221 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, size - n_dropped - n_missed);
4222
4223 return dp_packet_batch_size(packets_);
4224 }
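
/* Illustrative sketch, not upstream code, of how a caller consumes the result
 * of emc_processing(): on return the batch holds only the EMC misses, in
 * order, and 'keys' lines up with them one to one:
 *
 *     size_t n_missed = emc_processing(pmd, packets, keys, batches,
 *                                      &n_batches, md_is_valid, port_no);
 *     for (size_t j = 0; j < n_missed; j++) {
 *         ... keys[j] describes packets->packets[j] ...
 *     }
 */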
4225
4226 static inline void
4227 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
4228 const struct netdev_flow_key *key,
4229 struct ofpbuf *actions, struct ofpbuf *put_actions,
4230 int *lost_cnt, long long now)
4231 {
4232 struct ofpbuf *add_actions;
4233 struct dp_packet_batch b;
4234 struct match match;
4235 ovs_u128 ufid;
4236 int error;
4237
4238 match.tun_md.valid = false;
4239 miniflow_expand(&key->mf, &match.flow);
4240
4241 ofpbuf_clear(actions);
4242 ofpbuf_clear(put_actions);
4243
4244 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
4245 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
4246 &ufid, DPIF_UC_MISS, NULL, actions,
4247 put_actions);
4248 if (OVS_UNLIKELY(error && error != ENOSPC)) {
4249 dp_packet_delete(packet);
4250 (*lost_cnt)++;
4251 return;
4252 }
4253
4254 /* The Netlink encoding of datapath flow keys cannot express
4255 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
4256 * tag is interpreted as exact match on the fact that there is no
4257 * VLAN. Unless we refactor a lot of code that translates between
4258 * Netlink and struct flow representations, we have to do the same
4259 * here. */
4260 if (!match.wc.masks.vlan_tci) {
4261 match.wc.masks.vlan_tci = htons(0xffff);
4262 }
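    /* For example, a rule that the OpenFlow layer wanted to leave the VLAN
     * entirely wildcarded still installs here with vlan_tci exact-matched (as
     * zero for an untagged packet) because of the encoding limitation
     * described above. */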
4263
4264 /* We can't allow the packet batching in the next loop to execute
4265 * the actions. Otherwise, if there are any slow path actions,
4266 * we'll send the packet up twice. */
4267 dp_packet_batch_init_packet(&b, packet);
4268 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
4269 actions->data, actions->size, now);
4270
4271 add_actions = put_actions->size ? put_actions : actions;
4272 if (OVS_LIKELY(error != ENOSPC)) {
4273 struct dp_netdev_flow *netdev_flow;
4274
4275 /* XXX: There's a race window where a flow covering this packet
4276 * could have already been installed since we last did the flow
4277 * lookup before upcall. This could be solved by moving the
4278 * mutex lock outside the loop, but that's an awful long time
4279 * to be locking everyone out of making flow installs. If we
4280 * move to a per-core classifier, it would be reasonable. */
4281 ovs_mutex_lock(&pmd->flow_mutex);
4282 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
4283 if (OVS_LIKELY(!netdev_flow)) {
4284 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
4285 add_actions->data,
4286 add_actions->size);
4287 }
4288 ovs_mutex_unlock(&pmd->flow_mutex);
4289 emc_probabilistic_insert(pmd, key, netdev_flow);
4290 }
4291 }
4292
4293 static inline void
4294 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
4295 struct dp_packet_batch *packets_,
4296 struct netdev_flow_key *keys,
4297 struct packet_batch_per_flow batches[], size_t *n_batches,
4298 odp_port_t in_port,
4299 long long now)
4300 {
4301 int cnt = packets_->count;
4302 #if !defined(__CHECKER__) && !defined(_WIN32)
4303 const size_t PKT_ARRAY_SIZE = cnt;
4304 #else
4305 /* Sparse or MSVC doesn't like variable length array. */
4306 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
4307 #endif
4308 struct dp_packet **packets = packets_->packets;
4309 struct dpcls *cls;
4310 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
4311 struct dp_netdev *dp = pmd->dp;
4312 int miss_cnt = 0, lost_cnt = 0;
4313 int lookup_cnt = 0, add_lookup_cnt;
4314 bool any_miss;
4315 size_t i;
4316
4317 for (i = 0; i < cnt; i++) {
4318         /* Key length is needed in all cases; the hash is computed on demand. */
4319 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
4320 }
4321 /* Get the classifier for the in_port */
4322 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
4323 if (OVS_LIKELY(cls)) {
4324 any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
4325 } else {
4326 any_miss = true;
4327 memset(rules, 0, sizeof(rules));
4328 }
4329 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
4330 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
4331 struct ofpbuf actions, put_actions;
4332
4333 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
4334 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
4335
4336 for (i = 0; i < cnt; i++) {
4337 struct dp_netdev_flow *netdev_flow;
4338
4339 if (OVS_LIKELY(rules[i])) {
4340 continue;
4341 }
4342
4343 /* It's possible that an earlier slow path execution installed
4344 * a rule covering this flow. In this case, it's a lot cheaper
4345 * to catch it here than execute a miss. */
4346 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
4347 &add_lookup_cnt);
4348 if (netdev_flow) {
4349 lookup_cnt += add_lookup_cnt;
4350 rules[i] = &netdev_flow->cr;
4351 continue;
4352 }
4353
4354 miss_cnt++;
4355 handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
4356 &put_actions, &lost_cnt, now);
4357 }
4358
4359 ofpbuf_uninit(&actions);
4360 ofpbuf_uninit(&put_actions);
4361 fat_rwlock_unlock(&dp->upcall_rwlock);
4362 } else if (OVS_UNLIKELY(any_miss)) {
4363 for (i = 0; i < cnt; i++) {
4364 if (OVS_UNLIKELY(!rules[i])) {
4365 dp_packet_delete(packets[i]);
4366 lost_cnt++;
4367 miss_cnt++;
4368 }
4369 }
4370 }
4371
4372 for (i = 0; i < cnt; i++) {
4373 struct dp_packet *packet = packets[i];
4374 struct dp_netdev_flow *flow;
4375
4376 if (OVS_UNLIKELY(!rules[i])) {
4377 continue;
4378 }
4379
4380 flow = dp_netdev_flow_cast(rules[i]);
4381
4382 emc_probabilistic_insert(pmd, &keys[i], flow);
4383 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
4384 }
4385
4386 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
4387 dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
4388 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
4389 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
4390 }
4391
4392 /* Packets enter the datapath from a port (or from recirculation) here.
4393 *
4394 * For performance reasons a caller may choose not to initialize the metadata
4395  * in 'packets': in this case 'md_is_valid' is false and this function needs to
4396 * initialize it using 'port_no'. If the metadata in 'packets' is already
4397 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
4398 static void
4399 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
4400 struct dp_packet_batch *packets,
4401 bool md_is_valid, odp_port_t port_no)
4402 {
4403 int cnt = packets->count;
4404 #if !defined(__CHECKER__) && !defined(_WIN32)
4405 const size_t PKT_ARRAY_SIZE = cnt;
4406 #else
4407 /* Sparse or MSVC doesn't like variable length array. */
4408 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
4409 #endif
4410 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct netdev_flow_key keys[PKT_ARRAY_SIZE];
4411 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
4412 long long now = time_msec();
4413 size_t n_batches;
4414 odp_port_t in_port;
4415
4416 n_batches = 0;
4417 emc_processing(pmd, packets, keys, batches, &n_batches,
4418 md_is_valid, port_no);
4419 if (!dp_packet_batch_is_empty(packets)) {
4420 /* Get ingress port from first packet's metadata. */
4421 in_port = packets->packets[0]->md.in_port.odp_port;
4422 fast_path_processing(pmd, packets, keys, batches, &n_batches, in_port, now);
4423 }
4424
4425 /* All the flow batches need to be reset before any call to
4426 * packet_batch_per_flow_execute() as it could potentially trigger
4427      * recirculation. When a packet matching flow 'j' happens to be
4428 * recirculated, the nested call to dp_netdev_input__() could potentially
4429 * classify the packet as matching another flow - say 'k'. It could happen
4430 * that in the previous call to dp_netdev_input__() that same flow 'k' had
4431 * already its own batches[k] still waiting to be served. So if its
4432      * 'batch' member is not reset, the recirculated packet would be wrongly
4433 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
4434 size_t i;
4435 for (i = 0; i < n_batches; i++) {
4436 batches[i].flow->batch = NULL;
4437 }
4438
4439 for (i = 0; i < n_batches; i++) {
4440 packet_batch_per_flow_execute(&batches[i], pmd, now);
4441 }
4442 }
4443
4444 static void
4445 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
4446 struct dp_packet_batch *packets,
4447 odp_port_t port_no)
4448 {
4449 dp_netdev_input__(pmd, packets, false, port_no);
4450 }
4451
4452 static void
4453 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
4454 struct dp_packet_batch *packets)
4455 {
4456 dp_netdev_input__(pmd, packets, true, 0);
4457 }
4458
4459 struct dp_netdev_execute_aux {
4460 struct dp_netdev_pmd_thread *pmd;
4461 long long now;
4462 const struct flow *flow;
4463 };
4464
4465 static void
4466 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
4467 void *aux)
4468 {
4469 struct dp_netdev *dp = get_dp_netdev(dpif);
4470 dp->dp_purge_aux = aux;
4471 dp->dp_purge_cb = cb;
4472 }
4473
4474 static void
4475 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
4476 void *aux)
4477 {
4478 struct dp_netdev *dp = get_dp_netdev(dpif);
4479 dp->upcall_aux = aux;
4480 dp->upcall_cb = cb;
4481 }
4482
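/* Releases the dynamic tx queue ids that 'pmd' has not used for at least
 * XPS_TIMEOUT_MS, or all of them if 'purge' is true, so that other pmd
 * threads can pick them up. */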
4483 static void
4484 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
4485 long long now, bool purge)
4486 {
4487 struct tx_port *tx;
4488 struct dp_netdev_port *port;
4489 long long interval;
4490
4491 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
4492 if (!tx->port->dynamic_txqs) {
4493 continue;
4494 }
4495 interval = now - tx->last_used;
4496 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
4497 port = tx->port;
4498 ovs_mutex_lock(&port->txq_used_mutex);
4499 port->txq_used[tx->qid]--;
4500 ovs_mutex_unlock(&port->txq_used_mutex);
4501 tx->qid = -1;
4502 }
4503 }
4504 }
4505
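/* Picks the tx queue id that 'pmd' should use for 'tx'.  A previously
 * assigned queue is reused as long as it has been used within the last
 * XPS_TIMEOUT_MS; otherwise the least loaded queue of the port is selected
 * and the per-queue usage counters are updated accordingly. */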
4506 static int
4507 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
4508 struct tx_port *tx, long long now)
4509 {
4510 struct dp_netdev_port *port;
4511 long long interval;
4512 int i, min_cnt, min_qid;
4513
4514 if (OVS_UNLIKELY(!now)) {
4515 now = time_msec();
4516 }
4517
4518 interval = now - tx->last_used;
4519 tx->last_used = now;
4520
4521 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
4522 return tx->qid;
4523 }
4524
4525 port = tx->port;
4526
4527 ovs_mutex_lock(&port->txq_used_mutex);
4528 if (tx->qid >= 0) {
4529 port->txq_used[tx->qid]--;
4530 tx->qid = -1;
4531 }
4532
4533 min_cnt = -1;
4534 min_qid = 0;
4535 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
4536 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
4537 min_cnt = port->txq_used[i];
4538 min_qid = i;
4539 }
4540 }
4541
4542 port->txq_used[min_qid]++;
4543 tx->qid = min_qid;
4544
4545 ovs_mutex_unlock(&port->txq_used_mutex);
4546
4547 dpif_netdev_xps_revalidate_pmd(pmd, now, false);
4548
4549 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
4550 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
4551 return min_qid;
4552 }
4553
4554 static struct tx_port *
4555 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
4556 odp_port_t port_no)
4557 {
4558 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
4559 }
4560
4561 static struct tx_port *
4562 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
4563 odp_port_t port_no)
4564 {
4565 return tx_port_lookup(&pmd->send_port_cache, port_no);
4566 }
4567
4568 static int
4569 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
4570 const struct nlattr *attr,
4571 struct dp_packet_batch *batch)
4572 {
4573 struct tx_port *tun_port;
4574 const struct ovs_action_push_tnl *data;
4575 int err;
4576
4577 data = nl_attr_get(attr);
4578
4579 tun_port = pmd_tnl_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
4580 if (!tun_port) {
4581 err = -EINVAL;
4582 goto error;
4583 }
4584 err = netdev_push_header(tun_port->port->netdev, batch, data);
4585 if (!err) {
4586 return 0;
4587 }
4588 error:
4589 dp_packet_delete_batch(batch, true);
4590 return err;
4591 }
4592
4593 static void
4594 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
4595 struct dp_packet *packet, bool may_steal,
4596 struct flow *flow, ovs_u128 *ufid,
4597 struct ofpbuf *actions,
4598 const struct nlattr *userdata, long long now)
4599 {
4600 struct dp_packet_batch b;
4601 int error;
4602
4603 ofpbuf_clear(actions);
4604
4605 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
4606 DPIF_UC_ACTION, userdata, actions,
4607 NULL);
4608 if (!error || error == ENOSPC) {
4609 dp_packet_batch_init_packet(&b, packet);
4610 dp_netdev_execute_actions(pmd, &b, may_steal, flow,
4611 actions->data, actions->size, now);
4612 } else if (may_steal) {
4613 dp_packet_delete(packet);
4614 }
4615 }
4616
4617 static void
4618 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
4619 const struct nlattr *a, bool may_steal)
4620 {
4621 struct dp_netdev_execute_aux *aux = aux_;
4622 uint32_t *depth = recirc_depth_get();
4623 struct dp_netdev_pmd_thread *pmd = aux->pmd;
4624 struct dp_netdev *dp = pmd->dp;
4625 int type = nl_attr_type(a);
4626 long long now = aux->now;
4627 struct tx_port *p;
4628
4629 switch ((enum ovs_action_attr)type) {
4630 case OVS_ACTION_ATTR_OUTPUT:
4631 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
4632 if (OVS_LIKELY(p)) {
4633 int tx_qid;
4634 bool dynamic_txqs;
4635
4636 dynamic_txqs = p->port->dynamic_txqs;
4637 if (dynamic_txqs) {
4638 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
4639 } else {
4640 tx_qid = pmd->static_tx_qid;
4641 }
4642
4643 netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
4644 dynamic_txqs);
4645 return;
4646 }
4647 break;
4648
4649 case OVS_ACTION_ATTR_TUNNEL_PUSH:
4650 if (*depth < MAX_RECIRC_DEPTH) {
4651 struct dp_packet_batch tnl_pkt;
4652 struct dp_packet_batch *orig_packets_ = packets_;
4653 int err;
4654
4655 if (!may_steal) {
4656 dp_packet_batch_clone(&tnl_pkt, packets_);
4657 packets_ = &tnl_pkt;
4658 dp_packet_batch_reset_cutlen(orig_packets_);
4659 }
4660
4661 dp_packet_batch_apply_cutlen(packets_);
4662
4663 err = push_tnl_action(pmd, a, packets_);
4664 if (!err) {
4665 (*depth)++;
4666 dp_netdev_recirculate(pmd, packets_);
4667 (*depth)--;
4668 }
4669 return;
4670 }
4671 break;
4672
4673 case OVS_ACTION_ATTR_TUNNEL_POP:
4674 if (*depth < MAX_RECIRC_DEPTH) {
4675 struct dp_packet_batch *orig_packets_ = packets_;
4676 odp_port_t portno = nl_attr_get_odp_port(a);
4677
4678 p = pmd_tnl_port_cache_lookup(pmd, portno);
4679 if (p) {
4680 struct dp_packet_batch tnl_pkt;
4681
4682 if (!may_steal) {
4683 dp_packet_batch_clone(&tnl_pkt, packets_);
4684 packets_ = &tnl_pkt;
4685 dp_packet_batch_reset_cutlen(orig_packets_);
4686 }
4687
4688 dp_packet_batch_apply_cutlen(packets_);
4689
4690 netdev_pop_header(p->port->netdev, packets_);
4691 if (dp_packet_batch_is_empty(packets_)) {
4692 return;
4693 }
4694
4695 struct dp_packet *packet;
4696 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
4697 packet->md.in_port.odp_port = portno;
4698 }
4699
4700 (*depth)++;
4701 dp_netdev_recirculate(pmd, packets_);
4702 (*depth)--;
4703 return;
4704 }
4705 }
4706 break;
4707
4708 case OVS_ACTION_ATTR_USERSPACE:
4709 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
4710 struct dp_packet_batch *orig_packets_ = packets_;
4711 const struct nlattr *userdata;
4712 struct dp_packet_batch usr_pkt;
4713 struct ofpbuf actions;
4714 struct flow flow;
4715 ovs_u128 ufid;
4716 bool clone = false;
4717
4718 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
4719 ofpbuf_init(&actions, 0);
4720
4721 if (packets_->trunc) {
4722 if (!may_steal) {
4723 dp_packet_batch_clone(&usr_pkt, packets_);
4724 packets_ = &usr_pkt;
4725 clone = true;
4726 dp_packet_batch_reset_cutlen(orig_packets_);
4727 }
4728
4729 dp_packet_batch_apply_cutlen(packets_);
4730 }
4731
4732 struct dp_packet *packet;
4733 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
4734 flow_extract(packet, &flow);
4735 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
4736 dp_execute_userspace_action(pmd, packet, may_steal, &flow,
4737 &ufid, &actions, userdata, now);
4738 }
4739
4740 if (clone) {
4741 dp_packet_delete_batch(packets_, true);
4742 }
4743
4744 ofpbuf_uninit(&actions);
4745 fat_rwlock_unlock(&dp->upcall_rwlock);
4746
4747 return;
4748 }
4749 break;
4750
4751 case OVS_ACTION_ATTR_RECIRC:
4752 if (*depth < MAX_RECIRC_DEPTH) {
4753 struct dp_packet_batch recirc_pkts;
4754
4755 if (!may_steal) {
4756 dp_packet_batch_clone(&recirc_pkts, packets_);
4757 packets_ = &recirc_pkts;
4758 }
4759
4760 struct dp_packet *packet;
4761 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
4762 packet->md.recirc_id = nl_attr_get_u32(a);
4763 }
4764
4765 (*depth)++;
4766 dp_netdev_recirculate(pmd, packets_);
4767 (*depth)--;
4768
4769 return;
4770 }
4771
4772 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
4773 break;
4774
4775 case OVS_ACTION_ATTR_CT: {
4776 const struct nlattr *b;
4777 bool commit = false;
4778 unsigned int left;
4779 uint16_t zone = 0;
4780 const char *helper = NULL;
4781 const uint32_t *setmark = NULL;
4782 const struct ovs_key_ct_labels *setlabel = NULL;
4783
4784 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
4785 nl_attr_get_size(a)) {
4786 enum ovs_ct_attr sub_type = nl_attr_type(b);
4787
4788 switch(sub_type) {
4789 case OVS_CT_ATTR_COMMIT:
4790 commit = true;
4791 break;
4792 case OVS_CT_ATTR_ZONE:
4793 zone = nl_attr_get_u16(b);
4794 break;
4795 case OVS_CT_ATTR_HELPER:
4796 helper = nl_attr_get_string(b);
4797 break;
4798 case OVS_CT_ATTR_MARK:
4799 setmark = nl_attr_get(b);
4800 break;
4801 case OVS_CT_ATTR_LABELS:
4802 setlabel = nl_attr_get(b);
4803 break;
4804 case OVS_CT_ATTR_NAT:
4805 case OVS_CT_ATTR_UNSPEC:
4806 case __OVS_CT_ATTR_MAX:
4807 OVS_NOT_REACHED();
4808 }
4809 }
4810
4811 conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, commit,
4812 zone, setmark, setlabel, helper);
4813 break;
4814 }
4815
4816 case OVS_ACTION_ATTR_METER:
4817 case OVS_ACTION_ATTR_PUSH_VLAN:
4818 case OVS_ACTION_ATTR_POP_VLAN:
4819 case OVS_ACTION_ATTR_PUSH_MPLS:
4820 case OVS_ACTION_ATTR_POP_MPLS:
4821 case OVS_ACTION_ATTR_SET:
4822 case OVS_ACTION_ATTR_SET_MASKED:
4823 case OVS_ACTION_ATTR_SAMPLE:
4824 case OVS_ACTION_ATTR_HASH:
4825 case OVS_ACTION_ATTR_UNSPEC:
4826 case OVS_ACTION_ATTR_TRUNC:
4827 case OVS_ACTION_ATTR_PUSH_ETH:
4828 case OVS_ACTION_ATTR_POP_ETH:
4829 case OVS_ACTION_ATTR_CLONE:
4830 case __OVS_ACTION_ATTR_MAX:
4831 OVS_NOT_REACHED();
4832 }
4833
4834 dp_packet_delete_batch(packets_, may_steal);
4835 }
4836
4837 static void
4838 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
4839 struct dp_packet_batch *packets,
4840 bool may_steal, const struct flow *flow,
4841 const struct nlattr *actions, size_t actions_len,
4842 long long now)
4843 {
4844 struct dp_netdev_execute_aux aux = { pmd, now, flow };
4845
4846 odp_execute_actions(&aux, packets, may_steal, actions,
4847 actions_len, dp_execute_cb);
4848 }
4849
4850 struct dp_netdev_ct_dump {
4851 struct ct_dpif_dump_state up;
4852 struct conntrack_dump dump;
4853 struct conntrack *ct;
4854 struct dp_netdev *dp;
4855 };
4856
4857 static int
4858 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
4859 const uint16_t *pzone)
4860 {
4861 struct dp_netdev *dp = get_dp_netdev(dpif);
4862 struct dp_netdev_ct_dump *dump;
4863
4864 dump = xzalloc(sizeof *dump);
4865 dump->dp = dp;
4866 dump->ct = &dp->conntrack;
4867
4868 conntrack_dump_start(&dp->conntrack, &dump->dump, pzone);
4869
4870 *dump_ = &dump->up;
4871
4872 return 0;
4873 }
4874
4875 static int
4876 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
4877 struct ct_dpif_dump_state *dump_,
4878 struct ct_dpif_entry *entry)
4879 {
4880 struct dp_netdev_ct_dump *dump;
4881
4882 INIT_CONTAINER(dump, dump_, up);
4883
4884 return conntrack_dump_next(&dump->dump, entry);
4885 }
4886
4887 static int
4888 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
4889 struct ct_dpif_dump_state *dump_)
4890 {
4891 struct dp_netdev_ct_dump *dump;
4892 int err;
4893
4894 INIT_CONTAINER(dump, dump_, up);
4895
4896 err = conntrack_dump_done(&dump->dump);
4897
4898 free(dump);
4899
4900 return err;
4901 }
4902
4903 static int
4904 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
4905 {
4906 struct dp_netdev *dp = get_dp_netdev(dpif);
4907
4908 return conntrack_flush(&dp->conntrack, zone);
4909 }
4910
4911 const struct dpif_class dpif_netdev_class = {
4912 "netdev",
4913 dpif_netdev_init,
4914 dpif_netdev_enumerate,
4915 dpif_netdev_port_open_type,
4916 dpif_netdev_open,
4917 dpif_netdev_close,
4918 dpif_netdev_destroy,
4919 dpif_netdev_run,
4920 dpif_netdev_wait,
4921 dpif_netdev_get_stats,
4922 dpif_netdev_port_add,
4923 dpif_netdev_port_del,
4924 dpif_netdev_port_set_config,
4925 dpif_netdev_port_query_by_number,
4926 dpif_netdev_port_query_by_name,
4927 NULL, /* port_get_pid */
4928 dpif_netdev_port_dump_start,
4929 dpif_netdev_port_dump_next,
4930 dpif_netdev_port_dump_done,
4931 dpif_netdev_port_poll,
4932 dpif_netdev_port_poll_wait,
4933 dpif_netdev_flow_flush,
4934 dpif_netdev_flow_dump_create,
4935 dpif_netdev_flow_dump_destroy,
4936 dpif_netdev_flow_dump_thread_create,
4937 dpif_netdev_flow_dump_thread_destroy,
4938 dpif_netdev_flow_dump_next,
4939 dpif_netdev_operate,
4940 NULL, /* recv_set */
4941 NULL, /* handlers_set */
4942 dpif_netdev_set_config,
4943 dpif_netdev_queue_to_priority,
4944 NULL, /* recv */
4945 NULL, /* recv_wait */
4946 NULL, /* recv_purge */
4947 dpif_netdev_register_dp_purge_cb,
4948 dpif_netdev_register_upcall_cb,
4949 dpif_netdev_enable_upcall,
4950 dpif_netdev_disable_upcall,
4951 dpif_netdev_get_datapath_version,
4952 dpif_netdev_ct_dump_start,
4953 dpif_netdev_ct_dump_next,
4954 dpif_netdev_ct_dump_done,
4955 dpif_netdev_ct_flush,
4956 dpif_netdev_meter_get_features,
4957 dpif_netdev_meter_set,
4958 dpif_netdev_meter_get,
4959 dpif_netdev_meter_del,
4960 };
4961
4962 static void
4963 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
4964 const char *argv[], void *aux OVS_UNUSED)
4965 {
4966 struct dp_netdev_port *port;
4967 struct dp_netdev *dp;
4968 odp_port_t port_no;
4969
4970 ovs_mutex_lock(&dp_netdev_mutex);
4971 dp = shash_find_data(&dp_netdevs, argv[1]);
4972 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
4973 ovs_mutex_unlock(&dp_netdev_mutex);
4974 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
4975 return;
4976 }
4977 ovs_refcount_ref(&dp->ref_cnt);
4978 ovs_mutex_unlock(&dp_netdev_mutex);
4979
4980 ovs_mutex_lock(&dp->port_mutex);
4981 if (get_port_by_name(dp, argv[2], &port)) {
4982 unixctl_command_reply_error(conn, "unknown port");
4983 goto exit;
4984 }
4985
4986 port_no = u32_to_odp(atoi(argv[3]));
4987 if (!port_no || port_no == ODPP_NONE) {
4988 unixctl_command_reply_error(conn, "bad port number");
4989 goto exit;
4990 }
4991 if (dp_netdev_lookup_port(dp, port_no)) {
4992 unixctl_command_reply_error(conn, "port number already in use");
4993 goto exit;
4994 }
4995
4996 /* Remove port. */
4997 hmap_remove(&dp->ports, &port->node);
4998 reconfigure_datapath(dp);
4999
5000 /* Reinsert with new port number. */
5001 port->port_no = port_no;
5002 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
5003 reconfigure_datapath(dp);
5004
5005 seq_change(dp->port_seq);
5006 unixctl_command_reply(conn, NULL);
5007
5008 exit:
5009 ovs_mutex_unlock(&dp->port_mutex);
5010 dp_netdev_unref(dp);
5011 }
5012
5013 static void
5014 dpif_dummy_register__(const char *type)
5015 {
5016 struct dpif_class *class;
5017
5018 class = xmalloc(sizeof *class);
5019 *class = dpif_netdev_class;
5020 class->type = xstrdup(type);
5021 dp_register_provider(class);
5022 }
5023
5024 static void
5025 dpif_dummy_override(const char *type)
5026 {
5027 int error;
5028
5029 /*
5030 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
5031      * a userland-only build. It's useful for the testsuite.
5032 */
5033 error = dp_unregister_provider(type);
5034 if (error == 0 || error == EAFNOSUPPORT) {
5035 dpif_dummy_register__(type);
5036 }
5037 }
5038
5039 void
5040 dpif_dummy_register(enum dummy_level level)
5041 {
5042 if (level == DUMMY_OVERRIDE_ALL) {
5043 struct sset types;
5044 const char *type;
5045
5046 sset_init(&types);
5047 dp_enumerate_types(&types);
5048 SSET_FOR_EACH (type, &types) {
5049 dpif_dummy_override(type);
5050 }
5051 sset_destroy(&types);
5052 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
5053 dpif_dummy_override("system");
5054 }
5055
5056 dpif_dummy_register__("dummy");
5057
5058 unixctl_command_register("dpif-dummy/change-port-number",
5059 "dp port new-number",
5060 3, 3, dpif_dummy_change_port_number, NULL);
5061 }
5062 \f
5063 /* Datapath Classifier. */
5064
5065 /* A set of rules that all have the same fields wildcarded. */
5066 struct dpcls_subtable {
5067 /* The fields are only used by writers. */
5068 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
5069
5070 /* These fields are accessed by readers. */
5071 struct cmap rules; /* Contains "struct dpcls_rule"s. */
5072 uint32_t hit_cnt; /* Number of match hits in subtable in current
5073 optimization interval. */
5074 struct netdev_flow_key mask; /* Wildcards for fields (const). */
5075 /* 'mask' must be the last field, additional space is allocated here. */
5076 };
5077
5078 /* Initializes 'cls' as a classifier that initially contains no classification
5079 * rules. */
5080 static void
5081 dpcls_init(struct dpcls *cls)
5082 {
5083 cmap_init(&cls->subtables_map);
5084 pvector_init(&cls->subtables);
5085 }
5086
5087 static void
5088 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
5089 {
5090 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
5091 pvector_remove(&cls->subtables, subtable);
5092 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
5093 subtable->mask.hash);
5094 cmap_destroy(&subtable->rules);
5095 ovsrcu_postpone(free, subtable);
5096 }
5097
5098 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
5099 * caller's responsibility.
5100 * May only be called after all the readers have been terminated. */
5101 static void
5102 dpcls_destroy(struct dpcls *cls)
5103 {
5104 if (cls) {
5105 struct dpcls_subtable *subtable;
5106
5107 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
5108 ovs_assert(cmap_count(&subtable->rules) == 0);
5109 dpcls_destroy_subtable(cls, subtable);
5110 }
5111 cmap_destroy(&cls->subtables_map);
5112 pvector_destroy(&cls->subtables);
5113 }
5114 }
5115
5116 static struct dpcls_subtable *
5117 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
5118 {
5119 struct dpcls_subtable *subtable;
5120
5121 /* Need to add one. */
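    /* 'mask' ends in a variable-length inline miniflow, so allocate the struct
     * minus the nominal size of 'mask.mf' plus the 'mask->len' bytes of
     * miniflow data that are copied when 'mask' is cloned just below. */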
5122 subtable = xmalloc(sizeof *subtable
5123 - sizeof subtable->mask.mf + mask->len);
5124 cmap_init(&subtable->rules);
5125 subtable->hit_cnt = 0;
5126 netdev_flow_key_clone(&subtable->mask, mask);
5127 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
5128     /* Add the new subtable at the end of the pvector (with no hits yet). */
5129 pvector_insert(&cls->subtables, subtable, 0);
5130 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
5131 cmap_count(&cls->subtables_map), subtable, cls->in_port);
5132 pvector_publish(&cls->subtables);
5133
5134 return subtable;
5135 }
5136
5137 static inline struct dpcls_subtable *
5138 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
5139 {
5140 struct dpcls_subtable *subtable;
5141
5142 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
5143 &cls->subtables_map) {
5144 if (netdev_flow_key_equal(&subtable->mask, mask)) {
5145 return subtable;
5146 }
5147 }
5148 return dpcls_create_subtable(cls, mask);
5149 }
5150
5151
5152 /* Periodically sort the dpcls subtable vectors according to hit counts. */
5153 static void
5154 dpcls_sort_subtable_vector(struct dpcls *cls)
5155 {
5156 struct pvector *pvec = &cls->subtables;
5157 struct dpcls_subtable *subtable;
5158
5159 PVECTOR_FOR_EACH (subtable, pvec) {
5160 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
5161 subtable->hit_cnt = 0;
5162 }
5163 pvector_publish(pvec);
5164 }
5165
5166 static inline void
5167 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd)
5168 {
5169 struct dpcls *cls;
5170 long long int now = time_msec();
5171
5172 if (now > pmd->next_optimization) {
5173 /* Try to obtain the flow lock to block out revalidator threads.
5174 * If not possible, just try next time. */
5175 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
5176 /* Optimize each classifier */
5177 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
5178 dpcls_sort_subtable_vector(cls);
5179 }
5180 ovs_mutex_unlock(&pmd->flow_mutex);
5181 /* Start new measuring interval */
5182 pmd->next_optimization = now + DPCLS_OPTIMIZATION_INTERVAL;
5183 }
5184 }
5185 }
5186
5187 /* Insert 'rule' into 'cls'. */
5188 static void
5189 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
5190 const struct netdev_flow_key *mask)
5191 {
5192 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
5193
5194 /* Refer to subtable's mask, also for later removal. */
5195 rule->mask = &subtable->mask;
5196 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
5197 }
5198
5199 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
5200 static void
5201 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
5202 {
5203 struct dpcls_subtable *subtable;
5204
5205 ovs_assert(rule->mask);
5206
5207 /* Get subtable from reference in rule->mask. */
5208 INIT_CONTAINER(subtable, rule->mask, mask);
5209 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
5210 == 0) {
5211 /* Delete empty subtable. */
5212 dpcls_destroy_subtable(cls, subtable);
5213 pvector_publish(&cls->subtables);
5214 }
5215 }
5216
5217 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in
5218  * the rule's mask the bits of the rule's flow and of 'target' are equal. */
5219 static inline bool
5220 dpcls_rule_matches_key(const struct dpcls_rule *rule,
5221 const struct netdev_flow_key *target)
5222 {
5223 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
5224 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5225 uint64_t value;
5226
5227 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
5228 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
5229 return false;
5230 }
5231 }
5232 return true;
5233 }
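
/* Illustrative sketch of the masked comparison above, for a single 64-bit
 * value (made-up numbers):
 *
 *     target value:     0x0a000001aabbccdd
 *     rule mask value:  0xffffffff00000000
 *     rule flow value:  0x0a00000100000000
 *
 * (0x0a000001aabbccdd & 0xffffffff00000000) == 0x0a00000100000000, so this
 * value matches; every value selected by the rule's flowmap must match for
 * dpcls_rule_matches_key() to return true. */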
5234
5235 /* For each miniflow in 'keys' performs a classifier lookup writing the result
5236 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
5237 * NULL it is skipped.
5238 *
5239 * This function is optimized for use in the userspace datapath and therefore
5240 * does not implement a lot of features available in the standard
5241 * classifier_lookup() function. Specifically, it does not implement
5242 * priorities, instead returning any rule which matches the flow.
5243 *
5244 * Returns true if all miniflows found a corresponding rule. */
5245 static bool
5246 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
5247 struct dpcls_rule **rules, const size_t cnt,
5248 int *num_lookups_p)
5249 {
5250     /* The received 'cnt' miniflows are the search keys that will be processed
5251      * to find a matching entry in the available subtables.
5252      * The number of bits in map_type must be at least NETDEV_MAX_BURST. */
5253 typedef uint32_t map_type;
5254 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
5255 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
5256
5257 struct dpcls_subtable *subtable;
5258
5259 map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
5260 map_type found_map;
5261 uint32_t hashes[MAP_BITS];
5262 const struct cmap_node *nodes[MAP_BITS];
5263
5264 if (cnt != MAP_BITS) {
5265 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
5266 }
5267 memset(rules, 0, cnt * sizeof *rules);
5268
5269 int lookups_match = 0, subtable_pos = 1;
5270
5271 /* The Datapath classifier - aka dpcls - is composed of subtables.
5272 * Subtables are dynamically created as needed when new rules are inserted.
5273 * Each subtable collects rules with matches on a specific subset of packet
5274 * fields as defined by the subtable's mask. We proceed to process every
5275 * search-key against each subtable, but when a match is found for a
5276 * search-key, the search for that key can stop because the rules are
5277 * non-overlapping. */
5278 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
5279 int i;
5280
5281 /* Compute hashes for the remaining keys. Each search-key is
5282 * masked with the subtable's mask to avoid hashing the wildcarded
5283 * bits. */
5284 ULLONG_FOR_EACH_1(i, keys_map) {
5285 hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
5286 &subtable->mask);
5287 }
5288 /* Lookup. */
5289 found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
5290 /* Check results. When the i-th bit of found_map is set, it means
5291 * that a set of nodes with a matching hash value was found for the
5292 * i-th search-key. Due to possible hash collisions we need to check
5293 * which of the found rules, if any, really matches our masked
5294 * search-key. */
5295 ULLONG_FOR_EACH_1(i, found_map) {
5296 struct dpcls_rule *rule;
5297
5298 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
5299 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
5300 rules[i] = rule;
5301 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
5302 * within one second optimization interval. */
5303 subtable->hit_cnt++;
5304 lookups_match += subtable_pos;
5305 goto next;
5306 }
5307 }
5308 /* None of the found rules was a match. Reset the i-th bit to
5309 * keep searching this key in the next subtable. */
5310 ULLONG_SET0(found_map, i); /* Did not match. */
5311 next:
5312 ; /* Keep Sparse happy. */
5313 }
5314 keys_map &= ~found_map; /* Clear the found rules. */
5315 if (!keys_map) {
5316 if (num_lookups_p) {
5317 *num_lookups_p = lookups_match;
5318 }
5319 return true; /* All found. */
5320 }
5321 subtable_pos++;
5322 }
5323 if (num_lookups_p) {
5324 *num_lookups_p = lookups_match;
5325 }
5326 return false; /* Some misses. */
5327 }
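
/* Illustrative sketch of the bitmap bookkeeping in dpcls_lookup() for a burst
 * of four packets (made-up values):
 *
 *     keys_map  = 0b1111     all four keys still unmatched
 *     found_map = 0b0101     hash hits for keys 0 and 2 in this subtable
 *     key 2's candidate rules fail dpcls_rule_matches_key()
 *         => ULLONG_SET0(found_map, 2), found_map = 0b0001
 *     keys_map &= ~found_map => keys_map = 0b1110
 *
 * Only key 0 is resolved by this subtable; keys 1, 2 and 3 move on to the
 * next subtable in the pvector. */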