1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <sys/types.h>
26 #include <netinet/in.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/ioctl.h>
31 #include <sys/socket.h>
32 #include <sys/stat.h>
33 #include <unistd.h>
34
35 #include "bitmap.h"
36 #include "cmap.h"
37 #include "conntrack.h"
38 #include "coverage.h"
39 #include "ct-dpif.h"
40 #include "csum.h"
41 #include "dp-packet.h"
42 #include "dpif.h"
43 #include "dpif-netdev-perf.h"
44 #include "dpif-provider.h"
45 #include "dummy.h"
46 #include "fat-rwlock.h"
47 #include "flow.h"
48 #include "hmapx.h"
49 #include "id-pool.h"
50 #include "latch.h"
51 #include "netdev.h"
52 #include "netdev-vport.h"
53 #include "netlink.h"
54 #include "odp-execute.h"
55 #include "odp-util.h"
56 #include "openvswitch/dynamic-string.h"
57 #include "openvswitch/list.h"
58 #include "openvswitch/match.h"
59 #include "openvswitch/ofp-print.h"
60 #include "openvswitch/ofp-util.h"
61 #include "openvswitch/ofpbuf.h"
62 #include "openvswitch/shash.h"
63 #include "openvswitch/vlog.h"
64 #include "ovs-numa.h"
65 #include "ovs-rcu.h"
66 #include "packets.h"
67 #include "openvswitch/poll-loop.h"
68 #include "pvector.h"
69 #include "random.h"
70 #include "seq.h"
71 #include "smap.h"
72 #include "sset.h"
73 #include "timeval.h"
74 #include "tnl-neigh-cache.h"
75 #include "tnl-ports.h"
76 #include "unixctl.h"
77 #include "util.h"
78
79 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
80
81 #define FLOW_DUMP_MAX_BATCH 50
82 /* Use a per-thread recirc_depth to prevent recirculation loops. */
83 #define MAX_RECIRC_DEPTH 6
84 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
85
86 /* Use instant packet send by default. */
87 #define DEFAULT_TX_FLUSH_INTERVAL 0
88
89 /* Configuration parameters. */
90 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
91 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
92 enum { MAX_BANDS = 8 };          /* Maximum number of bands per meter. */
93 enum { N_METER_LOCKS = 64 };     /* Number of meter locks. */
94
95 /* Protects against changes to 'dp_netdevs'. */
96 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
97
98 /* Contains all 'struct dp_netdev's. */
99 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
100 = SHASH_INITIALIZER(&dp_netdevs);
101
102 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
103
104 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
105 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
106 | CS_SRC_NAT | CS_DST_NAT)
107 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
108
109 static struct odp_support dp_netdev_support = {
110 .max_vlan_headers = SIZE_MAX,
111 .max_mpls_depth = SIZE_MAX,
112 .recirc = true,
113 .ct_state = true,
114 .ct_zone = true,
115 .ct_mark = true,
116 .ct_label = true,
117 .ct_state_nat = true,
118 .ct_orig_tuple = true,
119 .ct_orig_tuple6 = true,
120 };
121
122 /* Stores a miniflow with inline values */
123
124 struct netdev_flow_key {
125 uint32_t hash; /* Hash function differs for different users. */
126 uint32_t len; /* Length of the following miniflow (incl. map). */
127 struct miniflow mf;
128 uint64_t buf[FLOW_MAX_PACKET_U64S];
129 };
130
131 /* Exact match cache for frequently used flows
132 *
133 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
134 * search its entries for a miniflow that matches exactly the miniflow of the
135 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
136 *
137 * A cache entry holds a reference to its 'dp_netdev_flow'.
138 *
139 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
140 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
141 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
142 * value is the index of a cache entry where the miniflow could be.
143 *
144 *
145 * Thread-safety
146 * =============
147 *
148 * Each pmd_thread has its own private exact match cache.
149 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
150 */
151
152 #define EM_FLOW_HASH_SHIFT 13
153 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
154 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
155 #define EM_FLOW_HASH_SEGS 2
156
157 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
158 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
159 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
160 DEFAULT_EM_FLOW_INSERT_INV_PROB)
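/* A minimal sketch of how this threshold is typically applied (the names
 * 'pmd', 'key' and 'flow' are assumed here; the real check lives in the EMC
 * insertion path later in this file).  An entry is added only when a random
 * 32-bit value is at or below 'emc_insert_min', i.e. with probability
 * emc_insert_min / UINT32_MAX:
 *
 *     uint32_t min;
 *
 *     atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
 *     if (min && random_uint32() <= min) {
 *         emc_insert(&pmd->flow_cache, key, flow);
 *     }
 */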
161
162 struct emc_entry {
163 struct dp_netdev_flow *flow;
164 struct netdev_flow_key key; /* key.hash used for emc hash value. */
165 };
166
167 struct emc_cache {
168 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
169 int sweep_idx; /* For emc_cache_slow_sweep(). */
170 };
171
172 /* Iterate in the exact match cache through every entry that might contain a
173 * miniflow with hash 'HASH'. */
174 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
175 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
176 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
177 i__ < EM_FLOW_HASH_SEGS; \
178 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
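/* Illustrative use of the iterator above (a sketch only; 'cache' and 'key'
 * are assumed, and a full match also compares the miniflows, as done in
 * emc_lookup() later in this file).  With EM_FLOW_HASH_SHIFT 13 and
 * EM_FLOW_HASH_SEGS 2, the two candidate slots for a hash are
 * (hash & 0x1fff) and ((hash >> 13) & 0x1fff):
 *
 *     struct emc_entry *entry;
 *
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, entry, key->hash) {
 *         if (entry->key.hash == key->hash && emc_entry_alive(entry)) {
 *             return entry->flow;
 *         }
 *     }
 */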
179 \f
180 /* Simple non-wildcarding single-priority classifier. */
181
182 /* Time in microseconds between successive optimizations of the dpcls
183 * subtable vector */
184 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
185
186 /* Time in microseconds of the interval in which the rxq processing cycles
187    used for rxq to pmd assignments are measured and stored. */
188 #define PMD_RXQ_INTERVAL_LEN 10000000LL
189
190 /* Number of intervals for which cycles are stored
191 * and used during rxq to pmd assignment. */
192 #define PMD_RXQ_INTERVAL_MAX 6
193
194 struct dpcls {
195 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
196 odp_port_t in_port;
197 struct cmap subtables_map;
198 struct pvector subtables;
199 };
200
201 /* A rule to be inserted to the classifier. */
202 struct dpcls_rule {
203 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
204 struct netdev_flow_key *mask; /* Subtable's mask. */
205 struct netdev_flow_key flow; /* Matching key. */
206 /* 'flow' must be the last field, additional space is allocated here. */
207 };
208
209 static void dpcls_init(struct dpcls *);
210 static void dpcls_destroy(struct dpcls *);
211 static void dpcls_sort_subtable_vector(struct dpcls *);
212 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
213 const struct netdev_flow_key *mask);
214 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
215 static bool dpcls_lookup(struct dpcls *cls,
216 const struct netdev_flow_key keys[],
217 struct dpcls_rule **rules, size_t cnt,
218 int *num_lookups_p);
219 \f
220 /* Set of supported meter flags */
221 #define DP_SUPPORTED_METER_FLAGS_MASK \
222 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
223
224 /* Set of supported meter band types */
225 #define DP_SUPPORTED_METER_BAND_TYPES \
226 ( 1 << OFPMBT13_DROP )
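/* A rough sketch of how these masks are typically applied when validating a
 * meter configuration ('config' is assumed to be a struct
 * ofputil_meter_config; the real checks live in the meter_set handler later
 * in this file).  In this sketch, unsupported flags yield EBADF and
 * unsupported band types yield ENODEV:
 *
 *     if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
 *         return EBADF;
 *     }
 *     for (uint16_t i = 0; i < config->n_bands; i++) {
 *         if (!(DP_SUPPORTED_METER_BAND_TYPES
 *               & (1 << config->bands[i].type))) {
 *             return ENODEV;
 *         }
 *     }
 */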
227
228 struct dp_meter_band {
229 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
230 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
231 uint64_t packet_count;
232 uint64_t byte_count;
233 };
234
235 struct dp_meter {
236 uint16_t flags;
237 uint16_t n_bands;
238 uint32_t max_delta_t;
239 uint64_t used;
240 uint64_t packet_count;
241 uint64_t byte_count;
242 struct dp_meter_band bands[];
243 };
244
245 /* Datapath based on the network device interface from netdev.h.
246 *
247 *
248 * Thread-safety
249 * =============
250 *
251 * Some members, marked 'const', are immutable. Accessing other members
252 * requires synchronization, as noted in more detail below.
253 *
254 * Acquisition order is, from outermost to innermost:
255 *
256 * dp_netdev_mutex (global)
257 * port_mutex
258 * non_pmd_mutex
259 */
260 struct dp_netdev {
261 const struct dpif_class *const class;
262 const char *const name;
263 struct dpif *dpif;
264 struct ovs_refcount ref_cnt;
265 atomic_flag destroyed;
266
267 /* Ports.
268 *
269 * Any lookup into 'ports' or any access to the dp_netdev_ports found
270 * through 'ports' requires taking 'port_mutex'. */
271 struct ovs_mutex port_mutex;
272 struct hmap ports;
273 struct seq *port_seq; /* Incremented whenever a port changes. */
274
275 /* The time that a packet can wait in output batch for sending. */
276 atomic_uint32_t tx_flush_interval;
277
278 /* Meters. */
279 struct ovs_mutex meter_locks[N_METER_LOCKS];
280 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
281
282 /* The probability of EMC insertion is determined by 'emc_insert_min'. */
283 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
284
285 /* Protects access to ofproto-dpif-upcall interface during revalidator
286 * thread synchronization. */
287 struct fat_rwlock upcall_rwlock;
288 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
289 void *upcall_aux;
290
291 /* Callback function for notifying about the purging of dp flows (during
292 * pmd reset or deletion). */
293 dp_purge_callback *dp_purge_cb;
294 void *dp_purge_aux;
295
296 /* Stores all 'struct dp_netdev_pmd_thread's. */
297 struct cmap poll_threads;
298 /* Id pool for per-thread static_tx_qid. */
299 struct id_pool *tx_qid_pool;
300 struct ovs_mutex tx_qid_pool_mutex;
301
302 /* Protects access to the 'struct dp_netdev_pmd_thread'
303 * instance for the non-pmd thread. */
304 struct ovs_mutex non_pmd_mutex;
305
306 /* Each pmd thread will store its pointer to
307 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
308 ovsthread_key_t per_pmd_key;
309
310 struct seq *reconfigure_seq;
311 uint64_t last_reconfigure_seq;
312
313 /* CPU mask for pinning pmd threads. */
314 char *pmd_cmask;
315
316 uint64_t last_tnl_conf_seq;
317
318 struct conntrack conntrack;
319 };
320
321 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
322 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
323 {
324 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
325 }
326
327 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
328 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
329 {
330 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
331 }
332
333
334 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
335 odp_port_t)
336 OVS_REQUIRES(dp->port_mutex);
337
338 enum rxq_cycles_counter_type {
339 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
340 processing packets during the current
341 interval. */
342 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
343 during rxq to pmd assignment. */
344 RXQ_N_CYCLES
345 };
346
347 #define XPS_TIMEOUT 500000LL /* In microseconds. */
348
349 /* Contained by struct dp_netdev_port's 'rxqs' member. */
350 struct dp_netdev_rxq {
351 struct dp_netdev_port *port;
352 struct netdev_rxq *rx;
353 unsigned core_id; /* Core to which this queue should be
354 pinned. OVS_CORE_UNSPEC if the
355 queue doesn't need to be pinned to a
356 particular core. */
357 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
358 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
359
360 /* Counters of cycles spent successfully polling and processing pkts. */
361 atomic_ullong cycles[RXQ_N_CYCLES];
362 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
363 sum them to yield the cycles used for an rxq. */
364 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
365 };
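/* A minimal sketch of the summing described above, as used when reporting
 * per-rxq pmd usage (see pmd_info_show_rxq() further down; 'rxq' is
 * assumed):
 *
 *     uint64_t proc_cycles = 0;
 *
 *     for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
 *         proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
 *     }
 */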
366
367 /* A port in a netdev-based datapath. */
368 struct dp_netdev_port {
369 odp_port_t port_no;
370 bool dynamic_txqs; /* If true XPS will be used. */
371 bool need_reconfigure; /* True if we should reconfigure netdev. */
372 struct netdev *netdev;
373 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
374 struct netdev_saved_flags *sf;
375 struct dp_netdev_rxq *rxqs;
376 unsigned n_rxq; /* Number of elements in 'rxqs' */
377 unsigned *txq_used; /* Number of threads that use each tx queue. */
378 struct ovs_mutex txq_used_mutex;
379 char *type; /* Port type as requested by user. */
380 char *rxq_affinity_list; /* Requested affinity of rx queues. */
381 };
382
383 /* Contained by struct dp_netdev_flow's 'stats' member. */
384 struct dp_netdev_flow_stats {
385 atomic_llong used; /* Last used time, in monotonic msecs. */
386 atomic_ullong packet_count; /* Number of packets matched. */
387 atomic_ullong byte_count; /* Number of bytes matched. */
388 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
389 };
390
391 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
392 *
393 *
394 * Thread-safety
395 * =============
396 *
397 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
398 * its pmd thread's classifier. The text below calls this classifier 'cls'.
399 *
400 * Motivation
401 * ----------
402 *
403 * The thread safety rules described here for "struct dp_netdev_flow" are
404 * motivated by two goals:
405 *
406 * - Prevent threads that read members of "struct dp_netdev_flow" from
407 * reading bad data due to changes by some thread concurrently modifying
408 * those members.
409 *
410 * - Prevent two threads making changes to members of a given "struct
411 * dp_netdev_flow" from interfering with each other.
412 *
413 *
414 * Rules
415 * -----
416 *
417 * A flow 'flow' may be accessed without a risk of being freed during an RCU
418 * grace period. Code that needs to hold onto a flow for a while
419 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
420 *
421 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
422 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
423 * from modification.
424 *
425 * Some members, marked 'const', are immutable. Accessing other members
426 * requires synchronization, as noted in more detail below.
427 */
428 struct dp_netdev_flow {
429 const struct flow flow; /* Unmasked flow that created this entry. */
430 /* Hash table index by unmasked flow. */
431 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
432 /* 'flow_table'. */
433 const ovs_u128 ufid; /* Unique flow identifier. */
434 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
435 /* flow. */
436
437 /* Number of references.
438 * The classifier owns one reference.
439 * Any thread trying to keep a rule from being freed should hold its own
440 * reference. */
441 struct ovs_refcount ref_cnt;
442
443 bool dead;
444
445 /* Statistics. */
446 struct dp_netdev_flow_stats stats;
447
448 /* Actions. */
449 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
450
451 /* While processing a group of input packets, the datapath uses the next
452 * member to store a pointer to the output batch for the flow. It is
453 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
454 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
455 struct packet_batch_per_flow *batch;
456
457 /* Packet classification. */
458 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
459 /* 'cr' must be the last member. */
460 };
461
462 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
463 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
464 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
465 struct flow *, bool);
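/* A minimal sketch of the reference pattern described above (the lookup step
 * is a placeholder; any code that obtained 'flow' under RCU protection works
 * the same way):
 *
 *     struct dp_netdev_flow *flow = ...lookup under RCU...;
 *
 *     if (flow && dp_netdev_flow_ref(flow)) {
 *         ...use 'flow' beyond the current RCU grace period...
 *         dp_netdev_flow_unref(flow);
 *     }
 */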
466
467 /* A set of datapath actions within a "struct dp_netdev_flow".
468 *
469 *
470 * Thread-safety
471 * =============
472 *
473 * A struct dp_netdev_actions 'actions' is protected with RCU. */
474 struct dp_netdev_actions {
475 /* These members are immutable: they do not change during the struct's
476 * lifetime. */
477 unsigned int size; /* Size of 'actions', in bytes. */
478 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
479 };
480
481 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
482 size_t);
483 struct dp_netdev_actions *dp_netdev_flow_get_actions(
484 const struct dp_netdev_flow *);
485 static void dp_netdev_actions_free(struct dp_netdev_actions *);
486
487 struct polled_queue {
488 struct dp_netdev_rxq *rxq;
489 odp_port_t port_no;
490 };
491
492 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
493 struct rxq_poll {
494 struct dp_netdev_rxq *rxq;
495 struct hmap_node node;
496 };
497
498 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
499 * 'tnl_port_cache' or 'tx_ports'. */
500 struct tx_port {
501 struct dp_netdev_port *port;
502 int qid;
503 long long last_used;
504 struct hmap_node node;
505 long long flush_time;
506 struct dp_packet_batch output_pkts;
507 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
508 };
509
510 /* A set of properties for the current processing loop that is not directly
511 * associated with the pmd thread itself, but with the packets being
512 * processed or the short-term system configuration (for example, time).
513 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
514 struct dp_netdev_pmd_thread_ctx {
515 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
516 long long now;
517 /* RX queue from which last packet was received. */
518 struct dp_netdev_rxq *last_rxq;
519 };
520
521 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
522 * the performance overhead of interrupt processing.  Therefore netdev
523 * cannot implement rx-wait for these devices; dpif-netdev needs to poll
524 * them to check their receive buffers.  A pmd thread polls the devices
525 * assigned to it.
526 *
527 * DPDK uses PMDs to access NICs.
528 *
529 * Note: the instance with cpu core id NON_PMD_CORE_ID is reserved for the
530 * I/O of all non-pmd threads.  No actual thread is created for that
531 * instance.
532 *
533 * Each struct has its own flow cache and one classifier per managed ingress
534 * port.  For packets received on an ingress port, a lookup is first done in
535 * the corresponding PMD thread's flow cache; on a miss, a lookup is
536 * performed in that port's classifier.  In either case the packets are
537 * executed with the actions that were found.
538 */
539 struct dp_netdev_pmd_thread {
540 struct dp_netdev *dp;
541 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
542 struct cmap_node node; /* In 'dp->poll_threads'. */
543
544 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
545 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
546
547 /* Per thread exact-match cache. Note, the instance for cpu core
548 * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
549 * needs to be protected by 'non_pmd_mutex'.  Every other instance
550 * will only be accessed by its own pmd thread. */
551 struct emc_cache flow_cache;
552
553 /* Flow-Table and classifiers
554 *
555 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
556 * changes to 'classifiers' must be made while still holding the
557 * 'flow_mutex'.
558 */
559 struct ovs_mutex flow_mutex;
560 struct cmap flow_table OVS_GUARDED; /* Flow table. */
561
562 /* One classifier per in_port polled by the pmd */
563 struct cmap classifiers;
564 /* Periodically sort subtable vectors according to hit frequencies */
565 long long int next_optimization;
566 /* End of the next time interval for which processing cycles
567 are stored for each polled rxq. */
568 long long int rxq_next_cycle_store;
569
570 /* Last interval timestamp. */
571 uint64_t intrvl_tsc_prev;
572 /* Last interval cycles. */
573 atomic_ullong intrvl_cycles;
574
575 /* Current context of the PMD thread. */
576 struct dp_netdev_pmd_thread_ctx ctx;
577
578 struct latch exit_latch; /* For terminating the pmd thread. */
579 struct seq *reload_seq;
580 uint64_t last_reload_seq;
581 atomic_bool reload; /* Do we need to reload ports? */
582 pthread_t thread;
583 unsigned core_id; /* CPU core id of this pmd thread. */
584 int numa_id; /* numa node id of this pmd thread. */
585 bool isolated;
586
587 /* Queue id used by this pmd thread to send packets on all netdevs for
588 * which XPS is disabled.  All static_tx_qid's are unique and less
589 * than 'cmap_count(dp->poll_threads)'. */
590 uint32_t static_tx_qid;
591
592 /* Number of filled output batches. */
593 int n_output_batches;
594
595 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
596 /* List of rx queues to poll. */
597 struct hmap poll_list OVS_GUARDED;
598 /* Map of 'tx_port's used for transmission. Written by the main thread,
599 * read by the pmd thread. */
600 struct hmap tx_ports OVS_GUARDED;
601
602 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
603 * ports (that support push_tunnel/pop_tunnel), the other contains ports
604 * with at least one txq (that support send). A port can be in both.
605 *
606 * There are two separate maps to make sure that we don't try to execute
607 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
608 *
609 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
610 * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
611 * other instance will only be accessed by its own pmd thread. */
612 struct hmap tnl_port_cache;
613 struct hmap send_port_cache;
614
615 /* Keep track of detailed PMD performance statistics. */
616 struct pmd_perf_stats perf_stats;
617
618 /* Set to true if the pmd thread needs to be reloaded. */
619 bool need_reload;
620 };
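/* A minimal sketch of the two-level lookup described above (a simplified,
 * single-packet view; 'key' and 'in_port' are assumed, and emc_lookup() and
 * dp_netdev_flow_cast() are helpers defined later in this file):
 *
 *     struct dp_netdev_flow *flow = emc_lookup(&pmd->flow_cache, key);
 *
 *     if (!flow) {
 *         struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 *         struct dpcls_rule *rule = NULL;
 *
 *         if (cls && dpcls_lookup(cls, key, &rule, 1, NULL)) {
 *             flow = dp_netdev_flow_cast(rule);
 *         }
 *     }
 */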
621
622 /* Interface to netdev-based datapath. */
623 struct dpif_netdev {
624 struct dpif dpif;
625 struct dp_netdev *dp;
626 uint64_t last_port_seq;
627 };
628
629 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
630 struct dp_netdev_port **portp)
631 OVS_REQUIRES(dp->port_mutex);
632 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
633 struct dp_netdev_port **portp)
634 OVS_REQUIRES(dp->port_mutex);
635 static void dp_netdev_free(struct dp_netdev *)
636 OVS_REQUIRES(dp_netdev_mutex);
637 static int do_add_port(struct dp_netdev *dp, const char *devname,
638 const char *type, odp_port_t port_no)
639 OVS_REQUIRES(dp->port_mutex);
640 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
641 OVS_REQUIRES(dp->port_mutex);
642 static int dpif_netdev_open(const struct dpif_class *, const char *name,
643 bool create, struct dpif **);
644 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
645 struct dp_packet_batch *,
646 bool may_steal, const struct flow *flow,
647 const struct nlattr *actions,
648 size_t actions_len);
649 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
650 struct dp_packet_batch *, odp_port_t port_no);
651 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
652 struct dp_packet_batch *);
653
654 static void dp_netdev_disable_upcall(struct dp_netdev *);
655 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
656 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
657 struct dp_netdev *dp, unsigned core_id,
658 int numa_id);
659 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
660 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
661 OVS_REQUIRES(dp->port_mutex);
662
663 static void *pmd_thread_main(void *);
664 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
665 unsigned core_id);
666 static struct dp_netdev_pmd_thread *
667 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
668 static void dp_netdev_del_pmd(struct dp_netdev *dp,
669 struct dp_netdev_pmd_thread *pmd);
670 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
671 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
672 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
673 struct dp_netdev_port *port)
674 OVS_REQUIRES(pmd->port_mutex);
675 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
676 struct tx_port *tx)
677 OVS_REQUIRES(pmd->port_mutex);
678 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
679 struct dp_netdev_rxq *rxq)
680 OVS_REQUIRES(pmd->port_mutex);
681 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
682 struct rxq_poll *poll)
683 OVS_REQUIRES(pmd->port_mutex);
684 static int
685 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
686 bool force);
687
688 static void reconfigure_datapath(struct dp_netdev *dp)
689 OVS_REQUIRES(dp->port_mutex);
690 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
691 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
692 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
693 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
694 OVS_REQUIRES(pmd->port_mutex);
695 static inline void
696 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
697 struct polled_queue *poll_list, int poll_cnt);
698 static void
699 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
700 enum rxq_cycles_counter_type type,
701 unsigned long long cycles);
702 static uint64_t
703 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
704 enum rxq_cycles_counter_type type);
705 static void
706 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
707 unsigned long long cycles);
708 static uint64_t
709 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
710 static void
711 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
712 bool purge);
713 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
714 struct tx_port *tx);
715
716 static inline bool emc_entry_alive(struct emc_entry *ce);
717 static void emc_clear_entry(struct emc_entry *ce);
718
719 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
720
721 static void
722 emc_cache_init(struct emc_cache *flow_cache)
723 {
724 int i;
725
726 flow_cache->sweep_idx = 0;
727 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
728 flow_cache->entries[i].flow = NULL;
729 flow_cache->entries[i].key.hash = 0;
730 flow_cache->entries[i].key.len = sizeof(struct miniflow);
731 flowmap_init(&flow_cache->entries[i].key.mf.map);
732 }
733 }
734
735 static void
736 emc_cache_uninit(struct emc_cache *flow_cache)
737 {
738 int i;
739
740 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
741 emc_clear_entry(&flow_cache->entries[i]);
742 }
743 }
744
745 /* Check and clear dead flow references slowly (one entry at each
746 * invocation). */
747 static void
748 emc_cache_slow_sweep(struct emc_cache *flow_cache)
749 {
750 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
751
752 if (!emc_entry_alive(entry)) {
753 emc_clear_entry(entry);
754 }
755 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
756 }
757
758 /* Updates the time in the PMD thread's context and should be called in three cases:
759 *
760 * 1. PMD structure initialization:
761 * - dp_netdev_configure_pmd()
762 *
763 * 2. Before processing of the new packet batch:
764 * - dpif_netdev_execute()
765 * - dp_netdev_process_rxq_port()
766 *
767 * 3. At least once per polling iteration in the main polling threads, if no
768 *    packets were received in the current iteration:
769 * - dpif_netdev_run()
770 * - pmd_thread_main()
771 *
772 * 'pmd->ctx.now' should be used without update in all other cases if possible.
773 */
774 static inline void
775 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
776 {
777 pmd->ctx.now = time_usec();
778 }
779
780 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
781 bool
782 dpif_is_netdev(const struct dpif *dpif)
783 {
784 return dpif->dpif_class->open == dpif_netdev_open;
785 }
786
787 static struct dpif_netdev *
788 dpif_netdev_cast(const struct dpif *dpif)
789 {
790 ovs_assert(dpif_is_netdev(dpif));
791 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
792 }
793
794 static struct dp_netdev *
795 get_dp_netdev(const struct dpif *dpif)
796 {
797 return dpif_netdev_cast(dpif)->dp;
798 }
799 \f
800 enum pmd_info_type {
801 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
802 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
803 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
804 };
805
806 static void
807 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
808 {
809 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
810 ? "main thread" : "pmd thread");
811 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
812 ds_put_format(reply, " numa_id %d", pmd->numa_id);
813 }
814 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
815 ds_put_format(reply, " core_id %u", pmd->core_id);
816 }
817 ds_put_cstr(reply, ":\n");
818 }
819
820 static void
821 pmd_info_show_stats(struct ds *reply,
822 struct dp_netdev_pmd_thread *pmd)
823 {
824 uint64_t stats[PMD_N_STATS];
825 uint64_t total_cycles, total_packets;
826 double passes_per_pkt = 0;
827 double lookups_per_hit = 0;
828 double packets_per_batch = 0;
829
830 pmd_perf_read_counters(&pmd->perf_stats, stats);
831 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
832 + stats[PMD_CYCLES_ITER_BUSY];
833 total_packets = stats[PMD_STAT_RECV];
834
835 format_pmd_thread(reply, pmd);
836
837 if (total_packets > 0) {
838 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
839 / (double) total_packets;
840 }
841 if (stats[PMD_STAT_MASKED_HIT] > 0) {
842 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
843 / (double) stats[PMD_STAT_MASKED_HIT];
844 }
845 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
846 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
847 / (double) stats[PMD_STAT_SENT_BATCHES];
848 }
849
850 ds_put_format(reply,
851 "\tpackets received: %"PRIu64"\n"
852 "\tpacket recirculations: %"PRIu64"\n"
853 "\tavg. datapath passes per packet: %.02f\n"
854 "\temc hits: %"PRIu64"\n"
855 "\tmegaflow hits: %"PRIu64"\n"
856 "\tavg. subtable lookups per megaflow hit: %.02f\n"
857 "\tmiss with success upcall: %"PRIu64"\n"
858 "\tmiss with failed upcall: %"PRIu64"\n"
859 "\tavg. packets per output batch: %.02f\n",
860 total_packets, stats[PMD_STAT_RECIRC],
861 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
862 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
863 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
864 packets_per_batch);
865
866 if (total_cycles == 0) {
867 return;
868 }
869
870 ds_put_format(reply,
871 "\tidle cycles: %"PRIu64" (%.02f%%)\n"
872 "\tprocessing cycles: %"PRIu64" (%.02f%%)\n",
873 stats[PMD_CYCLES_ITER_IDLE],
874 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
875 stats[PMD_CYCLES_ITER_BUSY],
876 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
877
878 if (total_packets == 0) {
879 return;
880 }
881
882 ds_put_format(reply,
883 "\tavg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
884 total_cycles / (double) total_packets,
885 total_cycles, total_packets);
886
887 ds_put_format(reply,
888 "\tavg processing cycles per packet: "
889 "%.02f (%"PRIu64"/%"PRIu64")\n",
890 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
891 stats[PMD_CYCLES_ITER_BUSY], total_packets);
892 }
893
894 static int
895 compare_poll_list(const void *a_, const void *b_)
896 {
897 const struct rxq_poll *a = a_;
898 const struct rxq_poll *b = b_;
899
900 const char *namea = netdev_rxq_get_name(a->rxq->rx);
901 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
902
903 int cmp = strcmp(namea, nameb);
904 if (!cmp) {
905 return netdev_rxq_get_queue_id(a->rxq->rx)
906 - netdev_rxq_get_queue_id(b->rxq->rx);
907 } else {
908 return cmp;
909 }
910 }
911
912 static void
913 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
914 size_t *n)
915 {
916 struct rxq_poll *ret, *poll;
917 size_t i;
918
919 *n = hmap_count(&pmd->poll_list);
920 if (!*n) {
921 ret = NULL;
922 } else {
923 ret = xcalloc(*n, sizeof *ret);
924 i = 0;
925 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
926 ret[i] = *poll;
927 i++;
928 }
929 ovs_assert(i == *n);
930 qsort(ret, *n, sizeof *ret, compare_poll_list);
931 }
932
933 *list = ret;
934 }
935
936 static void
937 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
938 {
939 if (pmd->core_id != NON_PMD_CORE_ID) {
940 struct rxq_poll *list;
941 size_t n_rxq;
942 uint64_t total_cycles = 0;
943
944 ds_put_format(reply,
945 "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
946 pmd->numa_id, pmd->core_id, (pmd->isolated)
947 ? "true" : "false");
948
949 ovs_mutex_lock(&pmd->port_mutex);
950 sorted_poll_list(pmd, &list, &n_rxq);
951
952 /* Get the total pmd cycles for an interval. */
953 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
954 /* Estimate the cycles to cover all intervals. */
955 total_cycles *= PMD_RXQ_INTERVAL_MAX;
956
957 for (int i = 0; i < n_rxq; i++) {
958 struct dp_netdev_rxq *rxq = list[i].rxq;
959 const char *name = netdev_rxq_get_name(rxq->rx);
960 uint64_t proc_cycles = 0;
961
962 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
963 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
964 }
965 ds_put_format(reply, "\tport: %-16s\tqueue-id: %2d", name,
966 netdev_rxq_get_queue_id(list[i].rxq->rx));
967 ds_put_format(reply, "\tpmd usage: ");
968 if (total_cycles) {
969 ds_put_format(reply, "%2"PRIu64"",
970 proc_cycles * 100 / total_cycles);
971 ds_put_cstr(reply, " %");
972 } else {
973 ds_put_format(reply, "%s", "NOT AVAIL");
974 }
975 ds_put_cstr(reply, "\n");
976 }
977 ovs_mutex_unlock(&pmd->port_mutex);
978 free(list);
979 }
980 }
981
982 static int
983 compare_poll_thread_list(const void *a_, const void *b_)
984 {
985 const struct dp_netdev_pmd_thread *a, *b;
986
987 a = *(struct dp_netdev_pmd_thread **)a_;
988 b = *(struct dp_netdev_pmd_thread **)b_;
989
990 if (a->core_id < b->core_id) {
991 return -1;
992 }
993 if (a->core_id > b->core_id) {
994 return 1;
995 }
996 return 0;
997 }
998
999 /* Creates a sorted list of pmds from the dp->poll_threads cmap.  We can use
1000 * this list as long as we do not enter a quiescent state. */
1001 static void
1002 sorted_poll_thread_list(struct dp_netdev *dp,
1003 struct dp_netdev_pmd_thread ***list,
1004 size_t *n)
1005 {
1006 struct dp_netdev_pmd_thread *pmd;
1007 struct dp_netdev_pmd_thread **pmd_list;
1008 size_t k = 0, n_pmds;
1009
1010 n_pmds = cmap_count(&dp->poll_threads);
1011 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1012
1013 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1014 if (k >= n_pmds) {
1015 break;
1016 }
1017 pmd_list[k++] = pmd;
1018 }
1019
1020 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1021
1022 *list = pmd_list;
1023 *n = k;
1024 }
1025
1026 static void
1027 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1028 const char *argv[], void *aux OVS_UNUSED)
1029 {
1030 struct ds reply = DS_EMPTY_INITIALIZER;
1031 struct dp_netdev *dp = NULL;
1032
1033 ovs_mutex_lock(&dp_netdev_mutex);
1034
1035 if (argc == 2) {
1036 dp = shash_find_data(&dp_netdevs, argv[1]);
1037 } else if (shash_count(&dp_netdevs) == 1) {
1038 /* There's only one datapath */
1039 dp = shash_first(&dp_netdevs)->data;
1040 }
1041
1042 if (!dp) {
1043 ovs_mutex_unlock(&dp_netdev_mutex);
1044 unixctl_command_reply_error(conn,
1045 "please specify an existing datapath");
1046 return;
1047 }
1048
1049 dp_netdev_request_reconfigure(dp);
1050 ovs_mutex_unlock(&dp_netdev_mutex);
1051 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1052 unixctl_command_reply(conn, ds_cstr(&reply));
1053 ds_destroy(&reply);
1054 }
1055
1056 static void
1057 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1058 void *aux)
1059 {
1060 struct ds reply = DS_EMPTY_INITIALIZER;
1061 struct dp_netdev_pmd_thread **pmd_list;
1062 struct dp_netdev *dp = NULL;
1063 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1064 unsigned int core_id;
1065 bool filter_on_pmd = false;
1066 size_t n;
1067
1068 ovs_mutex_lock(&dp_netdev_mutex);
1069
1070 while (argc > 1) {
1071 if (!strcmp(argv[1], "-pmd") && argc >= 3) {
1072 if (str_to_uint(argv[2], 10, &core_id)) {
1073 filter_on_pmd = true;
1074 }
1075 argc -= 2;
1076 argv += 2;
1077 } else {
1078 dp = shash_find_data(&dp_netdevs, argv[1]);
1079 argc -= 1;
1080 argv += 1;
1081 }
1082 }
1083
1084 if (!dp) {
1085 if (shash_count(&dp_netdevs) == 1) {
1086 /* There's only one datapath */
1087 dp = shash_first(&dp_netdevs)->data;
1088 } else {
1089 ovs_mutex_unlock(&dp_netdev_mutex);
1090 unixctl_command_reply_error(conn,
1091 "please specify an existing datapath");
1092 return;
1093 }
1094 }
1095
1096 sorted_poll_thread_list(dp, &pmd_list, &n);
1097 for (size_t i = 0; i < n; i++) {
1098 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1099 if (!pmd) {
1100 break;
1101 }
1102 if (filter_on_pmd && pmd->core_id != core_id) {
1103 continue;
1104 }
1105 if (type == PMD_INFO_SHOW_RXQ) {
1106 pmd_info_show_rxq(&reply, pmd);
1107 } else if (type == PMD_INFO_CLEAR_STATS) {
1108 pmd_perf_stats_clear(&pmd->perf_stats);
1109 } else if (type == PMD_INFO_SHOW_STATS) {
1110 pmd_info_show_stats(&reply, pmd);
1111 }
1112 }
1113 free(pmd_list);
1114
1115 ovs_mutex_unlock(&dp_netdev_mutex);
1116
1117 unixctl_command_reply(conn, ds_cstr(&reply));
1118 ds_destroy(&reply);
1119 }
1120 \f
1121 static int
1122 dpif_netdev_init(void)
1123 {
1124 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1125 clear_aux = PMD_INFO_CLEAR_STATS,
1126 poll_aux = PMD_INFO_SHOW_RXQ;
1127
1128 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1129 0, 3, dpif_netdev_pmd_info,
1130 (void *)&show_aux);
1131 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1132 0, 3, dpif_netdev_pmd_info,
1133 (void *)&clear_aux);
1134 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1135 0, 3, dpif_netdev_pmd_info,
1136 (void *)&poll_aux);
1137 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1138 0, 1, dpif_netdev_pmd_rebalance,
1139 NULL);
1140 return 0;
1141 }
1142
1143 static int
1144 dpif_netdev_enumerate(struct sset *all_dps,
1145 const struct dpif_class *dpif_class)
1146 {
1147 struct shash_node *node;
1148
1149 ovs_mutex_lock(&dp_netdev_mutex);
1150 SHASH_FOR_EACH(node, &dp_netdevs) {
1151 struct dp_netdev *dp = node->data;
1152 if (dpif_class != dp->class) {
1153 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1154 * If the class doesn't match, skip this dpif. */
1155 continue;
1156 }
1157 sset_add(all_dps, node->name);
1158 }
1159 ovs_mutex_unlock(&dp_netdev_mutex);
1160
1161 return 0;
1162 }
1163
1164 static bool
1165 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1166 {
1167 return class != &dpif_netdev_class;
1168 }
1169
1170 static const char *
1171 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1172 {
1173 return strcmp(type, "internal") ? type
1174 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1175 : "tap";
1176 }
1177
1178 static struct dpif *
1179 create_dpif_netdev(struct dp_netdev *dp)
1180 {
1181 uint16_t netflow_id = hash_string(dp->name, 0);
1182 struct dpif_netdev *dpif;
1183
1184 ovs_refcount_ref(&dp->ref_cnt);
1185
1186 dpif = xmalloc(sizeof *dpif);
1187 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1188 dpif->dp = dp;
1189 dpif->last_port_seq = seq_read(dp->port_seq);
1190
1191 return &dpif->dpif;
1192 }
1193
1194 /* Choose an unused, non-zero port number and return it on success.
1195 * Return ODPP_NONE on failure. */
1196 static odp_port_t
1197 choose_port(struct dp_netdev *dp, const char *name)
1198 OVS_REQUIRES(dp->port_mutex)
1199 {
1200 uint32_t port_no;
1201
1202 if (dp->class != &dpif_netdev_class) {
1203 const char *p;
1204 int start_no = 0;
1205
1206 /* If the port name begins with "br", start the number search at
1207 * 100 to make writing tests easier. */
1208 if (!strncmp(name, "br", 2)) {
1209 start_no = 100;
1210 }
1211
1212 /* If the port name contains a number, try to assign that port number.
1213 * This can make writing unit tests easier because port numbers are
1214 * predictable. */
1215 for (p = name; *p != '\0'; p++) {
1216 if (isdigit((unsigned char) *p)) {
1217 port_no = start_no + strtol(p, NULL, 10);
1218 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1219 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1220 return u32_to_odp(port_no);
1221 }
1222 break;
1223 }
1224 }
1225 }
1226
1227 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1228 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1229 return u32_to_odp(port_no);
1230 }
1231 }
1232
1233 return ODPP_NONE;
1234 }
1235
1236 static int
1237 create_dp_netdev(const char *name, const struct dpif_class *class,
1238 struct dp_netdev **dpp)
1239 OVS_REQUIRES(dp_netdev_mutex)
1240 {
1241 struct dp_netdev *dp;
1242 int error;
1243
1244 dp = xzalloc(sizeof *dp);
1245 shash_add(&dp_netdevs, name, dp);
1246
1247 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1248 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1249 ovs_refcount_init(&dp->ref_cnt);
1250 atomic_flag_clear(&dp->destroyed);
1251
1252 ovs_mutex_init(&dp->port_mutex);
1253 hmap_init(&dp->ports);
1254 dp->port_seq = seq_create();
1255 fat_rwlock_init(&dp->upcall_rwlock);
1256
1257 dp->reconfigure_seq = seq_create();
1258 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1259
1260 for (int i = 0; i < N_METER_LOCKS; ++i) {
1261 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1262 }
1263
1264 /* Disable upcalls by default. */
1265 dp_netdev_disable_upcall(dp);
1266 dp->upcall_aux = NULL;
1267 dp->upcall_cb = NULL;
1268
1269 conntrack_init(&dp->conntrack);
1270
1271 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1272 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1273
1274 cmap_init(&dp->poll_threads);
1275
1276 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1277 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1278 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1279
1280 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1281 ovsthread_key_create(&dp->per_pmd_key, NULL);
1282
1283 ovs_mutex_lock(&dp->port_mutex);
1284 /* The non-PMD thread will be created before all other threads and will
1285 * allocate static_tx_qid = 0. */
1286 dp_netdev_set_nonpmd(dp);
1287
1288 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1289 "internal"),
1290 ODPP_LOCAL);
1291 ovs_mutex_unlock(&dp->port_mutex);
1292 if (error) {
1293 dp_netdev_free(dp);
1294 return error;
1295 }
1296
1297 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1298 *dpp = dp;
1299 return 0;
1300 }
1301
1302 static void
1303 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1304 {
1305 seq_change(dp->reconfigure_seq);
1306 }
1307
1308 static bool
1309 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1310 {
1311 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1312 }
1313
1314 static int
1315 dpif_netdev_open(const struct dpif_class *class, const char *name,
1316 bool create, struct dpif **dpifp)
1317 {
1318 struct dp_netdev *dp;
1319 int error;
1320
1321 ovs_mutex_lock(&dp_netdev_mutex);
1322 dp = shash_find_data(&dp_netdevs, name);
1323 if (!dp) {
1324 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1325 } else {
1326 error = (dp->class != class ? EINVAL
1327 : create ? EEXIST
1328 : 0);
1329 }
1330 if (!error) {
1331 *dpifp = create_dpif_netdev(dp);
1332 dp->dpif = *dpifp;
1333 }
1334 ovs_mutex_unlock(&dp_netdev_mutex);
1335
1336 return error;
1337 }
1338
1339 static void
1340 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1341 OVS_NO_THREAD_SAFETY_ANALYSIS
1342 {
1343 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1344 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1345
1346 /* Before freeing a lock we should release it */
1347 fat_rwlock_unlock(&dp->upcall_rwlock);
1348 fat_rwlock_destroy(&dp->upcall_rwlock);
1349 }
1350
1351 static void
1352 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1353 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1354 {
1355 if (dp->meters[meter_id]) {
1356 free(dp->meters[meter_id]);
1357 dp->meters[meter_id] = NULL;
1358 }
1359 }
1360
1361 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1362 * through the 'dp_netdevs' shash while freeing 'dp'. */
1363 static void
1364 dp_netdev_free(struct dp_netdev *dp)
1365 OVS_REQUIRES(dp_netdev_mutex)
1366 {
1367 struct dp_netdev_port *port, *next;
1368
1369 shash_find_and_delete(&dp_netdevs, dp->name);
1370
1371 ovs_mutex_lock(&dp->port_mutex);
1372 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1373 do_del_port(dp, port);
1374 }
1375 ovs_mutex_unlock(&dp->port_mutex);
1376
1377 dp_netdev_destroy_all_pmds(dp, true);
1378 cmap_destroy(&dp->poll_threads);
1379
1380 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1381 id_pool_destroy(dp->tx_qid_pool);
1382
1383 ovs_mutex_destroy(&dp->non_pmd_mutex);
1384 ovsthread_key_delete(dp->per_pmd_key);
1385
1386 conntrack_destroy(&dp->conntrack);
1387
1388
1389 seq_destroy(dp->reconfigure_seq);
1390
1391 seq_destroy(dp->port_seq);
1392 hmap_destroy(&dp->ports);
1393 ovs_mutex_destroy(&dp->port_mutex);
1394
1395 /* Upcalls must be disabled at this point */
1396 dp_netdev_destroy_upcall_lock(dp);
1397
1398 int i;
1399
1400 for (i = 0; i < MAX_METERS; ++i) {
1401 meter_lock(dp, i);
1402 dp_delete_meter(dp, i);
1403 meter_unlock(dp, i);
1404 }
1405 for (i = 0; i < N_METER_LOCKS; ++i) {
1406 ovs_mutex_destroy(&dp->meter_locks[i]);
1407 }
1408
1409 free(dp->pmd_cmask);
1410 free(CONST_CAST(char *, dp->name));
1411 free(dp);
1412 }
1413
1414 static void
1415 dp_netdev_unref(struct dp_netdev *dp)
1416 {
1417 if (dp) {
1418 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1419 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1420 ovs_mutex_lock(&dp_netdev_mutex);
1421 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1422 dp_netdev_free(dp);
1423 }
1424 ovs_mutex_unlock(&dp_netdev_mutex);
1425 }
1426 }
1427
1428 static void
1429 dpif_netdev_close(struct dpif *dpif)
1430 {
1431 struct dp_netdev *dp = get_dp_netdev(dpif);
1432
1433 dp_netdev_unref(dp);
1434 free(dpif);
1435 }
1436
1437 static int
1438 dpif_netdev_destroy(struct dpif *dpif)
1439 {
1440 struct dp_netdev *dp = get_dp_netdev(dpif);
1441
1442 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1443 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1444 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1445 OVS_NOT_REACHED();
1446 }
1447 }
1448
1449 return 0;
1450 }
1451
1452 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1453 * load/store semantics. While the increment is not atomic, the load and
1454 * store operations are, making it impossible to read inconsistent values.
1455 *
1456 * This is used to update thread local stats counters. */
1457 static void
1458 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1459 {
1460 unsigned long long tmp;
1461
1462 atomic_read_relaxed(var, &tmp);
1463 tmp += n;
1464 atomic_store_relaxed(var, tmp);
1465 }
1466
1467 static int
1468 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1469 {
1470 struct dp_netdev *dp = get_dp_netdev(dpif);
1471 struct dp_netdev_pmd_thread *pmd;
1472 uint64_t pmd_stats[PMD_N_STATS];
1473
1474 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1475 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1476 stats->n_flows += cmap_count(&pmd->flow_table);
1477 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1478 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1479 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1480 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1481 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1482 }
1483 stats->n_masks = UINT32_MAX;
1484 stats->n_mask_hit = UINT64_MAX;
1485
1486 return 0;
1487 }
1488
1489 static void
1490 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1491 {
1492 if (pmd->core_id == NON_PMD_CORE_ID) {
1493 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1494 ovs_mutex_lock(&pmd->port_mutex);
1495 pmd_load_cached_ports(pmd);
1496 ovs_mutex_unlock(&pmd->port_mutex);
1497 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1498 return;
1499 }
1500
1501 ovs_mutex_lock(&pmd->cond_mutex);
1502 seq_change(pmd->reload_seq);
1503 atomic_store_relaxed(&pmd->reload, true);
1504 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1505 ovs_mutex_unlock(&pmd->cond_mutex);
1506 }
1507
1508 static uint32_t
1509 hash_port_no(odp_port_t port_no)
1510 {
1511 return hash_int(odp_to_u32(port_no), 0);
1512 }
1513
1514 static int
1515 port_create(const char *devname, const char *type,
1516 odp_port_t port_no, struct dp_netdev_port **portp)
1517 {
1518 struct netdev_saved_flags *sf;
1519 struct dp_netdev_port *port;
1520 enum netdev_flags flags;
1521 struct netdev *netdev;
1522 int error;
1523
1524 *portp = NULL;
1525
1526 /* Open and validate network device. */
1527 error = netdev_open(devname, type, &netdev);
1528 if (error) {
1529 return error;
1530 }
1531 /* XXX reject non-Ethernet devices */
1532
1533 netdev_get_flags(netdev, &flags);
1534 if (flags & NETDEV_LOOPBACK) {
1535 VLOG_ERR("%s: cannot add a loopback device", devname);
1536 error = EINVAL;
1537 goto out;
1538 }
1539
1540 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1541 if (error) {
1542 VLOG_ERR("%s: cannot set promisc flag", devname);
1543 goto out;
1544 }
1545
1546 port = xzalloc(sizeof *port);
1547 port->port_no = port_no;
1548 port->netdev = netdev;
1549 port->type = xstrdup(type);
1550 port->sf = sf;
1551 port->need_reconfigure = true;
1552 ovs_mutex_init(&port->txq_used_mutex);
1553
1554 *portp = port;
1555
1556 return 0;
1557
1558 out:
1559 netdev_close(netdev);
1560 return error;
1561 }
1562
1563 static int
1564 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1565 odp_port_t port_no)
1566 OVS_REQUIRES(dp->port_mutex)
1567 {
1568 struct dp_netdev_port *port;
1569 int error;
1570
1571 /* Reject devices already in 'dp'. */
1572 if (!get_port_by_name(dp, devname, &port)) {
1573 return EEXIST;
1574 }
1575
1576 error = port_create(devname, type, port_no, &port);
1577 if (error) {
1578 return error;
1579 }
1580
1581 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1582 seq_change(dp->port_seq);
1583
1584 reconfigure_datapath(dp);
1585
1586 return 0;
1587 }
1588
1589 static int
1590 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1591 odp_port_t *port_nop)
1592 {
1593 struct dp_netdev *dp = get_dp_netdev(dpif);
1594 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1595 const char *dpif_port;
1596 odp_port_t port_no;
1597 int error;
1598
1599 ovs_mutex_lock(&dp->port_mutex);
1600 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1601 if (*port_nop != ODPP_NONE) {
1602 port_no = *port_nop;
1603 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1604 } else {
1605 port_no = choose_port(dp, dpif_port);
1606 error = port_no == ODPP_NONE ? EFBIG : 0;
1607 }
1608 if (!error) {
1609 *port_nop = port_no;
1610 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1611 }
1612 ovs_mutex_unlock(&dp->port_mutex);
1613
1614 return error;
1615 }
1616
1617 static int
1618 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1619 {
1620 struct dp_netdev *dp = get_dp_netdev(dpif);
1621 int error;
1622
1623 ovs_mutex_lock(&dp->port_mutex);
1624 if (port_no == ODPP_LOCAL) {
1625 error = EINVAL;
1626 } else {
1627 struct dp_netdev_port *port;
1628
1629 error = get_port_by_number(dp, port_no, &port);
1630 if (!error) {
1631 do_del_port(dp, port);
1632 }
1633 }
1634 ovs_mutex_unlock(&dp->port_mutex);
1635
1636 return error;
1637 }
1638
1639 static bool
1640 is_valid_port_number(odp_port_t port_no)
1641 {
1642 return port_no != ODPP_NONE;
1643 }
1644
1645 static struct dp_netdev_port *
1646 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1647 OVS_REQUIRES(dp->port_mutex)
1648 {
1649 struct dp_netdev_port *port;
1650
1651 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1652 if (port->port_no == port_no) {
1653 return port;
1654 }
1655 }
1656 return NULL;
1657 }
1658
1659 static int
1660 get_port_by_number(struct dp_netdev *dp,
1661 odp_port_t port_no, struct dp_netdev_port **portp)
1662 OVS_REQUIRES(dp->port_mutex)
1663 {
1664 if (!is_valid_port_number(port_no)) {
1665 *portp = NULL;
1666 return EINVAL;
1667 } else {
1668 *portp = dp_netdev_lookup_port(dp, port_no);
1669 return *portp ? 0 : ENODEV;
1670 }
1671 }
1672
1673 static void
1674 port_destroy(struct dp_netdev_port *port)
1675 {
1676 if (!port) {
1677 return;
1678 }
1679
1680 netdev_close(port->netdev);
1681 netdev_restore_flags(port->sf);
1682
1683 for (unsigned i = 0; i < port->n_rxq; i++) {
1684 netdev_rxq_close(port->rxqs[i].rx);
1685 }
1686 ovs_mutex_destroy(&port->txq_used_mutex);
1687 free(port->rxq_affinity_list);
1688 free(port->txq_used);
1689 free(port->rxqs);
1690 free(port->type);
1691 free(port);
1692 }
1693
1694 static int
1695 get_port_by_name(struct dp_netdev *dp,
1696 const char *devname, struct dp_netdev_port **portp)
1697 OVS_REQUIRES(dp->port_mutex)
1698 {
1699 struct dp_netdev_port *port;
1700
1701 HMAP_FOR_EACH (port, node, &dp->ports) {
1702 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1703 *portp = port;
1704 return 0;
1705 }
1706 }
1707
1708 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
1709 * non-existent port. */
1710 return ENODEV;
1711 }
1712
1713 /* Returns 'true' if there is a port with a pmd netdev. */
1714 static bool
1715 has_pmd_port(struct dp_netdev *dp)
1716 OVS_REQUIRES(dp->port_mutex)
1717 {
1718 struct dp_netdev_port *port;
1719
1720 HMAP_FOR_EACH (port, node, &dp->ports) {
1721 if (netdev_is_pmd(port->netdev)) {
1722 return true;
1723 }
1724 }
1725
1726 return false;
1727 }
1728
1729 static void
1730 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1731 OVS_REQUIRES(dp->port_mutex)
1732 {
1733 hmap_remove(&dp->ports, &port->node);
1734 seq_change(dp->port_seq);
1735
1736 reconfigure_datapath(dp);
1737
1738 port_destroy(port);
1739 }
1740
1741 static void
1742 answer_port_query(const struct dp_netdev_port *port,
1743 struct dpif_port *dpif_port)
1744 {
1745 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1746 dpif_port->type = xstrdup(port->type);
1747 dpif_port->port_no = port->port_no;
1748 }
1749
1750 static int
1751 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1752 struct dpif_port *dpif_port)
1753 {
1754 struct dp_netdev *dp = get_dp_netdev(dpif);
1755 struct dp_netdev_port *port;
1756 int error;
1757
1758 ovs_mutex_lock(&dp->port_mutex);
1759 error = get_port_by_number(dp, port_no, &port);
1760 if (!error && dpif_port) {
1761 answer_port_query(port, dpif_port);
1762 }
1763 ovs_mutex_unlock(&dp->port_mutex);
1764
1765 return error;
1766 }
1767
1768 static int
1769 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1770 struct dpif_port *dpif_port)
1771 {
1772 struct dp_netdev *dp = get_dp_netdev(dpif);
1773 struct dp_netdev_port *port;
1774 int error;
1775
1776 ovs_mutex_lock(&dp->port_mutex);
1777 error = get_port_by_name(dp, devname, &port);
1778 if (!error && dpif_port) {
1779 answer_port_query(port, dpif_port);
1780 }
1781 ovs_mutex_unlock(&dp->port_mutex);
1782
1783 return error;
1784 }
1785
1786 static void
1787 dp_netdev_flow_free(struct dp_netdev_flow *flow)
1788 {
1789 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1790 free(flow);
1791 }
1792
1793 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1794 {
1795 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1796 ovsrcu_postpone(dp_netdev_flow_free, flow);
1797 }
1798 }
1799
1800 static uint32_t
1801 dp_netdev_flow_hash(const ovs_u128 *ufid)
1802 {
1803 return ufid->u32[0];
1804 }
1805
1806 static inline struct dpcls *
1807 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1808 odp_port_t in_port)
1809 {
1810 struct dpcls *cls;
1811 uint32_t hash = hash_port_no(in_port);
1812 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1813 if (cls->in_port == in_port) {
1814 /* Port classifier exists already */
1815 return cls;
1816 }
1817 }
1818 return NULL;
1819 }
1820
1821 static inline struct dpcls *
1822 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1823 odp_port_t in_port)
1824 OVS_REQUIRES(pmd->flow_mutex)
1825 {
1826 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1827 uint32_t hash = hash_port_no(in_port);
1828
1829 if (!cls) {
1830 /* Create new classifier for in_port */
1831 cls = xmalloc(sizeof(*cls));
1832 dpcls_init(cls);
1833 cls->in_port = in_port;
1834 cmap_insert(&pmd->classifiers, &cls->node, hash);
1835 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1836 }
1837 return cls;
1838 }
1839
1840 static void
1841 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1842 struct dp_netdev_flow *flow)
1843 OVS_REQUIRES(pmd->flow_mutex)
1844 {
1845 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
1846 struct dpcls *cls;
1847 odp_port_t in_port = flow->flow.in_port.odp_port;
1848
1849 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1850 ovs_assert(cls != NULL);
1851 dpcls_remove(cls, &flow->cr);
1852 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
1853 flow->dead = true;
1854
1855 dp_netdev_flow_unref(flow);
1856 }
1857
1858 static void
1859 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
1860 {
1861 struct dp_netdev_flow *netdev_flow;
1862
1863 ovs_mutex_lock(&pmd->flow_mutex);
1864 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1865 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
1866 }
1867 ovs_mutex_unlock(&pmd->flow_mutex);
1868 }
1869
1870 static int
1871 dpif_netdev_flow_flush(struct dpif *dpif)
1872 {
1873 struct dp_netdev *dp = get_dp_netdev(dpif);
1874 struct dp_netdev_pmd_thread *pmd;
1875
1876 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1877 dp_netdev_pmd_flow_flush(pmd);
1878 }
1879
1880 return 0;
1881 }
1882
1883 struct dp_netdev_port_state {
1884 struct hmap_position position;
1885 char *name;
1886 };
1887
1888 static int
1889 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1890 {
1891 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1892 return 0;
1893 }
1894
1895 static int
1896 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
1897 struct dpif_port *dpif_port)
1898 {
1899 struct dp_netdev_port_state *state = state_;
1900 struct dp_netdev *dp = get_dp_netdev(dpif);
1901 struct hmap_node *node;
1902 int retval;
1903
1904 ovs_mutex_lock(&dp->port_mutex);
1905 node = hmap_at_position(&dp->ports, &state->position);
1906 if (node) {
1907 struct dp_netdev_port *port;
1908
1909 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1910
1911 free(state->name);
1912 state->name = xstrdup(netdev_get_name(port->netdev));
1913 dpif_port->name = state->name;
1914 dpif_port->type = port->type;
1915 dpif_port->port_no = port->port_no;
1916
1917 retval = 0;
1918 } else {
1919 retval = EOF;
1920 }
1921 ovs_mutex_unlock(&dp->port_mutex);
1922
1923 return retval;
1924 }
1925
1926 static int
1927 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
1928 {
1929 struct dp_netdev_port_state *state = state_;
1930 free(state->name);
1931 free(state);
1932 return 0;
1933 }
1934
1935 static int
1936 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
1937 {
1938 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1939 uint64_t new_port_seq;
1940 int error;
1941
1942 new_port_seq = seq_read(dpif->dp->port_seq);
1943 if (dpif->last_port_seq != new_port_seq) {
1944 dpif->last_port_seq = new_port_seq;
1945 error = ENOBUFS;
1946 } else {
1947 error = EAGAIN;
1948 }
1949
1950 return error;
1951 }
1952
1953 static void
1954 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1955 {
1956 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1957
1958 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
1959 }
1960
1961 static struct dp_netdev_flow *
1962 dp_netdev_flow_cast(const struct dpcls_rule *cr)
1963 {
1964 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
1965 }
1966
1967 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1968 {
1969 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1970 }
1971
1972 /* netdev_flow_key utilities.
1973 *
1974 * netdev_flow_key is basically a miniflow. We use these functions
1975 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1976 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1977 *
1978 * - Since we are dealing exclusively with miniflows created by
1979 * miniflow_extract(), if the map is different the miniflow is different.
1980 * Therefore we can be faster by comparing the map and the miniflow in a
1981 * single memcmp().
1982 * - These functions can be inlined by the compiler. */
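/* For instance, two keys extracted from packets that differ only in an IPv4
* address have identical maps but different packed values, while keys with
* different sets of present fields already differ in the flowmap bits at the
* start of 'mf'; either way the single memcmp() over 'len' bytes starting at
* the 'mf' member detects the difference. */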
1983
1984 /* Given the number of bits set in miniflow's maps, returns the size of the
1985 * 'netdev_flow_key.mf' */
1986 static inline size_t
1987 netdev_flow_key_size(size_t flow_u64s)
1988 {
1989 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
1990 }
1991
1992 static inline bool
1993 netdev_flow_key_equal(const struct netdev_flow_key *a,
1994 const struct netdev_flow_key *b)
1995 {
1996 /* 'b->len' may not be set yet. */
1997 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
1998 }
1999
2000 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2001 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2002 * generated by miniflow_extract. */
2003 static inline bool
2004 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2005 const struct miniflow *mf)
2006 {
2007 return !memcmp(&key->mf, mf, key->len);
2008 }
2009
2010 static inline void
2011 netdev_flow_key_clone(struct netdev_flow_key *dst,
2012 const struct netdev_flow_key *src)
2013 {
2014 memcpy(dst, src,
2015 offsetof(struct netdev_flow_key, mf) + src->len);
2016 }
2017
2018 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2019 static inline void
2020 netdev_flow_mask_init(struct netdev_flow_key *mask,
2021 const struct match *match)
2022 {
2023 uint64_t *dst = miniflow_values(&mask->mf);
2024 struct flowmap fmap;
2025 uint32_t hash = 0;
2026 size_t idx;
2027
2028 /* Only check masks that make sense for the flow. */
2029 flow_wc_map(&match->flow, &fmap);
2030 flowmap_init(&mask->mf.map);
2031
2032 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2033 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2034
2035 if (mask_u64) {
2036 flowmap_set(&mask->mf.map, idx, 1);
2037 *dst++ = mask_u64;
2038 hash = hash_add64(hash, mask_u64);
2039 }
2040 }
2041
2042 map_t map;
2043
2044 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2045 hash = hash_add64(hash, map);
2046 }
2047
2048 size_t n = dst - miniflow_get_values(&mask->mf);
2049
2050 mask->hash = hash_finish(hash, n * 8);
2051 mask->len = netdev_flow_key_size(n);
2052 }
2053
2054 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2055 static inline void
2056 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2057 const struct flow *flow,
2058 const struct netdev_flow_key *mask)
2059 {
2060 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2061 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2062 uint32_t hash = 0;
2063 uint64_t value;
2064
2065 dst->len = mask->len;
2066 dst->mf = mask->mf; /* Copy maps. */
2067
2068 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2069 *dst_u64 = value & *mask_u64++;
2070 hash = hash_add64(hash, *dst_u64++);
2071 }
2072 dst->hash = hash_finish(hash,
2073 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2074 }
2075
2076 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
2077 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
2078 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
2079
2080 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2081 * 'mask'. */
2082 static inline uint32_t
2083 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2084 const struct netdev_flow_key *mask)
2085 {
2086 const uint64_t *p = miniflow_get_values(&mask->mf);
2087 uint32_t hash = 0;
2088 uint64_t value;
2089
2090 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2091 hash = hash_add64(hash, value & *p++);
2092 }
2093
2094 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2095 }
2096
2097 static inline bool
2098 emc_entry_alive(struct emc_entry *ce)
2099 {
2100 return ce->flow && !ce->flow->dead;
2101 }
2102
2103 static void
2104 emc_clear_entry(struct emc_entry *ce)
2105 {
2106 if (ce->flow) {
2107 dp_netdev_flow_unref(ce->flow);
2108 ce->flow = NULL;
2109 }
2110 }
2111
2112 static inline void
2113 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2114 const struct netdev_flow_key *key)
2115 {
2116 if (ce->flow != flow) {
2117 if (ce->flow) {
2118 dp_netdev_flow_unref(ce->flow);
2119 }
2120
2121 if (dp_netdev_flow_ref(flow)) {
2122 ce->flow = flow;
2123 } else {
2124 ce->flow = NULL;
2125 }
2126 }
2127 if (key) {
2128 netdev_flow_key_clone(&ce->key, key);
2129 }
2130 }
2131
2132 static inline void
2133 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2134 struct dp_netdev_flow *flow)
2135 {
2136 struct emc_entry *to_be_replaced = NULL;
2137 struct emc_entry *current_entry;
2138
2139 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2140 if (netdev_flow_key_equal(&current_entry->key, key)) {
2141 /* We found the entry with the 'mf' miniflow */
2142 emc_change_entry(current_entry, flow, NULL);
2143 return;
2144 }
2145
2146 /* Replacement policy: put the flow in an empty (not alive) entry, or
2147 * in the first entry where it can be */
2148 if (!to_be_replaced
2149 || (emc_entry_alive(to_be_replaced)
2150 && !emc_entry_alive(current_entry))
2151 || current_entry->key.hash < to_be_replaced->key.hash) {
2152 to_be_replaced = current_entry;
2153 }
2154 }
2155 /* We didn't find the miniflow in the cache.
2156 * The 'to_be_replaced' entry is where the new flow will be stored */
2157
2158 emc_change_entry(to_be_replaced, flow, key);
2159 }
2160
2161 static inline void
2162 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2163 const struct netdev_flow_key *key,
2164 struct dp_netdev_flow *flow)
2165 {
2166 /* Insert an entry into the EMC based on probability value 'min'. By
2167 * default the value is UINT32_MAX / 100, which yields an insertion
2168 * probability of 1/100, i.e. 1%. */
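/* Illustrative numbers (derived from dpif_netdev_set_config() below): with
* other_config:emc-insert-inv-prob=20, 'min' is UINT32_MAX / 20, so the
* check 'random_uint32() <= min' passes for roughly 1 in 20 packets (~5%);
* with emc-insert-inv-prob=1 every packet is inserted, and with 0 the EMC
* insertion is disabled entirely. */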
2169
2170 uint32_t min;
2171 atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
2172
2173 if (min && random_uint32() <= min) {
2174 emc_insert(&pmd->flow_cache, key, flow);
2175 }
2176 }
2177
2178 static inline struct dp_netdev_flow *
2179 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2180 {
2181 struct emc_entry *current_entry;
2182
2183 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2184 if (current_entry->key.hash == key->hash
2185 && emc_entry_alive(current_entry)
2186 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2187
2188 /* We found the entry with the 'key->mf' miniflow */
2189 return current_entry->flow;
2190 }
2191 }
2192
2193 return NULL;
2194 }
2195
2196 static struct dp_netdev_flow *
2197 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2198 const struct netdev_flow_key *key,
2199 int *lookup_num_p)
2200 {
2201 struct dpcls *cls;
2202 struct dpcls_rule *rule;
2203 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, in_port));
2204 struct dp_netdev_flow *netdev_flow = NULL;
2205
2206 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2207 if (OVS_LIKELY(cls)) {
2208 dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
2209 netdev_flow = dp_netdev_flow_cast(rule);
2210 }
2211 return netdev_flow;
2212 }
2213
2214 static struct dp_netdev_flow *
2215 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2216 const ovs_u128 *ufidp, const struct nlattr *key,
2217 size_t key_len)
2218 {
2219 struct dp_netdev_flow *netdev_flow;
2220 struct flow flow;
2221 ovs_u128 ufid;
2222
2223 /* If a UFID is not provided, determine one based on the key. */
2224 if (!ufidp && key && key_len
2225 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2226 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2227 ufidp = &ufid;
2228 }
2229
2230 if (ufidp) {
2231 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2232 &pmd->flow_table) {
2233 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2234 return netdev_flow;
2235 }
2236 }
2237 }
2238
2239 return NULL;
2240 }
2241
2242 static void
2243 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
2244 struct dpif_flow_stats *stats)
2245 {
2246 struct dp_netdev_flow *netdev_flow;
2247 unsigned long long n;
2248 long long used;
2249 uint16_t flags;
2250
2251 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2252
2253 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2254 stats->n_packets = n;
2255 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2256 stats->n_bytes = n;
2257 atomic_read_relaxed(&netdev_flow->stats.used, &used);
2258 stats->used = used;
2259 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2260 stats->tcp_flags = flags;
2261 }
2262
2263 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2264 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2265 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2266 * protect them. */
2267 static void
2268 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
2269 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2270 struct dpif_flow *flow, bool terse)
2271 {
2272 if (terse) {
2273 memset(flow, 0, sizeof *flow);
2274 } else {
2275 struct flow_wildcards wc;
2276 struct dp_netdev_actions *actions;
2277 size_t offset;
2278 struct odp_flow_key_parms odp_parms = {
2279 .flow = &netdev_flow->flow,
2280 .mask = &wc.masks,
2281 .support = dp_netdev_support,
2282 };
2283
2284 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2285 /* in_port is exact-matched, but we have left it out of the mask for
2286 * optimization reasons. Add in_port back to the mask. */
2287 wc.masks.in_port.odp_port = ODPP_NONE;
2288
2289 /* Key */
2290 offset = key_buf->size;
2291 flow->key = ofpbuf_tail(key_buf);
2292 odp_flow_key_from_flow(&odp_parms, key_buf);
2293 flow->key_len = key_buf->size - offset;
2294
2295 /* Mask */
2296 offset = mask_buf->size;
2297 flow->mask = ofpbuf_tail(mask_buf);
2298 odp_parms.key_buf = key_buf;
2299 odp_flow_key_from_mask(&odp_parms, mask_buf);
2300 flow->mask_len = mask_buf->size - offset;
2301
2302 /* Actions */
2303 actions = dp_netdev_flow_get_actions(netdev_flow);
2304 flow->actions = actions->actions;
2305 flow->actions_len = actions->size;
2306 }
2307
2308 flow->ufid = netdev_flow->ufid;
2309 flow->ufid_present = true;
2310 flow->pmd_id = netdev_flow->pmd_id;
2311 get_dpif_flow_stats(netdev_flow, &flow->stats);
2312 }
2313
2314 static int
2315 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2316 const struct nlattr *mask_key,
2317 uint32_t mask_key_len, const struct flow *flow,
2318 struct flow_wildcards *wc, bool probe)
2319 {
2320 enum odp_key_fitness fitness;
2321
2322 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
2323 if (fitness) {
2324 if (!probe) {
2325 /* This should not happen: it indicates that
2326 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2327 * disagree on the acceptable form of a mask. Log the problem
2328 * as an error, with enough details to enable debugging. */
2329 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2330
2331 if (!VLOG_DROP_ERR(&rl)) {
2332 struct ds s;
2333
2334 ds_init(&s);
2335 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2336 true);
2337 VLOG_ERR("internal error parsing flow mask %s (%s)",
2338 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2339 ds_destroy(&s);
2340 }
2341 }
2342
2343 return EINVAL;
2344 }
2345
2346 return 0;
2347 }
2348
2349 static int
2350 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2351 struct flow *flow, bool probe)
2352 {
2353 if (odp_flow_key_to_flow(key, key_len, flow)) {
2354 if (!probe) {
2355 /* This should not happen: it indicates that
2356 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
2357 * the acceptable form of a flow. Log the problem as an error,
2358 * with enough details to enable debugging. */
2359 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2360
2361 if (!VLOG_DROP_ERR(&rl)) {
2362 struct ds s;
2363
2364 ds_init(&s);
2365 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2366 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2367 ds_destroy(&s);
2368 }
2369 }
2370
2371 return EINVAL;
2372 }
2373
2374 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2375 return EINVAL;
2376 }
2377
2378 return 0;
2379 }
2380
2381 static int
2382 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2383 {
2384 struct dp_netdev *dp = get_dp_netdev(dpif);
2385 struct dp_netdev_flow *netdev_flow;
2386 struct dp_netdev_pmd_thread *pmd;
2387 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2388 struct hmapx_node *node;
2389 int error = EINVAL;
2390
2391 if (get->pmd_id == PMD_ID_NULL) {
2392 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2393 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2394 dp_netdev_pmd_unref(pmd);
2395 }
2396 }
2397 } else {
2398 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2399 if (!pmd) {
2400 goto out;
2401 }
2402 hmapx_add(&to_find, pmd);
2403 }
2404
2405 if (!hmapx_count(&to_find)) {
2406 goto out;
2407 }
2408
2409 HMAPX_FOR_EACH (node, &to_find) {
2410 pmd = (struct dp_netdev_pmd_thread *) node->data;
2411 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2412 get->key_len);
2413 if (netdev_flow) {
2414 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2415 get->flow, false);
2416 error = 0;
2417 break;
2418 } else {
2419 error = ENOENT;
2420 }
2421 }
2422
2423 HMAPX_FOR_EACH (node, &to_find) {
2424 pmd = (struct dp_netdev_pmd_thread *) node->data;
2425 dp_netdev_pmd_unref(pmd);
2426 }
2427 out:
2428 hmapx_destroy(&to_find);
2429 return error;
2430 }
2431
2432 static struct dp_netdev_flow *
2433 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2434 struct match *match, const ovs_u128 *ufid,
2435 const struct nlattr *actions, size_t actions_len)
2436 OVS_REQUIRES(pmd->flow_mutex)
2437 {
2438 struct dp_netdev_flow *flow;
2439 struct netdev_flow_key mask;
2440 struct dpcls *cls;
2441
2442 /* Make sure in_port is exact-matched before we read it. */
2443 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
2444 odp_port_t in_port = match->flow.in_port.odp_port;
2445
2446 /* As we select the dpcls based on the port number, each netdev flow
2447 * belonging to the same dpcls will have the same odp_port value.
2448 * For performance reasons we wildcard odp_port here in the mask. In the
2449 * typical case dp_hash is also wildcarded, and the resulting 8-byte
2450 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
2451 * will not be part of the subtable mask.
2452 * This will speed up the hash computation during dpcls_lookup() because
2453 * there is one less call to hash_add64() in this case. */
2454 match->wc.masks.in_port.odp_port = 0;
2455 netdev_flow_mask_init(&mask, match);
2456 match->wc.masks.in_port.odp_port = ODPP_NONE;
2457
2458 /* Make sure wc does not have metadata. */
2459 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2460 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2461
2462 /* Do not allocate extra space. */
2463 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2464 memset(&flow->stats, 0, sizeof flow->stats);
2465 flow->dead = false;
2466 flow->batch = NULL;
2467 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2468 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2469 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2470 ovs_refcount_init(&flow->ref_cnt);
2471 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2472
2473 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2474
2475 /* Select the dpcls for in_port. Relies on in_port being exact-matched. */
2476 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
2477 dpcls_insert(cls, &flow->cr, &mask);
2478
2479 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2480 dp_netdev_flow_hash(&flow->ufid));
2481
2482 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
2483 struct ds ds = DS_EMPTY_INITIALIZER;
2484 struct ofpbuf key_buf, mask_buf;
2485 struct odp_flow_key_parms odp_parms = {
2486 .flow = &match->flow,
2487 .mask = &match->wc.masks,
2488 .support = dp_netdev_support,
2489 };
2490
2491 ofpbuf_init(&key_buf, 0);
2492 ofpbuf_init(&mask_buf, 0);
2493
2494 odp_flow_key_from_flow(&odp_parms, &key_buf);
2495 odp_parms.key_buf = &key_buf;
2496 odp_flow_key_from_mask(&odp_parms, &mask_buf);
2497
2498 ds_put_cstr(&ds, "flow_add: ");
2499 odp_format_ufid(ufid, &ds);
2500 ds_put_cstr(&ds, " ");
2501 odp_flow_format(key_buf.data, key_buf.size,
2502 mask_buf.data, mask_buf.size,
2503 NULL, &ds, false);
2504 ds_put_cstr(&ds, ", actions:");
2505 format_odp_actions(&ds, actions, actions_len, NULL);
2506
2507 VLOG_DBG("%s", ds_cstr(&ds));
2508
2509 ofpbuf_uninit(&key_buf);
2510 ofpbuf_uninit(&mask_buf);
2511
2512 /* Add a printout of the actual match installed. */
2513 struct match m;
2514 ds_clear(&ds);
2515 ds_put_cstr(&ds, "flow match: ");
2516 miniflow_expand(&flow->cr.flow.mf, &m.flow);
2517 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
2518 memset(&m.tun_md, 0, sizeof m.tun_md);
2519 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
2520
2521 VLOG_DBG("%s", ds_cstr(&ds));
2522
2523 ds_destroy(&ds);
2524 }
2525
2526 return flow;
2527 }
2528
2529 static int
2530 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
2531 struct netdev_flow_key *key,
2532 struct match *match,
2533 ovs_u128 *ufid,
2534 const struct dpif_flow_put *put,
2535 struct dpif_flow_stats *stats)
2536 {
2537 struct dp_netdev_flow *netdev_flow;
2538 int error = 0;
2539
2540 if (stats) {
2541 memset(stats, 0, sizeof *stats);
2542 }
2543
2544 ovs_mutex_lock(&pmd->flow_mutex);
2545 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
2546 if (!netdev_flow) {
2547 if (put->flags & DPIF_FP_CREATE) {
2548 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2549 dp_netdev_flow_add(pmd, match, ufid, put->actions,
2550 put->actions_len);
2551 error = 0;
2552 } else {
2553 error = EFBIG;
2554 }
2555 } else {
2556 error = ENOENT;
2557 }
2558 } else {
2559 if (put->flags & DPIF_FP_MODIFY) {
2560 struct dp_netdev_actions *new_actions;
2561 struct dp_netdev_actions *old_actions;
2562
2563 new_actions = dp_netdev_actions_create(put->actions,
2564 put->actions_len);
2565
2566 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2567 ovsrcu_set(&netdev_flow->actions, new_actions);
2568
2569 if (stats) {
2570 get_dpif_flow_stats(netdev_flow, stats);
2571 }
2572 if (put->flags & DPIF_FP_ZERO_STATS) {
2573 /* XXX: The userspace datapath uses thread-local statistics
2574 * (for flows), which should be updated only by the owning
2575 * thread. Since we cannot write to the stats memory here,
2576 * we choose not to support this flag. Please note:
2577 * - This feature is currently used only by dpctl commands with
2578 * the --clear option.
2579 * - Should the need arise, this operation can be implemented
2580 * by keeping a base value (to be updated here) for each
2581 * counter and subtracting it before outputting the stats. */
2582 error = EOPNOTSUPP;
2583 }
2584
2585 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2586 } else if (put->flags & DPIF_FP_CREATE) {
2587 error = EEXIST;
2588 } else {
2589 /* Overlapping flow. */
2590 error = EINVAL;
2591 }
2592 }
2593 ovs_mutex_unlock(&pmd->flow_mutex);
2594 return error;
2595 }
2596
2597 static int
2598 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2599 {
2600 struct dp_netdev *dp = get_dp_netdev(dpif);
2601 struct netdev_flow_key key, mask;
2602 struct dp_netdev_pmd_thread *pmd;
2603 struct match match;
2604 ovs_u128 ufid;
2605 int error;
2606 bool probe = put->flags & DPIF_FP_PROBE;
2607
2608 if (put->stats) {
2609 memset(put->stats, 0, sizeof *put->stats);
2610 }
2611 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
2612 probe);
2613 if (error) {
2614 return error;
2615 }
2616 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2617 put->mask, put->mask_len,
2618 &match.flow, &match.wc, probe);
2619 if (error) {
2620 return error;
2621 }
2622
2623 if (put->ufid) {
2624 ufid = *put->ufid;
2625 } else {
2626 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2627 }
2628
2629 /* Must produce a netdev_flow_key for lookup.
2630 * Use the same method as employed to create the key when adding
2631 * the flow to the dpcls to make sure they match. */
2632 netdev_flow_mask_init(&mask, &match);
2633 netdev_flow_key_init_masked(&key, &match.flow, &mask);
2634
2635 if (put->pmd_id == PMD_ID_NULL) {
2636 if (cmap_count(&dp->poll_threads) == 0) {
2637 return EINVAL;
2638 }
2639 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2640 struct dpif_flow_stats pmd_stats;
2641 int pmd_error;
2642
2643 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
2644 &pmd_stats);
2645 if (pmd_error) {
2646 error = pmd_error;
2647 } else if (put->stats) {
2648 put->stats->n_packets += pmd_stats.n_packets;
2649 put->stats->n_bytes += pmd_stats.n_bytes;
2650 put->stats->used = MAX(put->stats->used, pmd_stats.used);
2651 put->stats->tcp_flags |= pmd_stats.tcp_flags;
2652 }
2653 }
2654 } else {
2655 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
2656 if (!pmd) {
2657 return EINVAL;
2658 }
2659 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
2660 dp_netdev_pmd_unref(pmd);
2661 }
2662
2663 return error;
2664 }
2665
2666 static int
2667 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
2668 struct dpif_flow_stats *stats,
2669 const struct dpif_flow_del *del)
2670 {
2671 struct dp_netdev_flow *netdev_flow;
2672 int error = 0;
2673
2674 ovs_mutex_lock(&pmd->flow_mutex);
2675 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2676 del->key_len);
2677 if (netdev_flow) {
2678 if (stats) {
2679 get_dpif_flow_stats(netdev_flow, stats);
2680 }
2681 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2682 } else {
2683 error = ENOENT;
2684 }
2685 ovs_mutex_unlock(&pmd->flow_mutex);
2686
2687 return error;
2688 }
2689
2690 static int
2691 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
2692 {
2693 struct dp_netdev *dp = get_dp_netdev(dpif);
2694 struct dp_netdev_pmd_thread *pmd;
2695 int error = 0;
2696
2697 if (del->stats) {
2698 memset(del->stats, 0, sizeof *del->stats);
2699 }
2700
2701 if (del->pmd_id == PMD_ID_NULL) {
2702 if (cmap_count(&dp->poll_threads) == 0) {
2703 return EINVAL;
2704 }
2705 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2706 struct dpif_flow_stats pmd_stats;
2707 int pmd_error;
2708
2709 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
2710 if (pmd_error) {
2711 error = pmd_error;
2712 } else if (del->stats) {
2713 del->stats->n_packets += pmd_stats.n_packets;
2714 del->stats->n_bytes += pmd_stats.n_bytes;
2715 del->stats->used = MAX(del->stats->used, pmd_stats.used);
2716 del->stats->tcp_flags |= pmd_stats.tcp_flags;
2717 }
2718 }
2719 } else {
2720 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
2721 if (!pmd) {
2722 return EINVAL;
2723 }
2724 error = flow_del_on_pmd(pmd, del->stats, del);
2725 dp_netdev_pmd_unref(pmd);
2726 }
2727
2729 return error;
2730 }
2731
2732 struct dpif_netdev_flow_dump {
2733 struct dpif_flow_dump up;
2734 struct cmap_position poll_thread_pos;
2735 struct cmap_position flow_pos;
2736 struct dp_netdev_pmd_thread *cur_pmd;
2737 int status;
2738 struct ovs_mutex mutex;
2739 };
2740
2741 static struct dpif_netdev_flow_dump *
2742 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
2743 {
2744 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
2745 }
2746
2747 static struct dpif_flow_dump *
2748 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
2749 char *type OVS_UNUSED)
2750 {
2751 struct dpif_netdev_flow_dump *dump;
2752
2753 dump = xzalloc(sizeof *dump);
2754 dpif_flow_dump_init(&dump->up, dpif_);
2755 dump->up.terse = terse;
2756 ovs_mutex_init(&dump->mutex);
2757
2758 return &dump->up;
2759 }
2760
2761 static int
2762 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
2763 {
2764 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2765
2766 ovs_mutex_destroy(&dump->mutex);
2767 free(dump);
2768 return 0;
2769 }
2770
2771 struct dpif_netdev_flow_dump_thread {
2772 struct dpif_flow_dump_thread up;
2773 struct dpif_netdev_flow_dump *dump;
2774 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2775 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
2776 };
2777
2778 static struct dpif_netdev_flow_dump_thread *
2779 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2780 {
2781 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2782 }
2783
2784 static struct dpif_flow_dump_thread *
2785 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2786 {
2787 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2788 struct dpif_netdev_flow_dump_thread *thread;
2789
2790 thread = xmalloc(sizeof *thread);
2791 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2792 thread->dump = dump;
2793 return &thread->up;
2794 }
2795
2796 static void
2797 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2798 {
2799 struct dpif_netdev_flow_dump_thread *thread
2800 = dpif_netdev_flow_dump_thread_cast(thread_);
2801
2802 free(thread);
2803 }
2804
2805 static int
2806 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
2807 struct dpif_flow *flows, int max_flows)
2808 {
2809 struct dpif_netdev_flow_dump_thread *thread
2810 = dpif_netdev_flow_dump_thread_cast(thread_);
2811 struct dpif_netdev_flow_dump *dump = thread->dump;
2812 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
2813 int n_flows = 0;
2814 int i;
2815
2816 ovs_mutex_lock(&dump->mutex);
2817 if (!dump->status) {
2818 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2819 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2820 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2821 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2822
2823 /* On the first call to dump_next(), extract the first pmd thread.
2824 * If there is no pmd thread, return immediately. */
2825 if (!pmd) {
2826 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2827 if (!pmd) {
2828 ovs_mutex_unlock(&dump->mutex);
2829 return n_flows;
2831 }
2832 }
2833
2834 do {
2835 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2836 struct cmap_node *node;
2837
2838 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2839 if (!node) {
2840 break;
2841 }
2842 netdev_flows[n_flows] = CONTAINER_OF(node,
2843 struct dp_netdev_flow,
2844 node);
2845 }
2846 /* When we finish dumping the current pmd thread, move on to
2847 * the next one. */
2848 if (n_flows < flow_limit) {
2849 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2850 dp_netdev_pmd_unref(pmd);
2851 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2852 if (!pmd) {
2853 dump->status = EOF;
2854 break;
2855 }
2856 }
2857 /* Keep the reference for the next caller. */
2858 dump->cur_pmd = pmd;
2859
2860 /* If the current dump is empty, do not exit the loop, since the
2861 * remaining pmds could have flows to be dumped. Just dump again
2862 * on the new 'pmd'. */
2863 } while (!n_flows);
2864 }
2865 ovs_mutex_unlock(&dump->mutex);
2866
2867 for (i = 0; i < n_flows; i++) {
2868 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2869 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2870 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2871 struct dpif_flow *f = &flows[i];
2872 struct ofpbuf key, mask;
2873
2874 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2875 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
2876 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2877 dump->up.terse);
2878 }
2879
2880 return n_flows;
2881 }
2882
2883 static int
2884 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
2885 OVS_NO_THREAD_SAFETY_ANALYSIS
2886 {
2887 struct dp_netdev *dp = get_dp_netdev(dpif);
2888 struct dp_netdev_pmd_thread *pmd;
2889 struct dp_packet_batch pp;
2890
2891 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2892 dp_packet_size(execute->packet) > UINT16_MAX) {
2893 return EINVAL;
2894 }
2895
2896 /* Try to find the 'pmd'. If NULL is returned, that means
2897 * the current thread is a non-pmd thread and should use
2898 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
2899 pmd = ovsthread_getspecific(dp->per_pmd_key);
2900 if (!pmd) {
2901 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
2902 if (!pmd) {
2903 return EBUSY;
2904 }
2905 }
2906
2907 if (execute->probe) {
2908 /* If this is part of a probe, drop the packet, since executing
2909 * the action may actually cause spurious packets to be sent into
2910 * the network. */
2911 if (pmd->core_id == NON_PMD_CORE_ID) {
2912 dp_netdev_pmd_unref(pmd);
2913 }
2914 return 0;
2915 }
2916
2917 /* If the current thread is a non-pmd thread, acquire
2918 * the 'non_pmd_mutex'. */
2919 if (pmd->core_id == NON_PMD_CORE_ID) {
2920 ovs_mutex_lock(&dp->non_pmd_mutex);
2921 }
2922
2923 /* Update current time in PMD context. */
2924 pmd_thread_ctx_time_update(pmd);
2925
2926 /* The action processing expects the RSS hash to be valid, because
2927 * it's always initialized at the beginning of datapath processing.
2928 * In this case, though, 'execute->packet' may not have gone through
2929 * the datapath at all, it may have been generated by the upper layer
2930 * (OpenFlow packet-out, BFD frame, ...). */
2931 if (!dp_packet_rss_valid(execute->packet)) {
2932 dp_packet_set_rss_hash(execute->packet,
2933 flow_hash_5tuple(execute->flow, 0));
2934 }
2935
2936 dp_packet_batch_init_packet(&pp, execute->packet);
2937 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
2938 execute->actions, execute->actions_len);
2939 dp_netdev_pmd_flush_output_packets(pmd, true);
2940
2941 if (pmd->core_id == NON_PMD_CORE_ID) {
2942 ovs_mutex_unlock(&dp->non_pmd_mutex);
2943 dp_netdev_pmd_unref(pmd);
2944 }
2945
2946 return 0;
2947 }
2948
2949 static void
2950 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2951 {
2952 size_t i;
2953
2954 for (i = 0; i < n_ops; i++) {
2955 struct dpif_op *op = ops[i];
2956
2957 switch (op->type) {
2958 case DPIF_OP_FLOW_PUT:
2959 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2960 break;
2961
2962 case DPIF_OP_FLOW_DEL:
2963 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2964 break;
2965
2966 case DPIF_OP_EXECUTE:
2967 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2968 break;
2969
2970 case DPIF_OP_FLOW_GET:
2971 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2972 break;
2973 }
2974 }
2975 }
2976
2977 /* Applies datapath configuration from the database. Some of the changes are
2978 * actually applied in dpif_netdev_run(). */
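/* Typical usage, settings coming in via the Open_vSwitch database (the
* values here are only illustrative):
*
*   ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x6
*   ovs-vsctl set Open_vSwitch . other_config:tx-flush-interval=50
*   ovs-vsctl set Open_vSwitch . other_config:emc-insert-inv-prob=20
*
* A changed pmd-cpu-mask requests a datapath reconfiguration, while the
* other two settings only update atomics that the PMD threads read. */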
2979 static int
2980 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
2981 {
2982 struct dp_netdev *dp = get_dp_netdev(dpif);
2983 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
2984 unsigned long long insert_prob =
2985 smap_get_ullong(other_config, "emc-insert-inv-prob",
2986 DEFAULT_EM_FLOW_INSERT_INV_PROB);
2987 uint32_t insert_min, cur_min;
2988 uint32_t tx_flush_interval, cur_tx_flush_interval;
2989
2990 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
2991 DEFAULT_TX_FLUSH_INTERVAL);
2992 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
2993 if (tx_flush_interval != cur_tx_flush_interval) {
2994 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
2995 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
2996 tx_flush_interval);
2997 }
2998
2999 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3000 free(dp->pmd_cmask);
3001 dp->pmd_cmask = nullable_xstrdup(cmask);
3002 dp_netdev_request_reconfigure(dp);
3003 }
3004
3005 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3006 if (insert_prob <= UINT32_MAX) {
3007 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3008 } else {
3009 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3010 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3011 }
3012
3013 if (insert_min != cur_min) {
3014 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3015 if (insert_min == 0) {
3016 VLOG_INFO("EMC has been disabled");
3017 } else {
3018 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3019 insert_prob, (100 / (float)insert_prob));
3020 }
3021 }
3022
3023 return 0;
3024 }
3025
3026 /* Parses affinity list and returns result in 'core_ids'. */
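/* The expected format matches the 'pmd-rxq-affinity' value, e.g. the string
* "0:3,1:7" asks for rx queue 0 to be polled by the pmd on core 3 and rx
* queue 1 by the pmd on core 7; queues that are not mentioned keep
* OVS_CORE_UNSPEC. */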
3027 static int
3028 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3029 {
3030 unsigned i;
3031 char *list, *copy, *key, *value;
3032 int error = 0;
3033
3034 for (i = 0; i < n_rxq; i++) {
3035 core_ids[i] = OVS_CORE_UNSPEC;
3036 }
3037
3038 if (!affinity_list) {
3039 return 0;
3040 }
3041
3042 list = copy = xstrdup(affinity_list);
3043
3044 while (ofputil_parse_key_value(&list, &key, &value)) {
3045 int rxq_id, core_id;
3046
3047 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3048 || !str_to_int(value, 0, &core_id) || core_id < 0) {
3049 error = EINVAL;
3050 break;
3051 }
3052
3053 if (rxq_id < n_rxq) {
3054 core_ids[rxq_id] = core_id;
3055 }
3056 }
3057
3058 free(copy);
3059 return error;
3060 }
3061
3062 /* Parses 'affinity_list' and applies configuration if it is valid. */
3063 static int
3064 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3065 const char *affinity_list)
3066 {
3067 unsigned *core_ids, i;
3068 int error = 0;
3069
3070 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
3071 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
3072 error = EINVAL;
3073 goto exit;
3074 }
3075
3076 for (i = 0; i < port->n_rxq; i++) {
3077 port->rxqs[i].core_id = core_ids[i];
3078 }
3079
3080 exit:
3081 free(core_ids);
3082 return error;
3083 }
3084
3085 /* Changes the affinity of port's rx queues. The changes are actually applied
3086 * in dpif_netdev_run(). */
3087 static int
3088 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
3089 const struct smap *cfg)
3090 {
3091 struct dp_netdev *dp = get_dp_netdev(dpif);
3092 struct dp_netdev_port *port;
3093 int error = 0;
3094 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
3095
3096 ovs_mutex_lock(&dp->port_mutex);
3097 error = get_port_by_number(dp, port_no, &port);
3098 if (error || !netdev_is_pmd(port->netdev)
3099 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
3100 goto unlock;
3101 }
3102
3103 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
3104 if (error) {
3105 goto unlock;
3106 }
3107 free(port->rxq_affinity_list);
3108 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
3109
3110 dp_netdev_request_reconfigure(dp);
3111 unlock:
3112 ovs_mutex_unlock(&dp->port_mutex);
3113 return error;
3114 }
3115
3116 static int
3117 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
3118 uint32_t queue_id, uint32_t *priority)
3119 {
3120 *priority = queue_id;
3121 return 0;
3122 }
3123
3124 \f
3125 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
3126 * a copy of the 'size' bytes of 'actions' input parameters. */
3127 struct dp_netdev_actions *
3128 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
3129 {
3130 struct dp_netdev_actions *netdev_actions;
3131
3132 netdev_actions = xmalloc(sizeof *netdev_actions + size);
3133 memcpy(netdev_actions->actions, actions, size);
3134 netdev_actions->size = size;
3135
3136 return netdev_actions;
3137 }
3138
3139 struct dp_netdev_actions *
3140 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
3141 {
3142 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
3143 }
3144
3145 static void
3146 dp_netdev_actions_free(struct dp_netdev_actions *actions)
3147 {
3148 free(actions);
3149 }
3150 \f
3151 static void
3152 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
3153 enum rxq_cycles_counter_type type,
3154 unsigned long long cycles)
3155 {
3156 atomic_store_relaxed(&rx->cycles[type], cycles);
3157 }
3158
3159 static void
3160 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
3161 enum rxq_cycles_counter_type type,
3162 unsigned long long cycles)
3163 {
3164 non_atomic_ullong_add(&rx->cycles[type], cycles);
3165 }
3166
3167 static uint64_t
3168 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
3169 enum rxq_cycles_counter_type type)
3170 {
3171 unsigned long long processing_cycles;
3172 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
3173 return processing_cycles;
3174 }
3175
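/* Records one interval sample of rx processing cycles for 'rx'. The samples
* form a small ring buffer of PMD_RXQ_INTERVAL_MAX entries: the modulo on
* the ever-increasing 'intrvl_idx' selects the slot, overwriting the oldest
* sample. */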
3176 static void
3177 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
3178 unsigned long long cycles)
3179 {
3180 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
3181 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
3182 }
3183
3184 static uint64_t
3185 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
3186 {
3187 unsigned long long processing_cycles;
3188 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
3189 return processing_cycles;
3190 }
3191
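/* Transmits the packets accumulated in 'p->output_pkts' on the appropriate
* tx queue of the port cached in 'p', resets the batch and schedules the
* next flush deadline. Returns the number of packets handed to
* netdev_send(). */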
3192 static int
3193 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
3194 struct tx_port *p)
3195 {
3196 int i;
3197 int tx_qid;
3198 int output_cnt;
3199 bool dynamic_txqs;
3200 struct cycle_timer timer;
3201 uint64_t cycles;
3202 uint32_t tx_flush_interval;
3203
3204 cycle_timer_start(&pmd->perf_stats, &timer);
3205
3206 dynamic_txqs = p->port->dynamic_txqs;
3207 if (dynamic_txqs) {
3208 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
3209 } else {
3210 tx_qid = pmd->static_tx_qid;
3211 }
3212
3213 output_cnt = dp_packet_batch_size(&p->output_pkts);
3214 ovs_assert(output_cnt > 0);
3215
3216 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
3217 dp_packet_batch_init(&p->output_pkts);
3218
3219 /* Update time of the next flush. */
3220 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
3221 p->flush_time = pmd->ctx.now + tx_flush_interval;
3222
3223 ovs_assert(pmd->n_output_batches > 0);
3224 pmd->n_output_batches--;
3225
3226 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
3227 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
3228
3229 /* Distribute send cycles evenly among transmitted packets and assign to
3230 * their respective rx queues. */
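/* E.g. if sending a batch of 32 packets took 6400 cycles, each packet's
* originating rx queue is credited 200 cycles, so transmission cost shows
* up in the per-rxq cycle statistics as well. */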
3231 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
3232 for (i = 0; i < output_cnt; i++) {
3233 if (p->output_pkts_rxqs[i]) {
3234 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
3235 RXQ_CYCLES_PROC_CURR, cycles);
3236 }
3237 }
3238
3239 return output_cnt;
3240 }
3241
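/* Flushes the queued output batches of the tx ports cached by 'pmd'. When
* 'force' is false, only batches whose flush deadline has passed are sent;
* when 'force' is true, all non-empty batches are sent immediately. Returns
* the total number of packets flushed. */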
3242 static int
3243 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
3244 bool force)
3245 {
3246 struct tx_port *p;
3247 int output_cnt = 0;
3248
3249 if (!pmd->n_output_batches) {
3250 return 0;
3251 }
3252
3253 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
3254 if (!dp_packet_batch_is_empty(&p->output_pkts)
3255 && (force || pmd->ctx.now >= p->flush_time)) {
3256 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
3257 }
3258 }
3259 return output_cnt;
3260 }
3261
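/* Polls 'rxq' once, runs any received batch through dp_netdev_input() for
* 'port_no' and then flushes output batches that are due. Returns the
* number of packets received plus the number of packets flushed. */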
3262 static int
3263 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
3264 struct dp_netdev_rxq *rxq,
3265 odp_port_t port_no)
3266 {
3267 struct dp_packet_batch batch;
3268 struct cycle_timer timer;
3269 int error;
3270 int batch_cnt = 0, output_cnt = 0;
3271 uint64_t cycles;
3272
3273 /* Measure duration for polling and processing rx burst. */
3274 cycle_timer_start(&pmd->perf_stats, &timer);
3275
3276 pmd->ctx.last_rxq = rxq;
3277 dp_packet_batch_init(&batch);
3278
3279 error = netdev_rxq_recv(rxq->rx, &batch);
3280 if (!error) {
3281 /* At least one packet received. */
3282 *recirc_depth_get() = 0;
3283 pmd_thread_ctx_time_update(pmd);
3284
3285 batch_cnt = batch.count;
3286 dp_netdev_input(pmd, &batch, port_no);
3287
3288 /* Assign processing cycles to rx queue. */
3289 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
3290 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
3291
3292 output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
3293 } else {
3294 /* Discard cycles. */
3295 cycle_timer_stop(&pmd->perf_stats, &timer);
3296 if (error != EAGAIN && error != EOPNOTSUPP) {
3297 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3298
3299 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3300 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
3301 }
3302 }
3303
3304 pmd->ctx.last_rxq = NULL;
3305
3306 return batch_cnt + output_cnt;
3307 }
3308
3309 static struct tx_port *
3310 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3311 {
3312 struct tx_port *tx;
3313
3314 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
3315 if (tx->port->port_no == port_no) {
3316 return tx;
3317 }
3318 }
3319
3320 return NULL;
3321 }
3322
3323 static int
3324 port_reconfigure(struct dp_netdev_port *port)
3325 {
3326 struct netdev *netdev = port->netdev;
3327 int i, err;
3328
3329 port->need_reconfigure = false;
3330
3331 /* Closes the existing 'rxq's. */
3332 for (i = 0; i < port->n_rxq; i++) {
3333 netdev_rxq_close(port->rxqs[i].rx);
3334 port->rxqs[i].rx = NULL;
3335 }
3336 unsigned last_nrxq = port->n_rxq;
3337 port->n_rxq = 0;
3338
3339 /* Allows 'netdev' to apply the pending configuration changes. */
3340 if (netdev_is_reconf_required(netdev)) {
3341 err = netdev_reconfigure(netdev);
3342 if (err && (err != EOPNOTSUPP)) {
3343 VLOG_ERR("Failed to set interface %s new configuration",
3344 netdev_get_name(netdev));
3345 return err;
3346 }
3347 }
3348 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3349 port->rxqs = xrealloc(port->rxqs,
3350 sizeof *port->rxqs * netdev_n_rxq(netdev));
3351 /* Realloc 'used' counters for tx queues. */
3352 free(port->txq_used);
3353 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
3354
3355 for (i = 0; i < netdev_n_rxq(netdev); i++) {
3356 bool new_queue = i >= last_nrxq;
3357 if (new_queue) {
3358 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
3359 }
3360
3361 port->rxqs[i].port = port;
3362
3363 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
3364 if (err) {
3365 return err;
3366 }
3367 port->n_rxq++;
3368 }
3369
3370 /* Parse affinity list to apply configuration for new queues. */
3371 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
3372
3373 return 0;
3374 }
3375
3376 struct rr_numa_list {
3377 struct hmap numas; /* Contains 'struct rr_numa' */
3378 };
3379
3380 struct rr_numa {
3381 struct hmap_node node;
3382
3383 int numa_id;
3384
3385 /* Non-isolated pmds on numa node 'numa_id'. */
3386 struct dp_netdev_pmd_thread **pmds;
3387 int n_pmds;
3388
3389 int cur_index;
3390 bool idx_inc;
3391 };
3392
3393 static struct rr_numa *
3394 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
3395 {
3396 struct rr_numa *numa;
3397
3398 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
3399 if (numa->numa_id == numa_id) {
3400 return numa;
3401 }
3402 }
3403
3404 return NULL;
3405 }
3406
3407 /* Returns the next node in numa list following 'numa' in round-robin fashion.
3408 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
3409 * Returns NULL if 'rr' numa list is empty. */
3410 static struct rr_numa *
3411 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
3412 {
3413 struct hmap_node *node = NULL;
3414
3415 if (numa) {
3416 node = hmap_next(&rr->numas, &numa->node);
3417 }
3418 if (!node) {
3419 node = hmap_first(&rr->numas);
3420 }
3421
3422 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
3423 }
3424
3425 static void
3426 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
3427 {
3428 struct dp_netdev_pmd_thread *pmd;
3429 struct rr_numa *numa;
3430
3431 hmap_init(&rr->numas);
3432
3433 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3434 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3435 continue;
3436 }
3437
3438 numa = rr_numa_list_lookup(rr, pmd->numa_id);
3439 if (!numa) {
3440 numa = xzalloc(sizeof *numa);
3441 numa->numa_id = pmd->numa_id;
3442 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
3443 }
3444 numa->n_pmds++;
3445 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
3446 numa->pmds[numa->n_pmds - 1] = pmd;
3447 /* At least one pmd, so initialise 'cur_index' and 'idx_inc'. */
3448 numa->cur_index = 0;
3449 numa->idx_inc = true;
3450 }
3451 }
3452
3453 /* Returns the next pmd from the numa node in
3454 * incrementing or decrementing order. */
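/* For example, with three pmds A, B and C on a node, successive calls
* return A, B, C, C, B, A, A, B, C, ...; combined with the descending sort
* in rxq_scheduling() this tends to pair heavier queues with lighter ones
* on the same pmd. */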
3455 static struct dp_netdev_pmd_thread *
3456 rr_numa_get_pmd(struct rr_numa *numa)
3457 {
3458 int numa_idx = numa->cur_index;
3459
3460 if (numa->idx_inc == true) {
3461 /* Incrementing through list of pmds. */
3462 if (numa->cur_index == numa->n_pmds-1) {
3463 /* Reached the last pmd. */
3464 numa->idx_inc = false;
3465 } else {
3466 numa->cur_index++;
3467 }
3468 } else {
3469 /* Decrementing through list of pmds. */
3470 if (numa->cur_index == 0) {
3471 /* Reached the first pmd. */
3472 numa->idx_inc = true;
3473 } else {
3474 numa->cur_index--;
3475 }
3476 }
3477 return numa->pmds[numa_idx];
3478 }
3479
3480 static void
3481 rr_numa_list_destroy(struct rr_numa_list *rr)
3482 {
3483 struct rr_numa *numa;
3484
3485 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
3486 free(numa->pmds);
3487 free(numa);
3488 }
3489 hmap_destroy(&rr->numas);
3490 }
3491
3492 /* Sorts rx queues in descending order of consumed processing cycles. */
3493 static int
3494 compare_rxq_cycles(const void *a, const void *b)
3495 {
3496 struct dp_netdev_rxq *qa;
3497 struct dp_netdev_rxq *qb;
3498 uint64_t cycles_qa, cycles_qb;
3499
3500 qa = *(struct dp_netdev_rxq **) a;
3501 qb = *(struct dp_netdev_rxq **) b;
3502
3503 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
3504 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
3505
3506 if (cycles_qa != cycles_qb) {
3507 return (cycles_qa < cycles_qb) ? 1 : -1;
3508 } else {
3509 /* Cycles are the same, so tiebreak on port/queue id.
3510 * Tiebreaking (as opposed to returning 0) ensures consistent
3511 * sort results across multiple OSes. */
3512 uint32_t port_qa = odp_to_u32(qa->port->port_no);
3513 uint32_t port_qb = odp_to_u32(qb->port->port_no);
3514 if (port_qa != port_qb) {
3515 return port_qa > port_qb ? 1 : -1;
3516 } else {
3517 return netdev_rxq_get_queue_id(qa->rx)
3518 - netdev_rxq_get_queue_id(qb->rx);
3519 }
3520 }
3521 }
3522
3523 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
3524 * queues and mark those pmds as isolated. Otherwise, assign non-isolated
3525 * pmds to unpinned queues.
3526 *
3527 * If 'pinned' is false, queues will be sorted by the processing cycles they
3528 * are consuming and then assigned to pmds in round-robin order.
3529 *
3530 * The function doesn't touch the pmd threads; it just stores the assignment
3531 * in the 'pmd' member of each rxq. */
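/* Unpinned example (a sketch): with queues sorted as q0 >= q1 >= q2 >= q3
* by measured cycles and two non-isolated pmds A and B on the local numa
* node, the round robin below yields q0->A, q1->B, q2->B, q3->A. */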
3532 static void
3533 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
3534 {
3535 struct dp_netdev_port *port;
3536 struct rr_numa_list rr;
3537 struct rr_numa *non_local_numa = NULL;
3538 struct dp_netdev_rxq ** rxqs = NULL;
3539 int i, n_rxqs = 0;
3540 struct rr_numa *numa = NULL;
3541 int numa_id;
3542
3543 HMAP_FOR_EACH (port, node, &dp->ports) {
3544 if (!netdev_is_pmd(port->netdev)) {
3545 continue;
3546 }
3547
3548 for (int qid = 0; qid < port->n_rxq; qid++) {
3549 struct dp_netdev_rxq *q = &port->rxqs[qid];
3550
3551 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
3552 struct dp_netdev_pmd_thread *pmd;
3553
3554 pmd = dp_netdev_get_pmd(dp, q->core_id);
3555 if (!pmd) {
3556 VLOG_WARN("There is no PMD thread on core %d. Queue "
3557 "%d on port \'%s\' will not be polled.",
3558 q->core_id, qid, netdev_get_name(port->netdev));
3559 } else {
3560 q->pmd = pmd;
3561 pmd->isolated = true;
3562 dp_netdev_pmd_unref(pmd);
3563 }
3564 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
3565 uint64_t cycle_hist = 0;
3566
3567 if (n_rxqs == 0) {
3568 rxqs = xmalloc(sizeof *rxqs);
3569 } else {
3570 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
3571 }
3572 /* Sum the queue intervals and store the cycle history. */
3573 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
3574 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
3575 }
3576 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist);
3577
3578 /* Store the queue. */
3579 rxqs[n_rxqs++] = q;
3580 }
3581 }
3582 }
3583
3584 if (n_rxqs > 1) {
3585 /* Sort the queues in order of the processing cycles
3586 * they consumed during their last pmd interval. */
3587 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
3588 }
3589
3590 rr_numa_list_populate(dp, &rr);
3591 /* Assign the sorted queues to pmds in round robin. */
3592 for (i = 0; i < n_rxqs; i++) {
3593 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
3594 numa = rr_numa_list_lookup(&rr, numa_id);
3595 if (!numa) {
3596 /* There are no pmds on the queue's local NUMA node.
3597 * Round robin on the NUMA nodes that do have pmds. */
3598 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
3599 if (!non_local_numa) {
3600 VLOG_ERR("There is no available (non-isolated) pmd "
3601 "thread for port \'%s\' queue %d. This queue "
3602 "will not be polled. Is pmd-cpu-mask set to "
3603 "zero? Or are all PMDs isolated to other "
3604 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
3605 netdev_rxq_get_queue_id(rxqs[i]->rx));
3606 continue;
3607 }
3608 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa);
3609 VLOG_WARN("There's no available (non-isolated) pmd thread "
3610 "on numa node %d. Queue %d on port \'%s\' will "
3611 "be assigned to the pmd on core %d "
3612 "(numa node %d). Expect reduced performance.",
3613 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
3614 netdev_rxq_get_name(rxqs[i]->rx),
3615 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
3616 } else {
3617 rxqs[i]->pmd = rr_numa_get_pmd(numa);
3618 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
3619 "rx queue %d (measured processing cycles %"PRIu64").",
3620 rxqs[i]->pmd->core_id, numa_id,
3621 netdev_rxq_get_name(rxqs[i]->rx),
3622 netdev_rxq_get_queue_id(rxqs[i]->rx),
3623 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
3624 }
3625 }
3626
3627 rr_numa_list_destroy(&rr);
3628 free(rxqs);
3629 }
3630
3631 static void
3632 reload_affected_pmds(struct dp_netdev *dp)
3633 {
3634 struct dp_netdev_pmd_thread *pmd;
3635
3636 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3637 if (pmd->need_reload) {
3638 dp_netdev_reload_pmd__(pmd);
3639 pmd->need_reload = false;
3640 }
3641 }
3642 }
3643
3644 static void
3645 reconfigure_pmd_threads(struct dp_netdev *dp)
3646 OVS_REQUIRES(dp->port_mutex)
3647 {
3648 struct dp_netdev_pmd_thread *pmd;
3649 struct ovs_numa_dump *pmd_cores;
3650 struct ovs_numa_info_core *core;
3651 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
3652 struct hmapx_node *node;
3653 bool changed = false;
3654 bool need_to_adjust_static_tx_qids = false;
3655
3656 /* The pmd threads should be started only if there's a pmd port in the
3657 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
3658 * NR_PMD_THREADS per numa node. */
3659 if (!has_pmd_port(dp)) {
3660 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
3661 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
3662 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
3663 } else {
3664 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
3665 }
3666
3667 /* We need to adjust 'static_tx_qid's only if we're reducing the number
3668 * of PMD threads. Otherwise, new threads will allocate all the freed ids. */
3669 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
3670 /* Adjustment is required to keep 'static_tx_qid's sequential and
3671 * avoid possible issues, for example, imbalanced tx queue usage
3672 * and unnecessary locking caused by remapping on netdev level. */
3673 need_to_adjust_static_tx_qids = true;
3674 }
3675
3676 /* Check for unwanted pmd threads */
3677 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3678 if (pmd->core_id == NON_PMD_CORE_ID) {
3679 continue;
3680 }
3681 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
3682 pmd->core_id)) {
3683 hmapx_add(&to_delete, pmd);
3684 } else if (need_to_adjust_static_tx_qids) {
3685 pmd->need_reload = true;
3686 }
3687 }
3688
3689 HMAPX_FOR_EACH (node, &to_delete) {
3690 pmd = (struct dp_netdev_pmd_thread *) node->data;
3691 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
3692 pmd->numa_id, pmd->core_id);
3693 dp_netdev_del_pmd(dp, pmd);
3694 }
3695 changed = !hmapx_is_empty(&to_delete);
3696 hmapx_destroy(&to_delete);
3697
3698 if (need_to_adjust_static_tx_qids) {
3699 /* 'static_tx_qid's are not sequential now.
3700 * Reload remaining threads to fix this. */
3701 reload_affected_pmds(dp);
3702 }
3703
3704 /* Check for required new pmd threads */
3705 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
3706 pmd = dp_netdev_get_pmd(dp, core->core_id);
3707 if (!pmd) {
3708 pmd = xzalloc(sizeof *pmd);
3709 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
3710 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
3711 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
3712 pmd->numa_id, pmd->core_id);
3713 changed = true;
3714 } else {
3715 dp_netdev_pmd_unref(pmd);
3716 }
3717 }
3718
3719 if (changed) {
3720 struct ovs_numa_info_numa *numa;
3721
3722 /* Log the number of pmd threads per numa node. */
3723 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
3724 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
3725 numa->n_cores, numa->numa_id);
3726 }
3727 }
3728
3729 ovs_numa_dump_destroy(pmd_cores);
3730 }
3731
3732 static void
3733 pmd_remove_stale_ports(struct dp_netdev *dp,
3734 struct dp_netdev_pmd_thread *pmd)
3735 OVS_EXCLUDED(pmd->port_mutex)
3736 OVS_REQUIRES(dp->port_mutex)
3737 {
3738 struct rxq_poll *poll, *poll_next;
3739 struct tx_port *tx, *tx_next;
3740
3741 ovs_mutex_lock(&pmd->port_mutex);
3742 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
3743 struct dp_netdev_port *port = poll->rxq->port;
3744
3745 if (port->need_reconfigure
3746 || !hmap_contains(&dp->ports, &port->node)) {
3747 dp_netdev_del_rxq_from_pmd(pmd, poll);
3748 }
3749 }
3750 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
3751 struct dp_netdev_port *port = tx->port;
3752
3753 if (port->need_reconfigure
3754 || !hmap_contains(&dp->ports, &port->node)) {
3755 dp_netdev_del_port_tx_from_pmd(pmd, tx);
3756 }
3757 }
3758 ovs_mutex_unlock(&pmd->port_mutex);
3759 }
3760
3761 /* Must be called each time a port is added/removed or the cmask changes.
3762 * This creates and destroys pmd threads, reconfigures ports, opens their
3763 * rxqs and assigns all rxqs/txqs to pmd threads. */
3764 static void
3765 reconfigure_datapath(struct dp_netdev *dp)
3766 OVS_REQUIRES(dp->port_mutex)
3767 {
3768 struct dp_netdev_pmd_thread *pmd;
3769 struct dp_netdev_port *port;
3770 int wanted_txqs;
3771
3772 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
3773
3774 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
3775 * on the system and the user configuration. */
3776 reconfigure_pmd_threads(dp);
3777
3778 wanted_txqs = cmap_count(&dp->poll_threads);
3779
3780 /* The number of pmd threads might have changed, or a port can be new:
3781 * adjust the txqs. */
3782 HMAP_FOR_EACH (port, node, &dp->ports) {
3783 netdev_set_tx_multiq(port->netdev, wanted_txqs);
3784 }
3785
3786 /* Step 2: Remove from the pmd threads ports that have been removed or
3787 * need reconfiguration. */
3788
3789 /* Check for all the ports that need reconfiguration. We cache this in
3790 * 'port->need_reconfigure', because netdev_is_reconf_required() can
3791 * change at any time. */
3792 HMAP_FOR_EACH (port, node, &dp->ports) {
3793 if (netdev_is_reconf_required(port->netdev)) {
3794 port->need_reconfigure = true;
3795 }
3796 }
3797
3798 /* Remove from the pmd threads all the ports that have been deleted or
3799 * need reconfiguration. */
3800 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3801 pmd_remove_stale_ports(dp, pmd);
3802 }
3803
3804 /* Reload affected pmd threads. We must wait for the pmd threads before
3805 * reconfiguring the ports, because a port cannot be reconfigured while
3806 * it's being used. */
3807 reload_affected_pmds(dp);
3808
3809 /* Step 3: Reconfigure ports. */
3810
3811 /* We only reconfigure the ports that we determined above, because they're
3812 * not being used by any pmd thread at the moment. If a port fails to
3813 * reconfigure we remove it from the datapath. */
3814 struct dp_netdev_port *next_port;
3815 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
3816 int err;
3817
3818 if (!port->need_reconfigure) {
3819 continue;
3820 }
3821
3822 err = port_reconfigure(port);
3823 if (err) {
3824 hmap_remove(&dp->ports, &port->node);
3825 seq_change(dp->port_seq);
3826 port_destroy(port);
3827 } else {
3828 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
3829 }
3830 }
3831
3832 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
3833 * for now, we just update the 'pmd' pointer in each rxq to point to the
3834 * wanted thread according to the scheduling policy. */
3835
3836 /* Reset all the pmd threads to non isolated. */
3837 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3838 pmd->isolated = false;
3839 }
3840
3841 /* Reset all the queues to unassigned */
3842 HMAP_FOR_EACH (port, node, &dp->ports) {
3843 for (int i = 0; i < port->n_rxq; i++) {
3844 port->rxqs[i].pmd = NULL;
3845 }
3846 }
3847
3848 /* Add pinned queues and mark pmd threads isolated. */
3849 rxq_scheduling(dp, true);
3850
3851 /* Add non-pinned queues. */
3852 rxq_scheduling(dp, false);
3853
3854 /* Step 5: Remove queues not compliant with new scheduling. */
3855 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3856 struct rxq_poll *poll, *poll_next;
3857
3858 ovs_mutex_lock(&pmd->port_mutex);
3859 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
3860 if (poll->rxq->pmd != pmd) {
3861 dp_netdev_del_rxq_from_pmd(pmd, poll);
3862 }
3863 }
3864 ovs_mutex_unlock(&pmd->port_mutex);
3865 }
3866
3867 /* Reload affected pmd threads. We must wait for the pmd threads to remove
3868 * the old queues before readding them, otherwise a queue can be polled by
3869 * two threads at the same time. */
3870 reload_affected_pmds(dp);
3871
3872 /* Step 6: Add queues from scheduling, if they're not there already. */
3873 HMAP_FOR_EACH (port, node, &dp->ports) {
3874 if (!netdev_is_pmd(port->netdev)) {
3875 continue;
3876 }
3877
3878 for (int qid = 0; qid < port->n_rxq; qid++) {
3879 struct dp_netdev_rxq *q = &port->rxqs[qid];
3880
3881 if (q->pmd) {
3882 ovs_mutex_lock(&q->pmd->port_mutex);
3883 dp_netdev_add_rxq_to_pmd(q->pmd, q);
3884 ovs_mutex_unlock(&q->pmd->port_mutex);
3885 }
3886 }
3887 }
3888
3889 /* Add every port to the tx cache of every pmd thread, if it's not
3890 * there already and if this pmd has at least one rxq to poll. */
3891 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3892 ovs_mutex_lock(&pmd->port_mutex);
3893 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
3894 HMAP_FOR_EACH (port, node, &dp->ports) {
3895 dp_netdev_add_port_tx_to_pmd(pmd, port);
3896 }
3897 }
3898 ovs_mutex_unlock(&pmd->port_mutex);
3899 }
3900
3901 /* Reload affected pmd threads. */
3902 reload_affected_pmds(dp);
3903 }
3904
3905 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
3906 static bool
3907 ports_require_restart(const struct dp_netdev *dp)
3908 OVS_REQUIRES(dp->port_mutex)
3909 {
3910 struct dp_netdev_port *port;
3911
3912 HMAP_FOR_EACH (port, node, &dp->ports) {
3913 if (netdev_is_reconf_required(port->netdev)) {
3914 return true;
3915 }
3916 }
3917
3918 return false;
3919 }
3920
3921 /* Return true if needs to revalidate datapath flows. */
3922 static bool
3923 dpif_netdev_run(struct dpif *dpif)
3924 {
3925 struct dp_netdev_port *port;
3926 struct dp_netdev *dp = get_dp_netdev(dpif);
3927 struct dp_netdev_pmd_thread *non_pmd;
3928 uint64_t new_tnl_seq;
3929 bool need_to_flush = true;
3930
3931 ovs_mutex_lock(&dp->port_mutex);
3932 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3933 if (non_pmd) {
3934 ovs_mutex_lock(&dp->non_pmd_mutex);
3935 HMAP_FOR_EACH (port, node, &dp->ports) {
3936 if (!netdev_is_pmd(port->netdev)) {
3937 int i;
3938
3939 for (i = 0; i < port->n_rxq; i++) {
3940 if (dp_netdev_process_rxq_port(non_pmd,
3941 &port->rxqs[i],
3942 port->port_no)) {
3943 need_to_flush = false;
3944 }
3945 }
3946 }
3947 }
3948 if (need_to_flush) {
3949 /* We didn't receive anything in the process loop.
3950 * Check if we need to send something.
3951              * Time was not updated during this iteration. */
3952 pmd_thread_ctx_time_update(non_pmd);
3953 dp_netdev_pmd_flush_output_packets(non_pmd, false);
3954 }
3955
3956 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
3957 ovs_mutex_unlock(&dp->non_pmd_mutex);
3958
3959 dp_netdev_pmd_unref(non_pmd);
3960 }
3961
3962 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
3963 reconfigure_datapath(dp);
3964 }
3965 ovs_mutex_unlock(&dp->port_mutex);
3966
3967 tnl_neigh_cache_run();
3968 tnl_port_map_run();
3969 new_tnl_seq = seq_read(tnl_conf_seq);
3970
3971 if (dp->last_tnl_conf_seq != new_tnl_seq) {
3972 dp->last_tnl_conf_seq = new_tnl_seq;
3973 return true;
3974 }
3975 return false;
3976 }
3977
3978 static void
3979 dpif_netdev_wait(struct dpif *dpif)
3980 {
3981 struct dp_netdev_port *port;
3982 struct dp_netdev *dp = get_dp_netdev(dpif);
3983
3984 ovs_mutex_lock(&dp_netdev_mutex);
3985 ovs_mutex_lock(&dp->port_mutex);
3986 HMAP_FOR_EACH (port, node, &dp->ports) {
3987 netdev_wait_reconf_required(port->netdev);
3988 if (!netdev_is_pmd(port->netdev)) {
3989 int i;
3990
3991 for (i = 0; i < port->n_rxq; i++) {
3992 netdev_rxq_wait(port->rxqs[i].rx);
3993 }
3994 }
3995 }
3996 ovs_mutex_unlock(&dp->port_mutex);
3997 ovs_mutex_unlock(&dp_netdev_mutex);
3998 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
3999 }
4000
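/* Flushes any queued output packets, releases the tx queue ids in use and
 * frees the entries of the thread-local 'tnl_port_cache' and
 * 'send_port_cache' of 'pmd'. */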
4001 static void
4002 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
4003 {
4004 struct tx_port *tx_port_cached;
4005
4006 /* Flush all the queued packets. */
4007 dp_netdev_pmd_flush_output_packets(pmd, true);
4008 /* Free all used tx queue ids. */
4009 dpif_netdev_xps_revalidate_pmd(pmd, true);
4010
4011 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
4012 free(tx_port_cached);
4013 }
4014 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
4015 free(tx_port_cached);
4016 }
4017 }
4018
4019 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
4020  * thread-local caches: a port is copied to 'pmd->tnl_port_cache' if it is a
4021  * tunnel device and to 'pmd->send_port_cache' if it has at least one txq,
4022  * so a port may appear in both caches. */
4023 static void
4024 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
4025 OVS_REQUIRES(pmd->port_mutex)
4026 {
4027 struct tx_port *tx_port, *tx_port_cached;
4028
4029 pmd_free_cached_ports(pmd);
4030 hmap_shrink(&pmd->send_port_cache);
4031 hmap_shrink(&pmd->tnl_port_cache);
4032
4033 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
4034 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
4035 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4036 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
4037 hash_port_no(tx_port_cached->port->port_no));
4038 }
4039
4040 if (netdev_n_txq(tx_port->port->netdev)) {
4041 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4042 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
4043 hash_port_no(tx_port_cached->port->port_no));
4044 }
4045 }
4046 }
4047
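/* Allocates a datapath-wide unique 'static_tx_qid' for 'pmd'; aborts if the
 * id pool is exhausted. */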
4048 static void
4049 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4050 {
4051 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4052 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
4053 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
4054 ", numa_id %d.", pmd->core_id, pmd->numa_id);
4055 }
4056 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4057
4058 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
4059 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
4060 }
4061
4062 static void
4063 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4064 {
4065 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4066 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
4067 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4068 }
4069
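/* Rebuilds the flat '*ppoll_list' array from 'pmd->poll_list' and refreshes
 * the thread-local port caches. Returns the number of polled queues. */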
4070 static int
4071 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
4072 struct polled_queue **ppoll_list)
4073 {
4074 struct polled_queue *poll_list = *ppoll_list;
4075 struct rxq_poll *poll;
4076 int i;
4077
4078 ovs_mutex_lock(&pmd->port_mutex);
4079 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
4080 * sizeof *poll_list);
4081
4082 i = 0;
4083 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4084 poll_list[i].rxq = poll->rxq;
4085 poll_list[i].port_no = poll->rxq->port->port_no;
4086 i++;
4087 }
4088
4089 pmd_load_cached_ports(pmd);
4090
4091 ovs_mutex_unlock(&pmd->port_mutex);
4092
4093 *ppoll_list = poll_list;
4094 return i;
4095 }
4096
4097 static void *
4098 pmd_thread_main(void *f_)
4099 {
4100 struct dp_netdev_pmd_thread *pmd = f_;
4101 struct pmd_perf_stats *s = &pmd->perf_stats;
4102 unsigned int lc = 0;
4103 struct polled_queue *poll_list;
4104 bool exiting;
4105 int poll_cnt;
4106 int i;
4107 int process_packets = 0;
4108
4109 poll_list = NULL;
4110
4111 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
4112 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
4113 ovs_numa_thread_setaffinity_core(pmd->core_id);
4114 dpdk_set_lcore_id(pmd->core_id);
4115 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
4116 emc_cache_init(&pmd->flow_cache);
4117 reload:
4118 pmd_alloc_static_tx_qid(pmd);
4119
4120 /* List port/core affinity */
4121 for (i = 0; i < poll_cnt; i++) {
4122 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
4123 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
4124 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
4125 /* Reset the rxq current cycles counter. */
4126 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
4127 }
4128
4129 if (!poll_cnt) {
4130 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
4131 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
4132 poll_block();
4133 }
4134 lc = UINT_MAX;
4135 }
4136
4137 pmd->intrvl_tsc_prev = 0;
4138 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
4139 cycles_counter_update(s);
4140 for (;;) {
4141 uint64_t iter_packets = 0;
4142
4143 pmd_perf_start_iteration(s);
4144 for (i = 0; i < poll_cnt; i++) {
4145 process_packets =
4146 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
4147 poll_list[i].port_no);
4148 iter_packets += process_packets;
4149 }
4150
4151 if (!iter_packets) {
4152 /* We didn't receive anything in the process loop.
4153 * Check if we need to send something.
4154              * Time was not updated during this iteration. */
4155 pmd_thread_ctx_time_update(pmd);
4156 iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false);
4157 }
4158
4159 if (lc++ > 1024) {
4160 bool reload;
4161
4162 lc = 0;
4163
4164 coverage_try_clear();
4165 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
4166 if (!ovsrcu_try_quiesce()) {
4167 emc_cache_slow_sweep(&pmd->flow_cache);
4168 }
4169
4170 atomic_read_relaxed(&pmd->reload, &reload);
4171 if (reload) {
4172 break;
4173 }
4174 }
4175 pmd_perf_end_iteration(s, iter_packets);
4176 }
4177
4178 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
4179 exiting = latch_is_set(&pmd->exit_latch);
4180 /* Signal here to make sure the pmd finishes
4181 * reloading the updated configuration. */
4182 dp_netdev_pmd_reload_done(pmd);
4183
4184 pmd_free_static_tx_qid(pmd);
4185
4186 if (!exiting) {
4187 goto reload;
4188 }
4189
4190 emc_cache_uninit(&pmd->flow_cache);
4191 free(poll_list);
4192 pmd_free_cached_ports(pmd);
4193 return NULL;
4194 }
4195
4196 static void
4197 dp_netdev_disable_upcall(struct dp_netdev *dp)
4198 OVS_ACQUIRES(dp->upcall_rwlock)
4199 {
4200 fat_rwlock_wrlock(&dp->upcall_rwlock);
4201 }
4202
4203 \f
4204 /* Meters */
4205 static void
4206 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
4207 struct ofputil_meter_features *features)
4208 {
4209 features->max_meters = MAX_METERS;
4210 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
4211 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
4212 features->max_bands = MAX_BANDS;
4213 features->max_color = 0;
4214 }
4215
4216 /* Applies meter 'meter_id' to 'packets_', deleting packets that exceed a band. */
4217 static void
4218 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
4219 uint32_t meter_id, long long int now)
4220 {
4221 struct dp_meter *meter;
4222 struct dp_meter_band *band;
4223 struct dp_packet *packet;
4224 long long int long_delta_t; /* msec */
4225 uint32_t delta_t; /* msec */
4226 int i;
4227 const size_t cnt = dp_packet_batch_size(packets_);
4228 uint32_t bytes, volume;
4229 int exceeded_band[NETDEV_MAX_BURST];
4230 uint32_t exceeded_rate[NETDEV_MAX_BURST];
4231 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
4232
4233 if (meter_id >= MAX_METERS) {
4234 return;
4235 }
4236
4237 meter_lock(dp, meter_id);
4238 meter = dp->meters[meter_id];
4239 if (!meter) {
4240 goto out;
4241 }
4242
4243 /* Initialize as negative values. */
4244 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
4245 /* Initialize as zeroes. */
4246 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
4247
4248 /* All packets will hit the meter at the same time. */
4249 long_delta_t = (now - meter->used) / 1000; /* msec */
4250
4251 /* Make sure delta_t will not be too large, so that bucket will not
4252 * wrap around below. */
4253 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
4254 ? meter->max_delta_t : (uint32_t)long_delta_t;
4255
4256 /* Update meter stats. */
4257 meter->used = now;
4258 meter->packet_count += cnt;
4259 bytes = 0;
4260 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
4261 bytes += dp_packet_size(packet);
4262 }
4263 meter->byte_count += bytes;
4264
4265 /* Meters can operate in terms of packets per second or kilobits per
4266 * second. */
4267 if (meter->flags & OFPMF13_PKTPS) {
4268 /* Rate in packets/second, bucket 1/1000 packets. */
4269 /* msec * packets/sec = 1/1000 packets. */
4270 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
4271 } else {
4272 /* Rate in kbps, bucket in bits. */
4273 /* msec * kbps = bits */
4274 volume = bytes * 8;
4275 }
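    /* E.g. for a kbps meter, a 1000 kbps band gains 1000 bits of bucket per
     * elapsed millisecond, while a 1500-byte packet drains 12000 bits. */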
4276
4277 /* Update all bands and find the one hit with the highest rate for each
4278 * packet (if any). */
4279 for (int m = 0; m < meter->n_bands; ++m) {
4280 band = &meter->bands[m];
4281
4282 /* Update band's bucket. */
4283 band->bucket += delta_t * band->up.rate;
4284 if (band->bucket > band->up.burst_size) {
4285 band->bucket = band->up.burst_size;
4286 }
4287
4288 /* Drain the bucket for all the packets, if possible. */
4289 if (band->bucket >= volume) {
4290 band->bucket -= volume;
4291 } else {
4292 int band_exceeded_pkt;
4293
4294 /* Band limit hit, must process packet-by-packet. */
4295 if (meter->flags & OFPMF13_PKTPS) {
4296 band_exceeded_pkt = band->bucket / 1000;
4297 band->bucket %= 1000; /* Remainder stays in bucket. */
4298
4299 /* Update the exceeding band for each exceeding packet.
4300 * (Only one band will be fired by a packet, and that
4301 * can be different for each packet.) */
4302 for (i = band_exceeded_pkt; i < cnt; i++) {
4303 if (band->up.rate > exceeded_rate[i]) {
4304 exceeded_rate[i] = band->up.rate;
4305 exceeded_band[i] = m;
4306 }
4307 }
4308 } else {
4309 /* Packet sizes differ, must process one-by-one. */
4310 band_exceeded_pkt = cnt;
4311 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
4312 uint32_t bits = dp_packet_size(packet) * 8;
4313
4314 if (band->bucket >= bits) {
4315 band->bucket -= bits;
4316 } else {
4317 if (i < band_exceeded_pkt) {
4318 band_exceeded_pkt = i;
4319 }
4320 /* Update the exceeding band for the exceeding packet.
4321 * (Only one band will be fired by a packet, and that
4322 * can be different for each packet.) */
4323 if (band->up.rate > exceeded_rate[i]) {
4324 exceeded_rate[i] = band->up.rate;
4325 exceeded_band[i] = m;
4326 }
4327 }
4328 }
4329 }
4330 /* Remember the first exceeding packet. */
4331 if (exceeded_pkt > band_exceeded_pkt) {
4332 exceeded_pkt = band_exceeded_pkt;
4333 }
4334 }
4335 }
4336
4337 /* Fire the highest rate band exceeded by each packet.
4338 * Drop packets if needed, by swapping packet to the end that will be
4339 * ignored. */
4340 size_t j;
4341 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
4342 if (exceeded_band[j] >= 0) {
4343 /* Meter drop packet. */
4344 band = &meter->bands[exceeded_band[j]];
4345 band->packet_count += 1;
4346 band->byte_count += dp_packet_size(packet);
4347
4348 dp_packet_delete(packet);
4349 } else {
4350 /* Meter accepts packet. */
4351 dp_packet_batch_refill(packets_, packet, j);
4352 }
4353 }
4354 out:
4355 meter_unlock(dp, meter_id);
4356 }
4357
4358 /* Meter set/get/del processing is still single-threaded. */
4359 static int
4360 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
4361 struct ofputil_meter_config *config)
4362 {
4363 struct dp_netdev *dp = get_dp_netdev(dpif);
4364 uint32_t mid = meter_id->uint32;
4365 struct dp_meter *meter;
4366 int i;
4367
4368 if (mid >= MAX_METERS) {
4369 return EFBIG; /* Meter_id out of range. */
4370 }
4371
4372 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK ||
4373 !(config->flags & (OFPMF13_KBPS | OFPMF13_PKTPS))) {
4374 return EBADF; /* Unsupported flags set */
4375 }
4376
4377 /* Validate bands */
4378 if (config->n_bands == 0 || config->n_bands > MAX_BANDS) {
4379         return EINVAL; /* Must have 1 to MAX_BANDS bands. */
4380 }
4381
4382 /* Validate rates */
4383 for (i = 0; i < config->n_bands; i++) {
4384 if (config->bands[i].rate == 0) {
4385 return EDOM; /* rate must be non-zero */
4386 }
4387 }
4388
4389 for (i = 0; i < config->n_bands; ++i) {
4390 switch (config->bands[i].type) {
4391 case OFPMBT13_DROP:
4392 break;
4393 default:
4394 return ENODEV; /* Unsupported band type */
4395 }
4396 }
4397
4398 /* Allocate meter */
4399 meter = xzalloc(sizeof *meter
4400 + config->n_bands * sizeof(struct dp_meter_band));
4401 if (meter) {
4402 meter->flags = config->flags;
4403 meter->n_bands = config->n_bands;
4404 meter->max_delta_t = 0;
4405 meter->used = time_usec();
4406
4407 /* set up bands */
4408 for (i = 0; i < config->n_bands; ++i) {
4409 uint32_t band_max_delta_t;
4410
4411 /* Set burst size to a workable value if none specified. */
4412 if (config->bands[i].burst_size == 0) {
4413 config->bands[i].burst_size = config->bands[i].rate;
4414 }
4415
4416 meter->bands[i].up = config->bands[i];
4417 /* Convert burst size to the bucket units: */
4418 /* pkts => 1/1000 packets, kilobits => bits. */
4419 meter->bands[i].up.burst_size *= 1000;
4420 /* Initialize bucket to empty. */
4421 meter->bands[i].bucket = 0;
4422
4423 /* Figure out max delta_t that is enough to fill any bucket. */
4424 band_max_delta_t
4425 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
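            /* 'burst_size' is in bucket units and 'rate' is in bucket units
             * per msec, so the result is in msec, matching 'delta_t' in
             * dp_netdev_run_meter(). */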
4426 if (band_max_delta_t > meter->max_delta_t) {
4427 meter->max_delta_t = band_max_delta_t;
4428 }
4429 }
4430
4431 meter_lock(dp, mid);
4432 dp_delete_meter(dp, mid); /* Free existing meter, if any */
4433 dp->meters[mid] = meter;
4434 meter_unlock(dp, mid);
4435
4436 return 0;
4437 }
4438 return ENOMEM;
4439 }
4440
4441 static int
4442 dpif_netdev_meter_get(const struct dpif *dpif,
4443 ofproto_meter_id meter_id_,
4444 struct ofputil_meter_stats *stats, uint16_t n_bands)
4445 {
4446 const struct dp_netdev *dp = get_dp_netdev(dpif);
4447 const struct dp_meter *meter;
4448 uint32_t meter_id = meter_id_.uint32;
4449
4450 if (meter_id >= MAX_METERS) {
4451 return EFBIG;
4452 }
4453 meter = dp->meters[meter_id];
4454 if (!meter) {
4455 return ENOENT;
4456 }
4457 if (stats) {
4458 int i = 0;
4459
4460 meter_lock(dp, meter_id);
4461 stats->packet_in_count = meter->packet_count;
4462 stats->byte_in_count = meter->byte_count;
4463
4464 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
4465 stats->bands[i].packet_count = meter->bands[i].packet_count;
4466 stats->bands[i].byte_count = meter->bands[i].byte_count;
4467 }
4468 meter_unlock(dp, meter_id);
4469
4470 stats->n_bands = i;
4471 }
4472 return 0;
4473 }
4474
4475 static int
4476 dpif_netdev_meter_del(struct dpif *dpif,
4477 ofproto_meter_id meter_id_,
4478 struct ofputil_meter_stats *stats, uint16_t n_bands)
4479 {
4480 struct dp_netdev *dp = get_dp_netdev(dpif);
4481 int error;
4482
4483 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
4484 if (!error) {
4485 uint32_t meter_id = meter_id_.uint32;
4486
4487 meter_lock(dp, meter_id);
4488 dp_delete_meter(dp, meter_id);
4489 meter_unlock(dp, meter_id);
4490 }
4491 return error;
4492 }
4493
4494 \f
4495 static void
4496 dpif_netdev_disable_upcall(struct dpif *dpif)
4497 OVS_NO_THREAD_SAFETY_ANALYSIS
4498 {
4499 struct dp_netdev *dp = get_dp_netdev(dpif);
4500 dp_netdev_disable_upcall(dp);
4501 }
4502
4503 static void
4504 dp_netdev_enable_upcall(struct dp_netdev *dp)
4505 OVS_RELEASES(dp->upcall_rwlock)
4506 {
4507 fat_rwlock_unlock(&dp->upcall_rwlock);
4508 }
4509
4510 static void
4511 dpif_netdev_enable_upcall(struct dpif *dpif)
4512 OVS_NO_THREAD_SAFETY_ANALYSIS
4513 {
4514 struct dp_netdev *dp = get_dp_netdev(dpif);
4515 dp_netdev_enable_upcall(dp);
4516 }
4517
4518 static void
4519 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
4520 {
4521 ovs_mutex_lock(&pmd->cond_mutex);
4522 atomic_store_relaxed(&pmd->reload, false);
4523 pmd->last_reload_seq = seq_read(pmd->reload_seq);
4524 xpthread_cond_signal(&pmd->cond);
4525 ovs_mutex_unlock(&pmd->cond_mutex);
4526 }
4527
4528 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
4529  * the pointer on success, otherwise NULL (it can return NULL even if
4530  * 'core_id' is NON_PMD_CORE_ID).
4531  *
4532  * The caller must unref the returned reference. */
4533 static struct dp_netdev_pmd_thread *
4534 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
4535 {
4536 struct dp_netdev_pmd_thread *pmd;
4537 const struct cmap_node *pnode;
4538
4539 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
4540 if (!pnode) {
4541 return NULL;
4542 }
4543 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
4544
4545 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
4546 }
4547
4548 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
4549 static void
4550 dp_netdev_set_nonpmd(struct dp_netdev *dp)
4551 OVS_REQUIRES(dp->port_mutex)
4552 {
4553 struct dp_netdev_pmd_thread *non_pmd;
4554
4555 non_pmd = xzalloc(sizeof *non_pmd);
4556 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
4557 }
4558
4559 /* Caller must have valid pointer to 'pmd'. */
4560 static bool
4561 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
4562 {
4563 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
4564 }
4565
4566 static void
4567 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
4568 {
4569 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
4570 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
4571 }
4572 }
4573
4574 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
4575  * fails, keeps checking subsequent nodes until reaching the end of the cmap.
4576  *
4577  * The caller must unref the returned reference. */
4578 static struct dp_netdev_pmd_thread *
4579 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
4580 {
4581 struct dp_netdev_pmd_thread *next;
4582
4583 do {
4584 struct cmap_node *node;
4585
4586 node = cmap_next_position(&dp->poll_threads, pos);
4587 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
4588 : NULL;
4589 } while (next && !dp_netdev_pmd_try_ref(next));
4590
4591 return next;
4592 }
4593
4594 /* Configures the 'pmd' based on the input argument. */
4595 static void
4596 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
4597 unsigned core_id, int numa_id)
4598 {
4599 pmd->dp = dp;
4600 pmd->core_id = core_id;
4601 pmd->numa_id = numa_id;
4602 pmd->need_reload = false;
4603 pmd->n_output_batches = 0;
4604
4605 ovs_refcount_init(&pmd->ref_cnt);
4606 latch_init(&pmd->exit_latch);
4607 pmd->reload_seq = seq_create();
4608 pmd->last_reload_seq = seq_read(pmd->reload_seq);
4609 atomic_init(&pmd->reload, false);
4610 xpthread_cond_init(&pmd->cond, NULL);
4611 ovs_mutex_init(&pmd->cond_mutex);
4612 ovs_mutex_init(&pmd->flow_mutex);
4613 ovs_mutex_init(&pmd->port_mutex);
4614 cmap_init(&pmd->flow_table);
4615 cmap_init(&pmd->classifiers);
4616 pmd->ctx.last_rxq = NULL;
4617 pmd_thread_ctx_time_update(pmd);
4618 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
4619 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
4620 hmap_init(&pmd->poll_list);
4621 hmap_init(&pmd->tx_ports);
4622 hmap_init(&pmd->tnl_port_cache);
4623 hmap_init(&pmd->send_port_cache);
4624     /* Initialize 'flow_cache' here since there is no
4625      * actual thread created for NON_PMD_CORE_ID. */
4626 if (core_id == NON_PMD_CORE_ID) {
4627 emc_cache_init(&pmd->flow_cache);
4628 pmd_alloc_static_tx_qid(pmd);
4629 }
4630 pmd_perf_stats_init(&pmd->perf_stats);
4631 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
4632 hash_int(core_id, 0));
4633 }
4634
4635 static void
4636 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
4637 {
4638 struct dpcls *cls;
4639
4640 dp_netdev_pmd_flow_flush(pmd);
4641 hmap_destroy(&pmd->send_port_cache);
4642 hmap_destroy(&pmd->tnl_port_cache);
4643 hmap_destroy(&pmd->tx_ports);
4644 hmap_destroy(&pmd->poll_list);
4645 /* All flows (including their dpcls_rules) have been deleted already */
4646 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
4647 dpcls_destroy(cls);
4648 ovsrcu_postpone(free, cls);
4649 }
4650 cmap_destroy(&pmd->classifiers);
4651 cmap_destroy(&pmd->flow_table);
4652 ovs_mutex_destroy(&pmd->flow_mutex);
4653 latch_destroy(&pmd->exit_latch);
4654 seq_destroy(pmd->reload_seq);
4655 xpthread_cond_destroy(&pmd->cond);
4656 ovs_mutex_destroy(&pmd->cond_mutex);
4657 ovs_mutex_destroy(&pmd->port_mutex);
4658 free(pmd);
4659 }
4660
4661 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
4662 * and unrefs the struct. */
4663 static void
4664 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
4665 {
4666 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
4667 * but extra cleanup is necessary */
4668 if (pmd->core_id == NON_PMD_CORE_ID) {
4669 ovs_mutex_lock(&dp->non_pmd_mutex);
4670 emc_cache_uninit(&pmd->flow_cache);
4671 pmd_free_cached_ports(pmd);
4672 pmd_free_static_tx_qid(pmd);
4673 ovs_mutex_unlock(&dp->non_pmd_mutex);
4674 } else {
4675 latch_set(&pmd->exit_latch);
4676 dp_netdev_reload_pmd__(pmd);
4677 xpthread_join(pmd->thread, NULL);
4678 }
4679
4680 dp_netdev_pmd_clear_ports(pmd);
4681
4682 /* Purges the 'pmd''s flows after stopping the thread, but before
4683 * destroying the flows, so that the flow stats can be collected. */
4684 if (dp->dp_purge_cb) {
4685 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
4686 }
4687 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
4688 dp_netdev_pmd_unref(pmd);
4689 }
4690
4691 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
4692 * thread. */
4693 static void
4694 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
4695 {
4696 struct dp_netdev_pmd_thread *pmd;
4697 struct dp_netdev_pmd_thread **pmd_list;
4698 size_t k = 0, n_pmds;
4699
4700 n_pmds = cmap_count(&dp->poll_threads);
4701 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
4702
4703 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4704 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
4705 continue;
4706 }
4707 /* We cannot call dp_netdev_del_pmd(), since it alters
4708 * 'dp->poll_threads' (while we're iterating it) and it
4709 * might quiesce. */
4710 ovs_assert(k < n_pmds);
4711 pmd_list[k++] = pmd;
4712 }
4713
4714 for (size_t i = 0; i < k; i++) {
4715 dp_netdev_del_pmd(dp, pmd_list[i]);
4716 }
4717 free(pmd_list);
4718 }
4719
4720 /* Deletes all rx queues from pmd->poll_list and all the ports from
4721 * pmd->tx_ports. */
4722 static void
4723 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
4724 {
4725 struct rxq_poll *poll;
4726 struct tx_port *port;
4727
4728 ovs_mutex_lock(&pmd->port_mutex);
4729 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
4730 free(poll);
4731 }
4732 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
4733 free(port);
4734 }
4735 ovs_mutex_unlock(&pmd->port_mutex);
4736 }
4737
4738 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
4739 static void
4740 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
4741 struct dp_netdev_rxq *rxq)
4742 OVS_REQUIRES(pmd->port_mutex)
4743 {
4744 int qid = netdev_rxq_get_queue_id(rxq->rx);
4745 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
4746 struct rxq_poll *poll;
4747
4748 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
4749 if (poll->rxq == rxq) {
4750 /* 'rxq' is already polled by this thread. Do nothing. */
4751 return;
4752 }
4753 }
4754
4755 poll = xmalloc(sizeof *poll);
4756 poll->rxq = rxq;
4757 hmap_insert(&pmd->poll_list, &poll->node, hash);
4758
4759 pmd->need_reload = true;
4760 }
4761
4762 /* Delete 'poll' from poll_list of PMD thread. */
4763 static void
4764 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
4765 struct rxq_poll *poll)
4766 OVS_REQUIRES(pmd->port_mutex)
4767 {
4768 hmap_remove(&pmd->poll_list, &poll->node);
4769 free(poll);
4770
4771 pmd->need_reload = true;
4772 }
4773
4774 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
4775 * changes to take effect. */
4776 static void
4777 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
4778 struct dp_netdev_port *port)
4779 OVS_REQUIRES(pmd->port_mutex)
4780 {
4781 struct tx_port *tx;
4782
4783 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
4784 if (tx) {
4785         /* 'port' is already in this thread's tx cache. Do nothing. */
4786 return;
4787 }
4788
4789 tx = xzalloc(sizeof *tx);
4790
4791 tx->port = port;
4792 tx->qid = -1;
4793 tx->flush_time = 0LL;
4794 dp_packet_batch_init(&tx->output_pkts);
4795
4796 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
4797 pmd->need_reload = true;
4798 }
4799
4800 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
4801 * changes to take effect. */
4802 static void
4803 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
4804 struct tx_port *tx)
4805 OVS_REQUIRES(pmd->port_mutex)
4806 {
4807 hmap_remove(&pmd->tx_ports, &tx->node);
4808 free(tx);
4809 pmd->need_reload = true;
4810 }
4811 \f
4812 static char *
4813 dpif_netdev_get_datapath_version(void)
4814 {
4815 return xstrdup("<built-in>");
4816 }
4817
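/* Updates the used time, packet and byte counters and tcp flags of
 * 'netdev_flow' for a batch of 'cnt' packets totalling 'size' bytes. */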
4818 static void
4819 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
4820 uint16_t tcp_flags, long long now)
4821 {
4822 uint16_t flags;
4823
4824 atomic_store_relaxed(&netdev_flow->stats.used, now);
4825 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
4826 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
4827 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
4828 flags |= tcp_flags;
4829 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
4830 }
4831
4832 static int
4833 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
4834 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
4835 enum dpif_upcall_type type, const struct nlattr *userdata,
4836 struct ofpbuf *actions, struct ofpbuf *put_actions)
4837 {
4838 struct dp_netdev *dp = pmd->dp;
4839
4840 if (OVS_UNLIKELY(!dp->upcall_cb)) {
4841 return ENODEV;
4842 }
4843
4844 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
4845 struct ds ds = DS_EMPTY_INITIALIZER;
4846 char *packet_str;
4847 struct ofpbuf key;
4848 struct odp_flow_key_parms odp_parms = {
4849 .flow = flow,
4850 .mask = wc ? &wc->masks : NULL,
4851 .support = dp_netdev_support,
4852 };
4853
4854 ofpbuf_init(&key, 0);
4855 odp_flow_key_from_flow(&odp_parms, &key);
4856 packet_str = ofp_dp_packet_to_string(packet_);
4857
4858 odp_flow_key_format(key.data, key.size, &ds);
4859
4860 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
4861 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
4862
4863 ofpbuf_uninit(&key);
4864 free(packet_str);
4865
4866 ds_destroy(&ds);
4867 }
4868
4869 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
4870 actions, wc, put_actions, dp->upcall_aux);
4871 }
4872
4873 static inline uint32_t
4874 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
4875 const struct miniflow *mf)
4876 {
4877 uint32_t hash;
4878
4879 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4880 hash = dp_packet_get_rss_hash(packet);
4881 } else {
4882 hash = miniflow_hash_5tuple(mf, 0);
4883 dp_packet_set_rss_hash(packet, hash);
4884 }
4885
4886 return hash;
4887 }
4888
4889 static inline uint32_t
4890 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
4891 const struct miniflow *mf)
4892 {
4893 uint32_t hash, recirc_depth;
4894
4895 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4896 hash = dp_packet_get_rss_hash(packet);
4897 } else {
4898 hash = miniflow_hash_5tuple(mf, 0);
4899 dp_packet_set_rss_hash(packet, hash);
4900 }
4901
4902 /* The RSS hash must account for the recirculation depth to avoid
4903      * collisions in the exact match cache. */
4904 recirc_depth = *recirc_depth_get_unsafe();
4905 if (OVS_UNLIKELY(recirc_depth)) {
4906 hash = hash_finish(hash, recirc_depth);
4907 dp_packet_set_rss_hash(packet, hash);
4908 }
4909 return hash;
4910 }
4911
4912 struct packet_batch_per_flow {
4913 unsigned int byte_count;
4914 uint16_t tcp_flags;
4915 struct dp_netdev_flow *flow;
4916
4917 struct dp_packet_batch array;
4918 };
4919
4920 static inline void
4921 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
4922 struct dp_packet *packet,
4923 const struct miniflow *mf)
4924 {
4925 batch->byte_count += dp_packet_size(packet);
4926 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
4927 batch->array.packets[batch->array.count++] = packet;
4928 }
4929
4930 static inline void
4931 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
4932 struct dp_netdev_flow *flow)
4933 {
4934 flow->batch = batch;
4935
4936 batch->flow = flow;
4937 dp_packet_batch_init(&batch->array);
4938 batch->byte_count = 0;
4939 batch->tcp_flags = 0;
4940 }
4941
4942 static inline void
4943 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
4944 struct dp_netdev_pmd_thread *pmd)
4945 {
4946 struct dp_netdev_actions *actions;
4947 struct dp_netdev_flow *flow = batch->flow;
4948
4949 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
4950 batch->tcp_flags, pmd->ctx.now / 1000);
4951
4952 actions = dp_netdev_flow_get_actions(flow);
4953
4954 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
4955 actions->actions, actions->size);
4956 }
4957
4958 static inline void
4959 dp_netdev_queue_batches(struct dp_packet *pkt,
4960 struct dp_netdev_flow *flow, const struct miniflow *mf,
4961 struct packet_batch_per_flow *batches,
4962 size_t *n_batches)
4963 {
4964 struct packet_batch_per_flow *batch = flow->batch;
4965
4966 if (OVS_UNLIKELY(!batch)) {
4967 batch = &batches[(*n_batches)++];
4968 packet_batch_per_flow_init(batch, flow);
4969 }
4970
4971 packet_batch_per_flow_update(batch, pkt, mf);
4972 }
4973
4974 /* Tries to process all 'cnt' packets in 'packets_' using only the exact
4975  * match cache 'pmd->flow_cache'. If a flow is not found for a packet
4976  * 'packets[i]', its miniflow is copied into 'keys' and the packet pointer is
4977  * moved to the beginning of the 'packets' array.
4978  *
4979  * The function returns the number of packets that need to be processed in the
4980  * 'packets' array (they have been moved to the beginning of the vector).
4981 *
4982 * For performance reasons a caller may choose not to initialize the metadata
4983 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
4984 * is not valid and must be initialized by this function using 'port_no'.
4985 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
4986 * will be ignored.
4987 */
4988 static inline size_t
4989 emc_processing(struct dp_netdev_pmd_thread *pmd,
4990 struct dp_packet_batch *packets_,
4991 struct netdev_flow_key *keys,
4992 struct packet_batch_per_flow batches[], size_t *n_batches,
4993 bool md_is_valid, odp_port_t port_no)
4994 {
4995 struct emc_cache *flow_cache = &pmd->flow_cache;
4996 struct netdev_flow_key *key = &keys[0];
4997 size_t n_missed = 0, n_dropped = 0;
4998 struct dp_packet *packet;
4999 const size_t cnt = dp_packet_batch_size(packets_);
5000 uint32_t cur_min;
5001 int i;
5002
5003 atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
5004 pmd_perf_update_counter(&pmd->perf_stats,
5005 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
5006 cnt);
5007
5008 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
5009 struct dp_netdev_flow *flow;
5010
5011 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
5012 dp_packet_delete(packet);
5013 n_dropped++;
5014 continue;
5015 }
5016
5017 if (i != cnt - 1) {
5018 struct dp_packet **packets = packets_->packets;
5019 /* Prefetch next packet data and metadata. */
5020 OVS_PREFETCH(dp_packet_data(packets[i+1]));
5021 pkt_metadata_prefetch_init(&packets[i+1]->md);
5022 }
5023
5024 if (!md_is_valid) {
5025 pkt_metadata_init(&packet->md, port_no);
5026 }
5027 miniflow_extract(packet, &key->mf);
5028 key->len = 0; /* Not computed yet. */
5029 /* If EMC is disabled skip hash computation and emc_lookup */
5030 if (cur_min) {
5031 if (!md_is_valid) {
5032 key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
5033 &key->mf);
5034 } else {
5035 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
5036 }
5037 flow = emc_lookup(flow_cache, key);
5038 } else {
5039 flow = NULL;
5040 }
5041 if (OVS_LIKELY(flow)) {
5042 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
5043 n_batches);
5044 } else {
5045 /* Exact match cache missed. Group missed packets together at
5046 * the beginning of the 'packets' array. */
5047 dp_packet_batch_refill(packets_, packet, i);
5048 /* 'key[n_missed]' contains the key of the current packet and it
5049 * must be returned to the caller. The next key should be extracted
5050 * to 'keys[n_missed + 1]'. */
5051 key = &keys[++n_missed];
5052 }
5053 }
5054
5055 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT,
5056 cnt - n_dropped - n_missed);
5057
5058 return dp_packet_batch_size(packets_);
5059 }
5060
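/* Performs a miss upcall for 'packet', executes the returned actions and,
 * unless the upcall returned ENOSPC, installs the resulting flow on 'pmd'
 * and may insert it into the EMC. Returns 0 on success. */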
5061 static inline int
5062 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
5063 struct dp_packet *packet,
5064 const struct netdev_flow_key *key,
5065 struct ofpbuf *actions, struct ofpbuf *put_actions)
5066 {
5067 struct ofpbuf *add_actions;
5068 struct dp_packet_batch b;
5069 struct match match;
5070 ovs_u128 ufid;
5071 int error;
5072
5073 match.tun_md.valid = false;
5074 miniflow_expand(&key->mf, &match.flow);
5075
5076 ofpbuf_clear(actions);
5077 ofpbuf_clear(put_actions);
5078
5079 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
5080 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
5081 &ufid, DPIF_UC_MISS, NULL, actions,
5082 put_actions);
5083 if (OVS_UNLIKELY(error && error != ENOSPC)) {
5084 dp_packet_delete(packet);
5085 return error;
5086 }
5087
5088 /* The Netlink encoding of datapath flow keys cannot express
5089 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
5090 * tag is interpreted as exact match on the fact that there is no
5091 * VLAN. Unless we refactor a lot of code that translates between
5092 * Netlink and struct flow representations, we have to do the same
5093 * here. */
5094 if (!match.wc.masks.vlans[0].tci) {
5095 match.wc.masks.vlans[0].tci = htons(0xffff);
5096 }
5097
5098 /* We can't allow the packet batching in the next loop to execute
5099 * the actions. Otherwise, if there are any slow path actions,
5100 * we'll send the packet up twice. */
5101 dp_packet_batch_init_packet(&b, packet);
5102 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
5103 actions->data, actions->size);
5104
5105 add_actions = put_actions->size ? put_actions : actions;
5106 if (OVS_LIKELY(error != ENOSPC)) {
5107 struct dp_netdev_flow *netdev_flow;
5108
5109 /* XXX: There's a race window where a flow covering this packet
5110 * could have already been installed since we last did the flow
5111 * lookup before upcall. This could be solved by moving the
5112 * mutex lock outside the loop, but that's an awful long time
5113 * to be locking everyone out of making flow installs. If we
5114 * move to a per-core classifier, it would be reasonable. */
5115 ovs_mutex_lock(&pmd->flow_mutex);
5116 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
5117 if (OVS_LIKELY(!netdev_flow)) {
5118 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
5119 add_actions->data,
5120 add_actions->size);
5121 }
5122 ovs_mutex_unlock(&pmd->flow_mutex);
5123 emc_probabilistic_insert(pmd, key, netdev_flow);
5124 }
5125 return error;
5126 }
5127
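/* Classifies the packets that missed the EMC using the dpcls of their input
 * port, sending an upcall for any packet that also misses there, and queues
 * the matched packets into the per-flow 'batches'. */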
5128 static inline void
5129 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
5130 struct dp_packet_batch *packets_,
5131 struct netdev_flow_key *keys,
5132 struct packet_batch_per_flow batches[],
5133 size_t *n_batches,
5134 odp_port_t in_port)
5135 {
5136 const size_t cnt = dp_packet_batch_size(packets_);
5137 #if !defined(__CHECKER__) && !defined(_WIN32)
5138 const size_t PKT_ARRAY_SIZE = cnt;
5139 #else
5140     /* Neither sparse nor MSVC likes variable-length arrays. */
5141 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
5142 #endif
5143 struct dp_packet *packet;
5144 struct dpcls *cls;
5145 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
5146 struct dp_netdev *dp = pmd->dp;
5147 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
5148 int lookup_cnt = 0, add_lookup_cnt;
5149 bool any_miss;
5150 size_t i;
5151
5152 for (i = 0; i < cnt; i++) {
5153         /* Key length is needed in all cases; the hash is computed on demand. */
5154 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
5155 }
5156 /* Get the classifier for the in_port */
5157 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
5158 if (OVS_LIKELY(cls)) {
5159 any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
5160 } else {
5161 any_miss = true;
5162 memset(rules, 0, sizeof(rules));
5163 }
5164 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
5165 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
5166 struct ofpbuf actions, put_actions;
5167
5168 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
5169 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
5170
5171 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5172 struct dp_netdev_flow *netdev_flow;
5173
5174 if (OVS_LIKELY(rules[i])) {
5175 continue;
5176 }
5177
5178 /* It's possible that an earlier slow path execution installed
5179 * a rule covering this flow. In this case, it's a lot cheaper
5180 * to catch it here than execute a miss. */
5181 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
5182 &add_lookup_cnt);
5183 if (netdev_flow) {
5184 lookup_cnt += add_lookup_cnt;
5185 rules[i] = &netdev_flow->cr;
5186 continue;
5187 }
5188
5189 int error = handle_packet_upcall(pmd, packet, &keys[i],
5190 &actions, &put_actions);
5191
5192 if (OVS_UNLIKELY(error)) {
5193 upcall_fail_cnt++;
5194 } else {
5195 upcall_ok_cnt++;
5196 }
5197 }
5198
5199 ofpbuf_uninit(&actions);
5200 ofpbuf_uninit(&put_actions);
5201 fat_rwlock_unlock(&dp->upcall_rwlock);
5202 } else if (OVS_UNLIKELY(any_miss)) {
5203 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5204 if (OVS_UNLIKELY(!rules[i])) {
5205 dp_packet_delete(packet);
5206 upcall_fail_cnt++;
5207 }
5208 }
5209 }
5210
5211 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5212 struct dp_netdev_flow *flow;
5213
5214 if (OVS_UNLIKELY(!rules[i])) {
5215 continue;
5216 }
5217
5218 flow = dp_netdev_flow_cast(rules[i]);
5219
5220 emc_probabilistic_insert(pmd, &keys[i], flow);
5221 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
5222 }
5223
5224 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
5225 cnt - upcall_ok_cnt - upcall_fail_cnt);
5226 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
5227 lookup_cnt);
5228 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
5229 upcall_ok_cnt);
5230 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
5231 upcall_fail_cnt);
5232 }
5233
5234 /* Packets enter the datapath from a port (or from recirculation) here.
5235 *
5236 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
5237 * When false the metadata in 'packets' need to be initialized. */
5238 static void
5239 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
5240 struct dp_packet_batch *packets,
5241 bool md_is_valid, odp_port_t port_no)
5242 {
5243 #if !defined(__CHECKER__) && !defined(_WIN32)
5244 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
5245 #else
5246 /* Sparse or MSVC doesn't like variable length array. */
5247     /* Neither sparse nor MSVC likes variable-length arrays. */
5248 #endif
5249 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
5250 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
5251 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
5252 size_t n_batches;
5253 odp_port_t in_port;
5254
5255 n_batches = 0;
5256 emc_processing(pmd, packets, keys, batches, &n_batches,
5257 md_is_valid, port_no);
5258 if (!dp_packet_batch_is_empty(packets)) {
5259 /* Get ingress port from first packet's metadata. */
5260 in_port = packets->packets[0]->md.in_port.odp_port;
5261 fast_path_processing(pmd, packets, keys,
5262 batches, &n_batches, in_port);
5263 }
5264
5265 /* All the flow batches need to be reset before any call to
5266 * packet_batch_per_flow_execute() as it could potentially trigger
5267 * recirculation. When a packet matching flow ‘j’ happens to be
5268 * recirculated, the nested call to dp_netdev_input__() could potentially
5269 * classify the packet as matching another flow - say 'k'. It could happen
5270 * that in the previous call to dp_netdev_input__() that same flow 'k' had
5271 * already its own batches[k] still waiting to be served. So if its
5272 * ‘batch’ member is not reset, the recirculated packet would be wrongly
5273 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
5274 size_t i;
5275 for (i = 0; i < n_batches; i++) {
5276 batches[i].flow->batch = NULL;
5277 }
5278
5279 for (i = 0; i < n_batches; i++) {
5280 packet_batch_per_flow_execute(&batches[i], pmd);
5281 }
5282 }
5283
5284 static void
5285 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
5286 struct dp_packet_batch *packets,
5287 odp_port_t port_no)
5288 {
5289 dp_netdev_input__(pmd, packets, false, port_no);
5290 }
5291
5292 static void
5293 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
5294 struct dp_packet_batch *packets)
5295 {
5296 dp_netdev_input__(pmd, packets, true, 0);
5297 }
5298
5299 struct dp_netdev_execute_aux {
5300 struct dp_netdev_pmd_thread *pmd;
5301 const struct flow *flow;
5302 };
5303
5304 static void
5305 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
5306 void *aux)
5307 {
5308 struct dp_netdev *dp = get_dp_netdev(dpif);
5309 dp->dp_purge_aux = aux;
5310 dp->dp_purge_cb = cb;
5311 }
5312
5313 static void
5314 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
5315 void *aux)
5316 {
5317 struct dp_netdev *dp = get_dp_netdev(dpif);
5318 dp->upcall_aux = aux;
5319 dp->upcall_cb = cb;
5320 }
5321
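/* Releases the tx queue ids held by 'pmd' for dynamic-txqs ports that have
 * not been used for longer than XPS_TIMEOUT, or unconditionally if 'purge'
 * is true. */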
5322 static void
5323 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
5324 bool purge)
5325 {
5326 struct tx_port *tx;
5327 struct dp_netdev_port *port;
5328 long long interval;
5329
5330 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
5331 if (!tx->port->dynamic_txqs) {
5332 continue;
5333 }
5334 interval = pmd->ctx.now - tx->last_used;
5335 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
5336 port = tx->port;
5337 ovs_mutex_lock(&port->txq_used_mutex);
5338 port->txq_used[tx->qid]--;
5339 ovs_mutex_unlock(&port->txq_used_mutex);
5340 tx->qid = -1;
5341 }
5342 }
5343 }
5344
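/* Returns a tx queue id for 'tx'. Reuses the cached id if it is still fresh,
 * otherwise picks the least used queue of the port and records the choice in
 * 'port->txq_used'. */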
5345 static int
5346 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
5347 struct tx_port *tx)
5348 {
5349 struct dp_netdev_port *port;
5350 long long interval;
5351 int i, min_cnt, min_qid;
5352
5353 interval = pmd->ctx.now - tx->last_used;
5354 tx->last_used = pmd->ctx.now;
5355
5356 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
5357 return tx->qid;
5358 }
5359
5360 port = tx->port;
5361
5362 ovs_mutex_lock(&port->txq_used_mutex);
5363 if (tx->qid >= 0) {
5364 port->txq_used[tx->qid]--;
5365 tx->qid = -1;
5366 }
5367
5368 min_cnt = -1;
5369 min_qid = 0;
5370 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
5371 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
5372 min_cnt = port->txq_used[i];
5373 min_qid = i;
5374 }
5375 }
5376
5377 port->txq_used[min_qid]++;
5378 tx->qid = min_qid;
5379
5380 ovs_mutex_unlock(&port->txq_used_mutex);
5381
5382 dpif_netdev_xps_revalidate_pmd(pmd, false);
5383
5384 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
5385 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
5386 return min_qid;
5387 }
5388
5389 static struct tx_port *
5390 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5391 odp_port_t port_no)
5392 {
5393 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
5394 }
5395
5396 static struct tx_port *
5397 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5398 odp_port_t port_no)
5399 {
5400 return tx_port_lookup(&pmd->send_port_cache, port_no);
5401 }
5402
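/* Pushes the tunnel header described by 'attr' onto every packet in 'batch'
 * using the tunnel port's netdev. On failure the batch is deleted and an
 * error is returned. */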
5403 static int
5404 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
5405 const struct nlattr *attr,
5406 struct dp_packet_batch *batch)
5407 {
5408 struct tx_port *tun_port;
5409 const struct ovs_action_push_tnl *data;
5410 int err;
5411
5412 data = nl_attr_get(attr);
5413
5414 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
5415 if (!tun_port) {
5416 err = -EINVAL;
5417 goto error;
5418 }
5419 err = netdev_push_header(tun_port->port->netdev, batch, data);
5420 if (!err) {
5421 return 0;
5422 }
5423 error:
5424 dp_packet_delete_batch(batch, true);
5425 return err;
5426 }
5427
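/* Sends 'packet' to userspace through a DPIF_UC_ACTION upcall and executes
 * the actions it returns. If the upcall fails and 'may_steal' is true the
 * packet is deleted. */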
5428 static void
5429 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
5430 struct dp_packet *packet, bool may_steal,
5431 struct flow *flow, ovs_u128 *ufid,
5432 struct ofpbuf *actions,
5433 const struct nlattr *userdata)
5434 {
5435 struct dp_packet_batch b;
5436 int error;
5437
5438 ofpbuf_clear(actions);
5439
5440 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
5441 DPIF_UC_ACTION, userdata, actions,
5442 NULL);
5443 if (!error || error == ENOSPC) {
5444 dp_packet_batch_init_packet(&b, packet);
5445 dp_netdev_execute_actions(pmd, &b, may_steal, flow,
5446 actions->data, actions->size);
5447 } else if (may_steal) {
5448 dp_packet_delete(packet);
5449 }
5450 }
5451
5452 static void
5453 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
5454 const struct nlattr *a, bool may_steal)
5455 OVS_NO_THREAD_SAFETY_ANALYSIS
5456 {
5457 struct dp_netdev_execute_aux *aux = aux_;
5458 uint32_t *depth = recirc_depth_get();
5459 struct dp_netdev_pmd_thread *pmd = aux->pmd;
5460 struct dp_netdev *dp = pmd->dp;
5461 int type = nl_attr_type(a);
5462 struct tx_port *p;
5463
5464 switch ((enum ovs_action_attr)type) {
5465 case OVS_ACTION_ATTR_OUTPUT:
5466 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
5467 if (OVS_LIKELY(p)) {
5468 struct dp_packet *packet;
5469 struct dp_packet_batch out;
5470
5471 if (!may_steal) {
5472 dp_packet_batch_clone(&out, packets_);
5473 dp_packet_batch_reset_cutlen(packets_);
5474 packets_ = &out;
5475 }
5476 dp_packet_batch_apply_cutlen(packets_);
5477
5478 #ifdef DPDK_NETDEV
5479 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
5480 && packets_->packets[0]->source
5481 != p->output_pkts.packets[0]->source)) {
5482 /* XXX: netdev-dpdk assumes that all packets in a single
5483                  * output batch have the same source. Flush here to
5484 * avoid memory access issues. */
5485 dp_netdev_pmd_flush_output_on_port(pmd, p);
5486 }
5487 #endif
5488 if (dp_packet_batch_size(&p->output_pkts)
5489 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
5490 /* Flush here to avoid overflow. */
5491 dp_netdev_pmd_flush_output_on_port(pmd, p);
5492 }
5493
5494 if (dp_packet_batch_is_empty(&p->output_pkts)) {
5495 pmd->n_output_batches++;
5496 }
5497
5498 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5499 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
5500 pmd->ctx.last_rxq;
5501 dp_packet_batch_add(&p->output_pkts, packet);
5502 }
5503 return;
5504 }
5505 break;
5506
5507 case OVS_ACTION_ATTR_TUNNEL_PUSH:
5508 if (*depth < MAX_RECIRC_DEPTH) {
5509 dp_packet_batch_apply_cutlen(packets_);
5510 push_tnl_action(pmd, a, packets_);
5511 return;
5512 }
5513 break;
5514
5515 case OVS_ACTION_ATTR_TUNNEL_POP:
5516 if (*depth < MAX_RECIRC_DEPTH) {
5517 struct dp_packet_batch *orig_packets_ = packets_;
5518 odp_port_t portno = nl_attr_get_odp_port(a);
5519
5520 p = pmd_tnl_port_cache_lookup(pmd, portno);
5521 if (p) {
5522 struct dp_packet_batch tnl_pkt;
5523
5524 if (!may_steal) {
5525 dp_packet_batch_clone(&tnl_pkt, packets_);
5526 packets_ = &tnl_pkt;
5527 dp_packet_batch_reset_cutlen(orig_packets_);
5528 }
5529
5530 dp_packet_batch_apply_cutlen(packets_);
5531
5532 netdev_pop_header(p->port->netdev, packets_);
5533 if (dp_packet_batch_is_empty(packets_)) {
5534 return;
5535 }
5536
5537 struct dp_packet *packet;
5538 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5539 packet->md.in_port.odp_port = portno;
5540 }
5541
5542 (*depth)++;
5543 dp_netdev_recirculate(pmd, packets_);
5544 (*depth)--;
5545 return;
5546 }
5547 }
5548 break;
5549
5550 case OVS_ACTION_ATTR_USERSPACE:
5551 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
5552 struct dp_packet_batch *orig_packets_ = packets_;
5553 const struct nlattr *userdata;
5554 struct dp_packet_batch usr_pkt;
5555 struct ofpbuf actions;
5556 struct flow flow;
5557 ovs_u128 ufid;
5558 bool clone = false;
5559
5560 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
5561 ofpbuf_init(&actions, 0);
5562
5563 if (packets_->trunc) {
5564 if (!may_steal) {
5565 dp_packet_batch_clone(&usr_pkt, packets_);
5566 packets_ = &usr_pkt;
5567 clone = true;
5568 dp_packet_batch_reset_cutlen(orig_packets_);
5569 }
5570
5571 dp_packet_batch_apply_cutlen(packets_);
5572 }
5573
5574 struct dp_packet *packet;
5575 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5576 flow_extract(packet, &flow);
5577 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
5578 dp_execute_userspace_action(pmd, packet, may_steal, &flow,
5579 &ufid, &actions, userdata);
5580 }
5581
5582 if (clone) {
5583 dp_packet_delete_batch(packets_, true);
5584 }
5585
5586 ofpbuf_uninit(&actions);
5587 fat_rwlock_unlock(&dp->upcall_rwlock);
5588
5589 return;
5590 }
5591 break;
5592
5593 case OVS_ACTION_ATTR_RECIRC:
5594 if (*depth < MAX_RECIRC_DEPTH) {
5595 struct dp_packet_batch recirc_pkts;
5596
5597 if (!may_steal) {
5598 dp_packet_batch_clone(&recirc_pkts, packets_);
5599 packets_ = &recirc_pkts;
5600 }
5601
5602 struct dp_packet *packet;
5603 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5604 packet->md.recirc_id = nl_attr_get_u32(a);
5605 }
5606
5607 (*depth)++;
5608 dp_netdev_recirculate(pmd, packets_);
5609 (*depth)--;
5610
5611 return;
5612 }
5613
5614 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
5615 break;
5616
5617 case OVS_ACTION_ATTR_CT: {
5618 const struct nlattr *b;
5619 bool force = false;
5620 bool commit = false;
5621 unsigned int left;
5622 uint16_t zone = 0;
5623 const char *helper = NULL;
5624 const uint32_t *setmark = NULL;
5625 const struct ovs_key_ct_labels *setlabel = NULL;
5626 struct nat_action_info_t nat_action_info;
5627 struct nat_action_info_t *nat_action_info_ref = NULL;
5628 bool nat_config = false;
5629
5630 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
5631 nl_attr_get_size(a)) {
5632 enum ovs_ct_attr sub_type = nl_attr_type(b);
5633
5634 switch(sub_type) {
5635 case OVS_CT_ATTR_FORCE_COMMIT:
5636 force = true;
5637 /* fall through. */
5638 case OVS_CT_ATTR_COMMIT:
5639 commit = true;
5640 break;
5641 case OVS_CT_ATTR_ZONE:
5642 zone = nl_attr_get_u16(b);
5643 break;
5644 case OVS_CT_ATTR_HELPER:
5645 helper = nl_attr_get_string(b);
5646 break;
5647 case OVS_CT_ATTR_MARK:
5648 setmark = nl_attr_get(b);
5649 break;
5650 case OVS_CT_ATTR_LABELS:
5651 setlabel = nl_attr_get(b);
5652 break;
5653 case OVS_CT_ATTR_EVENTMASK:
5654 /* Silently ignored, as the userspace datapath does not
5655 * generate netlink events. */
5656 break;
5657 case OVS_CT_ATTR_NAT: {
5658 const struct nlattr *b_nest;
5659 unsigned int left_nest;
5660 bool ip_min_specified = false;
5661 bool proto_num_min_specified = false;
5662 bool ip_max_specified = false;
5663 bool proto_num_max_specified = false;
5664 memset(&nat_action_info, 0, sizeof nat_action_info);
5665 nat_action_info_ref = &nat_action_info;
5666
5667 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
5668 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
5669
5670 switch (sub_type_nest) {
5671 case OVS_NAT_ATTR_SRC:
5672 case OVS_NAT_ATTR_DST:
5673 nat_config = true;
5674 nat_action_info.nat_action |=
5675 ((sub_type_nest == OVS_NAT_ATTR_SRC)
5676 ? NAT_ACTION_SRC : NAT_ACTION_DST);
5677 break;
5678 case OVS_NAT_ATTR_IP_MIN:
5679 memcpy(&nat_action_info.min_addr,
5680 nl_attr_get(b_nest),
5681 nl_attr_get_size(b_nest));
5682 ip_min_specified = true;
5683 break;
5684 case OVS_NAT_ATTR_IP_MAX:
5685 memcpy(&nat_action_info.max_addr,
5686 nl_attr_get(b_nest),
5687 nl_attr_get_size(b_nest));
5688 ip_max_specified = true;
5689 break;
5690 case OVS_NAT_ATTR_PROTO_MIN:
5691 nat_action_info.min_port =
5692 nl_attr_get_u16(b_nest);
5693 proto_num_min_specified = true;
5694 break;
5695 case OVS_NAT_ATTR_PROTO_MAX:
5696 nat_action_info.max_port =
5697 nl_attr_get_u16(b_nest);
5698 proto_num_max_specified = true;
5699 break;
5700 case OVS_NAT_ATTR_PERSISTENT:
5701 case OVS_NAT_ATTR_PROTO_HASH:
5702 case OVS_NAT_ATTR_PROTO_RANDOM:
5703 break;
5704 case OVS_NAT_ATTR_UNSPEC:
5705 case __OVS_NAT_ATTR_MAX:
5706 OVS_NOT_REACHED();
5707 }
5708 }
5709
5710 if (ip_min_specified && !ip_max_specified) {
5711 nat_action_info.max_addr = nat_action_info.min_addr;
5712 }
5713 if (proto_num_min_specified && !proto_num_max_specified) {
5714 nat_action_info.max_port = nat_action_info.min_port;
5715 }
5716 if (proto_num_min_specified || proto_num_max_specified) {
5717 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
5718 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
5719 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
5720 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
5721 }
5722 }
5723 break;
5724 }
5725 case OVS_CT_ATTR_UNSPEC:
5726 case __OVS_CT_ATTR_MAX:
5727 OVS_NOT_REACHED();
5728 }
5729 }
5730
5731 /* We won't be able to function properly in this case, hence
5732 * complain loudly. */
5733 if (nat_config && !commit) {
5734 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
5735 VLOG_WARN_RL(&rl, "NAT specified without commit.");
5736 }
5737
5738 conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
5739 commit, zone, setmark, setlabel, aux->flow->tp_src,
5740 aux->flow->tp_dst, helper, nat_action_info_ref,
5741 pmd->ctx.now / 1000);
5742 break;
5743 }
5744
5745 case OVS_ACTION_ATTR_METER:
5746 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
5747 pmd->ctx.now);
5748 break;
5749
5750 case OVS_ACTION_ATTR_PUSH_VLAN:
5751 case OVS_ACTION_ATTR_POP_VLAN:
5752 case OVS_ACTION_ATTR_PUSH_MPLS:
5753 case OVS_ACTION_ATTR_POP_MPLS:
5754 case OVS_ACTION_ATTR_SET:
5755 case OVS_ACTION_ATTR_SET_MASKED:
5756 case OVS_ACTION_ATTR_SAMPLE:
5757 case OVS_ACTION_ATTR_HASH:
5758 case OVS_ACTION_ATTR_UNSPEC:
5759 case OVS_ACTION_ATTR_TRUNC:
5760 case OVS_ACTION_ATTR_PUSH_ETH:
5761 case OVS_ACTION_ATTR_POP_ETH:
5762 case OVS_ACTION_ATTR_CLONE:
5763 case OVS_ACTION_ATTR_PUSH_NSH:
5764 case OVS_ACTION_ATTR_POP_NSH:
5765 case __OVS_ACTION_ATTR_MAX:
5766 OVS_NOT_REACHED();
5767 }
5768
5769 dp_packet_delete_batch(packets_, may_steal);
5770 }
5771
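/* Executes 'actions' on 'packets' on behalf of 'pmd', using dp_execute_cb()
 * for the actions that require datapath support. */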
5772 static void
5773 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
5774 struct dp_packet_batch *packets,
5775 bool may_steal, const struct flow *flow,
5776 const struct nlattr *actions, size_t actions_len)
5777 {
5778 struct dp_netdev_execute_aux aux = { pmd, flow };
5779
5780 odp_execute_actions(&aux, packets, may_steal, actions,
5781 actions_len, dp_execute_cb);
5782 }
5783
5784 struct dp_netdev_ct_dump {
5785 struct ct_dpif_dump_state up;
5786 struct conntrack_dump dump;
5787 struct conntrack *ct;
5788 struct dp_netdev *dp;
5789 };
5790
5791 static int
5792 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
5793 const uint16_t *pzone, int *ptot_bkts)
5794 {
5795 struct dp_netdev *dp = get_dp_netdev(dpif);
5796 struct dp_netdev_ct_dump *dump;
5797
5798 dump = xzalloc(sizeof *dump);
5799 dump->dp = dp;
5800 dump->ct = &dp->conntrack;
5801
5802 conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
5803
5804 *dump_ = &dump->up;
5805
5806 return 0;
5807 }
5808
5809 static int
5810 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
5811 struct ct_dpif_dump_state *dump_,
5812 struct ct_dpif_entry *entry)
5813 {
5814 struct dp_netdev_ct_dump *dump;
5815
5816 INIT_CONTAINER(dump, dump_, up);
5817
5818 return conntrack_dump_next(&dump->dump, entry);
5819 }
5820
5821 static int
5822 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
5823 struct ct_dpif_dump_state *dump_)
5824 {
5825 struct dp_netdev_ct_dump *dump;
5826 int err;
5827
5828 INIT_CONTAINER(dump, dump_, up);
5829
5830 err = conntrack_dump_done(&dump->dump);
5831
5832 free(dump);
5833
5834 return err;
5835 }
5836
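/* Flushes connection-tracking entries, optionally restricted to 'zone'.
 * Flushing a single 5-tuple is not implemented here, so a non-NULL 'tuple'
 * yields EOPNOTSUPP. */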
5837 static int
5838 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
5839 const struct ct_dpif_tuple *tuple)
5840 {
5841 struct dp_netdev *dp = get_dp_netdev(dpif);
5842
5843 if (tuple) {
5844 return EOPNOTSUPP;
5845 }
5846 return conntrack_flush(&dp->conntrack, zone);
5847 }
5848
5849 static int
5850 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
5851 {
5852 struct dp_netdev *dp = get_dp_netdev(dpif);
5853
5854 return conntrack_set_maxconns(&dp->conntrack, maxconns);
5855 }
5856
5857 static int
5858 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
5859 {
5860 struct dp_netdev *dp = get_dp_netdev(dpif);
5861
5862 return conntrack_get_maxconns(&dp->conntrack, maxconns);
5863 }
5864
5865 static int
5866 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
5867 {
5868 struct dp_netdev *dp = get_dp_netdev(dpif);
5869
5870 return conntrack_get_nconns(&dp->conntrack, nconns);
5871 }
5872
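/* dpif provider for the userspace datapath. NULL members are callbacks this
 * implementation does not need; in particular, upcalls are delivered through
 * the callback registered with dpif_netdev_register_upcall_cb() rather than
 * through a recv queue. */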
5873 const struct dpif_class dpif_netdev_class = {
5874 "netdev",
5875 dpif_netdev_init,
5876 dpif_netdev_enumerate,
5877 dpif_netdev_port_open_type,
5878 dpif_netdev_open,
5879 dpif_netdev_close,
5880 dpif_netdev_destroy,
5881 dpif_netdev_run,
5882 dpif_netdev_wait,
5883 dpif_netdev_get_stats,
5884 dpif_netdev_port_add,
5885 dpif_netdev_port_del,
5886 dpif_netdev_port_set_config,
5887 dpif_netdev_port_query_by_number,
5888 dpif_netdev_port_query_by_name,
5889 NULL, /* port_get_pid */
5890 dpif_netdev_port_dump_start,
5891 dpif_netdev_port_dump_next,
5892 dpif_netdev_port_dump_done,
5893 dpif_netdev_port_poll,
5894 dpif_netdev_port_poll_wait,
5895 dpif_netdev_flow_flush,
5896 dpif_netdev_flow_dump_create,
5897 dpif_netdev_flow_dump_destroy,
5898 dpif_netdev_flow_dump_thread_create,
5899 dpif_netdev_flow_dump_thread_destroy,
5900 dpif_netdev_flow_dump_next,
5901 dpif_netdev_operate,
5902 NULL, /* recv_set */
5903 NULL, /* handlers_set */
5904 dpif_netdev_set_config,
5905 dpif_netdev_queue_to_priority,
5906 NULL, /* recv */
5907 NULL, /* recv_wait */
5908 NULL, /* recv_purge */
5909 dpif_netdev_register_dp_purge_cb,
5910 dpif_netdev_register_upcall_cb,
5911 dpif_netdev_enable_upcall,
5912 dpif_netdev_disable_upcall,
5913 dpif_netdev_get_datapath_version,
5914 dpif_netdev_ct_dump_start,
5915 dpif_netdev_ct_dump_next,
5916 dpif_netdev_ct_dump_done,
5917 dpif_netdev_ct_flush,
5918 dpif_netdev_ct_set_maxconns,
5919 dpif_netdev_ct_get_maxconns,
5920 dpif_netdev_ct_get_nconns,
5921 dpif_netdev_meter_get_features,
5922 dpif_netdev_meter_set,
5923 dpif_netdev_meter_get,
5924 dpif_netdev_meter_del,
5925 };
5926
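/* unixctl handler for "dpif-dummy/change-port-number": moves an existing
 * port of a dummy datapath to a new datapath port number. */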
5927 static void
5928 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
5929 const char *argv[], void *aux OVS_UNUSED)
5930 {
5931 struct dp_netdev_port *port;
5932 struct dp_netdev *dp;
5933 odp_port_t port_no;
5934
5935 ovs_mutex_lock(&dp_netdev_mutex);
5936 dp = shash_find_data(&dp_netdevs, argv[1]);
5937 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
5938 ovs_mutex_unlock(&dp_netdev_mutex);
5939 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
5940 return;
5941 }
5942 ovs_refcount_ref(&dp->ref_cnt);
5943 ovs_mutex_unlock(&dp_netdev_mutex);
5944
5945 ovs_mutex_lock(&dp->port_mutex);
5946 if (get_port_by_name(dp, argv[2], &port)) {
5947 unixctl_command_reply_error(conn, "unknown port");
5948 goto exit;
5949 }
5950
5951 port_no = u32_to_odp(atoi(argv[3]));
5952 if (!port_no || port_no == ODPP_NONE) {
5953 unixctl_command_reply_error(conn, "bad port number");
5954 goto exit;
5955 }
5956 if (dp_netdev_lookup_port(dp, port_no)) {
5957 unixctl_command_reply_error(conn, "port number already in use");
5958 goto exit;
5959 }
5960
5961 /* Remove port. */
5962 hmap_remove(&dp->ports, &port->node);
5963 reconfigure_datapath(dp);
5964
5965 /* Reinsert with new port number. */
5966 port->port_no = port_no;
5967 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
5968 reconfigure_datapath(dp);
5969
5970 seq_change(dp->port_seq);
5971 unixctl_command_reply(conn, NULL);
5972
5973 exit:
5974 ovs_mutex_unlock(&dp->port_mutex);
5975 dp_netdev_unref(dp);
5976 }
5977
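/* Registers a copy of the netdev dpif class under the datapath type 'type'. */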
5978 static void
5979 dpif_dummy_register__(const char *type)
5980 {
5981 struct dpif_class *class;
5982
5983 class = xmalloc(sizeof *class);
5984 *class = dpif_netdev_class;
5985 class->type = xstrdup(type);
5986 dp_register_provider(class);
5987 }
5988
5989 static void
5990 dpif_dummy_override(const char *type)
5991 {
5992 int error;
5993
5994 /*
5995 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
5996 * a userland-only build. It is useful for the testsuite.
5997 */
5998 error = dp_unregister_provider(type);
5999 if (error == 0 || error == EAFNOSUPPORT) {
6000 dpif_dummy_register__(type);
6001 }
6002 }
6003
6004 void
6005 dpif_dummy_register(enum dummy_level level)
6006 {
6007 if (level == DUMMY_OVERRIDE_ALL) {
6008 struct sset types;
6009 const char *type;
6010
6011 sset_init(&types);
6012 dp_enumerate_types(&types);
6013 SSET_FOR_EACH (type, &types) {
6014 dpif_dummy_override(type);
6015 }
6016 sset_destroy(&types);
6017 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
6018 dpif_dummy_override("system");
6019 }
6020
6021 dpif_dummy_register__("dummy");
6022
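/* Example invocation (the datapath and port names are only illustrative):
 *   ovs-appctl dpif-dummy/change-port-number dp0 p1 5 */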
6023 unixctl_command_register("dpif-dummy/change-port-number",
6024 "dp port new-number",
6025 3, 3, dpif_dummy_change_port_number, NULL);
6026 }
6027 \f
6028 /* Datapath Classifier. */
6029
6030 /* A set of rules that all have the same fields wildcarded. */
6031 struct dpcls_subtable {
6032 /* This field is only used by writers. */
6033 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
6034
6035 /* These fields are accessed by readers. */
6036 struct cmap rules; /* Contains "struct dpcls_rule"s. */
6037 uint32_t hit_cnt; /* Number of match hits in subtable in current
6038 optimization interval. */
6039 struct netdev_flow_key mask; /* Wildcards for fields (const). */
6040 /* 'mask' must be the last field, additional space is allocated here. */
6041 };
6042
6043 /* Initializes 'cls' as a classifier that initially contains no classification
6044 * rules. */
6045 static void
6046 dpcls_init(struct dpcls *cls)
6047 {
6048 cmap_init(&cls->subtables_map);
6049 pvector_init(&cls->subtables);
6050 }
6051
6052 static void
6053 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
6054 {
6055 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
6056 pvector_remove(&cls->subtables, subtable);
6057 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
6058 subtable->mask.hash);
6059 cmap_destroy(&subtable->rules);
6060 ovsrcu_postpone(free, subtable);
6061 }
6062
6063 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
6064 * caller's responsibility.
6065 * May only be called after all the readers have been terminated. */
6066 static void
6067 dpcls_destroy(struct dpcls *cls)
6068 {
6069 if (cls) {
6070 struct dpcls_subtable *subtable;
6071
6072 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
6073 ovs_assert(cmap_count(&subtable->rules) == 0);
6074 dpcls_destroy_subtable(cls, subtable);
6075 }
6076 cmap_destroy(&cls->subtables_map);
6077 pvector_destroy(&cls->subtables);
6078 }
6079 }
6080
6081 static struct dpcls_subtable *
6082 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
6083 {
6084 struct dpcls_subtable *subtable;
6085
6086 /* Need to add one. */
6087 subtable = xmalloc(sizeof *subtable
6088 - sizeof subtable->mask.mf + mask->len);
6089 cmap_init(&subtable->rules);
6090 subtable->hit_cnt = 0;
6091 netdev_flow_key_clone(&subtable->mask, mask);
6092 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
6093 /* Add the new subtable at the end of the pvector (with no hits yet). */
6094 pvector_insert(&cls->subtables, subtable, 0);
6095 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
6096 cmap_count(&cls->subtables_map), subtable, cls->in_port);
6097 pvector_publish(&cls->subtables);
6098
6099 return subtable;
6100 }
6101
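/* Returns the subtable within 'cls' whose mask equals 'mask', creating it
 * if it does not already exist. */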
6102 static inline struct dpcls_subtable *
6103 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
6104 {
6105 struct dpcls_subtable *subtable;
6106
6107 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
6108 &cls->subtables_map) {
6109 if (netdev_flow_key_equal(&subtable->mask, mask)) {
6110 return subtable;
6111 }
6112 }
6113 return dpcls_create_subtable(cls, mask);
6114 }
6115
6116
6117 /* Sorts the subtable vector of 'cls' by hit counts; called periodically. */
6118 static void
6119 dpcls_sort_subtable_vector(struct dpcls *cls)
6120 {
6121 struct pvector *pvec = &cls->subtables;
6122 struct dpcls_subtable *subtable;
6123
6124 PVECTOR_FOR_EACH (subtable, pvec) {
6125 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
6126 subtable->hit_cnt = 0;
6127 }
6128 pvector_publish(pvec);
6129 }
6130
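/* Periodic pmd housekeeping: every PMD_RXQ_INTERVAL_LEN this stores per-rxq
 * processing cycle counts plus the total cycles of the interval, and every
 * DPCLS_OPTIMIZATION_INTERVAL it re-sorts each dpcls subtable vector by hit
 * count (skipped if 'flow_mutex' cannot be taken without blocking). */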
6131 static inline void
6132 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
6133 struct polled_queue *poll_list, int poll_cnt)
6134 {
6135 struct dpcls *cls;
6136
6137 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
6138 uint64_t curr_tsc;
6139 /* Get the cycles used to process each queue and store them. */
6140 for (unsigned i = 0; i < poll_cnt; i++) {
6141 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
6142 RXQ_CYCLES_PROC_CURR);
6143 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
6144 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
6145 0);
6146 }
6147 curr_tsc = cycles_counter_update(&pmd->perf_stats);
6148 if (pmd->intrvl_tsc_prev) {
6149 /* There is a previous timestamp; store a new interval cycle count. */
6150 atomic_store_relaxed(&pmd->intrvl_cycles,
6151 curr_tsc - pmd->intrvl_tsc_prev);
6152 }
6153 pmd->intrvl_tsc_prev = curr_tsc;
6154 /* Start a new measurement interval. */
6155 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6156 }
6157
6158 if (pmd->ctx.now > pmd->next_optimization) {
6159 /* Try to obtain the flow lock to block out revalidator threads.
6160 * If not possible, just try next time. */
6161 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
6162 /* Optimize each classifier */
6163 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6164 dpcls_sort_subtable_vector(cls);
6165 }
6166 ovs_mutex_unlock(&pmd->flow_mutex);
6167 /* Start a new optimization interval. */
6168 pmd->next_optimization = pmd->ctx.now
6169 + DPCLS_OPTIMIZATION_INTERVAL;
6170 }
6171 }
6172 }
6173
6174 /* Insert 'rule' into 'cls'. */
6175 static void
6176 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
6177 const struct netdev_flow_key *mask)
6178 {
6179 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
6180
6181 /* Refer to subtable's mask, also for later removal. */
6182 rule->mask = &subtable->mask;
6183 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
6184 }
6185
6186 /* Removes 'rule' from 'cls'; destroys its subtable if it becomes empty. */
6187 static void
6188 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
6189 {
6190 struct dpcls_subtable *subtable;
6191
6192 ovs_assert(rule->mask);
6193
6194 /* Get subtable from reference in rule->mask. */
6195 INIT_CONTAINER(subtable, rule->mask, mask);
6196 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
6197 == 0) {
6198 /* Delete empty subtable. */
6199 dpcls_destroy_subtable(cls, subtable);
6200 pvector_publish(&cls->subtables);
6201 }
6202 }
6203
6204 /* Returns true if 'target' satisfies 'rule': for each 1-bit in the rule's
6205 * mask, the corresponding bits of the rule's flow and of 'target' match. */
6206 static inline bool
6207 dpcls_rule_matches_key(const struct dpcls_rule *rule,
6208 const struct netdev_flow_key *target)
6209 {
6210 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
6211 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
6212 uint64_t value;
6213
6214 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
6215 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
6216 return false;
6217 }
6218 }
6219 return true;
6220 }
6221
6222 /* For each of the 'cnt' miniflows in 'keys', performs a classifier lookup,
6223 * writing the result into the corresponding slot in 'rules'. Slots for which
6224 * no rule matches are left NULL.
6225 *
6226 * This function is optimized for use in the userspace datapath and therefore
6227 * does not implement a lot of features available in the standard
6228 * classifier_lookup() function. Specifically, it does not implement
6229 * priorities, instead returning any rule which matches the flow.
6230 *
6231 * Returns true if all miniflows found a corresponding rule. */
6232 static bool
6233 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
6234 struct dpcls_rule **rules, const size_t cnt,
6235 int *num_lookups_p)
6236 {
6237 /* The received 'cnt' miniflows are the search-keys that will be processed
6238 * to find a matching entry in the available subtables.
6239 * The number of bits in map_type is at least NETDEV_MAX_BURST. */
6240 typedef uint32_t map_type;
6241 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
6242 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
6243
6244 struct dpcls_subtable *subtable;
6245
6246 map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
6247 map_type found_map;
6248 uint32_t hashes[MAP_BITS];
6249 const struct cmap_node *nodes[MAP_BITS];
6250
6251 if (cnt != MAP_BITS) {
6252 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
6253 }
6254 memset(rules, 0, cnt * sizeof *rules);
6255
6256 int lookups_match = 0, subtable_pos = 1;
6257
6258 /* The Datapath classifier - aka dpcls - is composed of subtables.
6259 * Subtables are dynamically created as needed when new rules are inserted.
6260 * Each subtable collects rules with matches on a specific subset of packet
6261 * fields as defined by the subtable's mask. We proceed to process every
6262 * search-key against each subtable, but when a match is found for a
6263 * search-key, the search for that key can stop because the rules are
6264 * non-overlapping. */
6265 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
6266 int i;
6267
6268 /* Compute hashes for the remaining keys. Each search-key is
6269 * masked with the subtable's mask to avoid hashing the wildcarded
6270 * bits. */
6271 ULLONG_FOR_EACH_1(i, keys_map) {
6272 hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
6273 &subtable->mask);
6274 }
6275 /* Lookup. */
6276 found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
6277 /* Check results. When the i-th bit of found_map is set, it means
6278 * that a set of nodes with a matching hash value was found for the
6279 * i-th search-key. Due to possible hash collisions we need to check
6280 * which of the found rules, if any, really matches our masked
6281 * search-key. */
6282 ULLONG_FOR_EACH_1(i, found_map) {
6283 struct dpcls_rule *rule;
6284
6285 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
6286 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
6287 rules[i] = rule;
6288 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
6289 * within a one-second optimization interval. */
6290 subtable->hit_cnt++;
6291 lookups_match += subtable_pos;
6292 goto next;
6293 }
6294 }
6295 /* None of the found rules was a match. Reset the i-th bit to
6296 * keep searching this key in the next subtable. */
6297 ULLONG_SET0(found_map, i); /* Did not match. */
6298 next:
6299 ; /* Keep Sparse happy. */
6300 }
6301 keys_map &= ~found_map; /* Clear the found rules. */
6302 if (!keys_map) {
6303 if (num_lookups_p) {
6304 *num_lookups_p = lookups_match;
6305 }
6306 return true; /* All found. */
6307 }
6308 subtable_pos++;
6309 }
6310 if (num_lookups_p) {
6311 *num_lookups_p = lookups_match;
6312 }
6313 return false; /* Some misses. */
6314 }