1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <sys/types.h>
26 #include <netinet/in.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/ioctl.h>
31 #include <sys/socket.h>
32 #include <sys/stat.h>
33 #include <unistd.h>
34
35 #include "bitmap.h"
36 #include "cmap.h"
37 #include "conntrack.h"
38 #include "coverage.h"
39 #include "ct-dpif.h"
40 #include "csum.h"
41 #include "dp-packet.h"
42 #include "dpif.h"
43 #include "dpif-netdev-perf.h"
44 #include "dpif-provider.h"
45 #include "dummy.h"
46 #include "fat-rwlock.h"
47 #include "flow.h"
48 #include "hmapx.h"
49 #include "id-pool.h"
50 #include "latch.h"
51 #include "netdev.h"
52 #include "netdev-provider.h"
53 #include "netdev-vport.h"
54 #include "netlink.h"
55 #include "odp-execute.h"
56 #include "odp-util.h"
57 #include "openvswitch/dynamic-string.h"
58 #include "openvswitch/list.h"
59 #include "openvswitch/match.h"
60 #include "openvswitch/ofp-parse.h"
61 #include "openvswitch/ofp-print.h"
62 #include "openvswitch/ofpbuf.h"
63 #include "openvswitch/shash.h"
64 #include "openvswitch/vlog.h"
65 #include "ovs-numa.h"
66 #include "ovs-rcu.h"
67 #include "packets.h"
68 #include "openvswitch/poll-loop.h"
69 #include "pvector.h"
70 #include "random.h"
71 #include "seq.h"
72 #include "smap.h"
73 #include "sset.h"
74 #include "timeval.h"
75 #include "tnl-neigh-cache.h"
76 #include "tnl-ports.h"
77 #include "unixctl.h"
78 #include "util.h"
79 #include "uuid.h"
80
81 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
82
83 #define FLOW_DUMP_MAX_BATCH 50
84 /* Use per-thread recirc_depth to prevent recirculation loops. */
85 #define MAX_RECIRC_DEPTH 6
86 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
87
88 /* Use instant packet send by default. */
89 #define DEFAULT_TX_FLUSH_INTERVAL 0
90
91 /* Configuration parameters. */
92 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
93 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
94 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
95 enum { N_METER_LOCKS = 64 }; /* Number of locks for the meters. */
96
97 /* Protects against changes to 'dp_netdevs'. */
98 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
99
100 /* Contains all 'struct dp_netdev's. */
101 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
102 = SHASH_INITIALIZER(&dp_netdevs);
103
104 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
105
106 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
107 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
108 | CS_SRC_NAT | CS_DST_NAT)
109 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
110
111 static struct odp_support dp_netdev_support = {
112 .max_vlan_headers = SIZE_MAX,
113 .max_mpls_depth = SIZE_MAX,
114 .recirc = true,
115 .ct_state = true,
116 .ct_zone = true,
117 .ct_mark = true,
118 .ct_label = true,
119 .ct_state_nat = true,
120 .ct_orig_tuple = true,
121 .ct_orig_tuple6 = true,
122 };
123
124 /* Stores a miniflow with inline values */
125
126 struct netdev_flow_key {
127 uint32_t hash; /* Hash function differs for different users. */
128 uint32_t len; /* Length of the following miniflow (incl. map). */
129 struct miniflow mf;
130 uint64_t buf[FLOW_MAX_PACKET_U64S];
131 };
132
133 /* Exact match cache for frequently used flows
134 *
135 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
136 * search its entries for a miniflow that matches exactly the miniflow of the
137 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
138 *
139 * A cache entry holds a reference to its 'dp_netdev_flow'.
140 *
141 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
142 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
143 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
144 * value is the index of a cache entry where the miniflow could be.
145 *
146 *
147 * Thread-safety
148 * =============
149 *
150 * Each pmd_thread has its own private exact match cache.
151 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
152 */
153
154 #define EM_FLOW_HASH_SHIFT 13
155 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
156 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
157 #define EM_FLOW_HASH_SEGS 2
158
159 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
160 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
161 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
162 DEFAULT_EM_FLOW_INSERT_INV_PROB)
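/* Illustrative sketch (an assumption, not part of the upstream file): with the
 * defaults above, an EMC insertion is attempted only when a 32-bit random draw
 * falls at or below 'emc_insert_min', i.e. with probability of roughly
 * 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB (1%).  random_uint32() comes from
 * "random.h", which is included above; the helper name is hypothetical. */
static inline bool
emc_insert_allowed_sketch(uint32_t emc_insert_min)
{
    /* UINT32_MAX / 100 accepts ~1% of draws; 0 disables EMC insertion. */
    return emc_insert_min && random_uint32() <= emc_insert_min;
}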
163
164 struct emc_entry {
165 struct dp_netdev_flow *flow;
166 struct netdev_flow_key key; /* key.hash used for emc hash value. */
167 };
168
169 struct emc_cache {
170 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
171 int sweep_idx; /* For emc_cache_slow_sweep(). */
172 };
173
174 /* Iterate in the exact match cache through every entry that might contain a
175 * miniflow with hash 'HASH'. */
176 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
177 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
178 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
179 i__ < EM_FLOW_HASH_SEGS; \
180 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
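/* Illustrative sketch (an assumption, not part of the upstream file): the
 * macro above visits the EM_FLOW_HASH_SEGS candidate positions for a hash, so
 * a lookup only has to compare the stored key in those slots.  The real lookup
 * additionally checks entry liveness (see emc_entry_alive() below) and uses a
 * specialized key comparison; miniflow_equal() from "flow.h" is used here for
 * clarity, and the function name is hypothetical. */
static inline struct dp_netdev_flow *
emc_lookup_sketch(struct emc_cache *cache, const struct netdev_flow_key *key)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, key->hash) {
        if (current_entry->flow
            && current_entry->key.hash == key->hash
            && miniflow_equal(&current_entry->key.mf, &key->mf)) {
            return current_entry->flow;
        }
    }
    return NULL;
}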
181 \f
182 /* Simple non-wildcarding single-priority classifier. */
183
184 /* Time in microseconds between successive optimizations of the dpcls
185 * subtable vector */
186 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
187
188 /* Time in microseconds of the interval in which rxq processing cycles used
189 * in rxq to pmd assignments is measured and stored. */
190 #define PMD_RXQ_INTERVAL_LEN 10000000LL
191
192 /* Number of intervals for which cycles are stored
193 * and used during rxq to pmd assignment. */
194 #define PMD_RXQ_INTERVAL_MAX 6
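/* Note (an assumption, not part of the upstream file): with the values above,
 * each rxq keeps PMD_RXQ_INTERVAL_MAX (6) samples of PMD_RXQ_INTERVAL_LEN
 * (10 seconds) each, so rxq-to-pmd assignment is based on roughly the last
 * 60 seconds of measured processing cycles. */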
195
196 struct dpcls {
197 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
198 odp_port_t in_port;
199 struct cmap subtables_map;
200 struct pvector subtables;
201 };
202
203 /* A rule to be inserted to the classifier. */
204 struct dpcls_rule {
205 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
206 struct netdev_flow_key *mask; /* Subtable's mask. */
207 struct netdev_flow_key flow; /* Matching key. */
208 /* 'flow' must be the last field, additional space is allocated here. */
209 };
210
211 static void dpcls_init(struct dpcls *);
212 static void dpcls_destroy(struct dpcls *);
213 static void dpcls_sort_subtable_vector(struct dpcls *);
214 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
215 const struct netdev_flow_key *mask);
216 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
217 static bool dpcls_lookup(struct dpcls *cls,
218 const struct netdev_flow_key keys[],
219 struct dpcls_rule **rules, size_t cnt,
220 int *num_lookups_p);
221 \f
222 /* Set of supported meter flags */
223 #define DP_SUPPORTED_METER_FLAGS_MASK \
224 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
225
226 /* Set of supported meter band types */
227 #define DP_SUPPORTED_METER_BAND_TYPES \
228 ( 1 << OFPMBT13_DROP )
229
230 struct dp_meter_band {
231 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
232 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
233 uint64_t packet_count;
234 uint64_t byte_count;
235 };
236
237 struct dp_meter {
238 uint16_t flags;
239 uint16_t n_bands;
240 uint32_t max_delta_t;
241 uint64_t used;
242 uint64_t packet_count;
243 uint64_t byte_count;
244 struct dp_meter_band bands[];
245 };
246
247 /* Datapath based on the network device interface from netdev.h.
248 *
249 *
250 * Thread-safety
251 * =============
252 *
253 * Some members, marked 'const', are immutable. Accessing other members
254 * requires synchronization, as noted in more detail below.
255 *
256 * Acquisition order is, from outermost to innermost:
257 *
258 * dp_netdev_mutex (global)
259 * port_mutex
260 * non_pmd_mutex
261 */
262 struct dp_netdev {
263 const struct dpif_class *const class;
264 const char *const name;
265 struct dpif *dpif;
266 struct ovs_refcount ref_cnt;
267 atomic_flag destroyed;
268
269 /* Ports.
270 *
271 * Any lookup into 'ports' or any access to the dp_netdev_ports found
272 * through 'ports' requires taking 'port_mutex'. */
273 struct ovs_mutex port_mutex;
274 struct hmap ports;
275 struct seq *port_seq; /* Incremented whenever a port changes. */
276
277 /* The maximum time that a packet can wait in the output batch before sending. */
278 atomic_uint32_t tx_flush_interval;
279
280 /* Meters. */
281 struct ovs_mutex meter_locks[N_METER_LOCKS];
282 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
283
284 /* Probability of EMC insertion is derived from 'emc_insert_min'. */
285 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
286 /* Enable collection of PMD performance metrics. */
287 atomic_bool pmd_perf_metrics;
288
289 /* Protects access to ofproto-dpif-upcall interface during revalidator
290 * thread synchronization. */
291 struct fat_rwlock upcall_rwlock;
292 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
293 void *upcall_aux;
294
295 /* Callback function for notifying the purging of dp flows (e.g. during
296 * pmd thread deletion). */
297 dp_purge_callback *dp_purge_cb;
298 void *dp_purge_aux;
299
300 /* Stores all 'struct dp_netdev_pmd_thread's. */
301 struct cmap poll_threads;
302 /* id pool for per thread static_tx_qid. */
303 struct id_pool *tx_qid_pool;
304 struct ovs_mutex tx_qid_pool_mutex;
305
306 /* Protects the access of the 'struct dp_netdev_pmd_thread'
307 * instance for non-pmd thread. */
308 struct ovs_mutex non_pmd_mutex;
309
310 /* Each pmd thread will store its pointer to
311 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
312 ovsthread_key_t per_pmd_key;
313
314 struct seq *reconfigure_seq;
315 uint64_t last_reconfigure_seq;
316
317 /* Cpu mask for pin of pmd threads. */
318 char *pmd_cmask;
319
320 uint64_t last_tnl_conf_seq;
321
322 struct conntrack conntrack;
323 };
324
325 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
326 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
327 {
328 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
329 }
330
331 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
332 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
333 {
334 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
335 }
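/* Illustrative sketch (an assumption, not part of the upstream file): meters
 * are protected by N_METER_LOCKS striped locks, so two meter ids only contend
 * when 'meter_id % N_METER_LOCKS' collides.  Access to dp->meters[] follows
 * this pattern; the helper name and the particular update are hypothetical. */
static void
meter_bump_sketch(struct dp_netdev *dp, uint32_t meter_id)
{
    meter_lock(dp, meter_id);
    if (dp->meters[meter_id]) {
        dp->meters[meter_id]->packet_count++;   /* Example update under the
                                                 * stripe lock. */
    }
    meter_unlock(dp, meter_id);
}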
336
337
338 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
339 odp_port_t)
340 OVS_REQUIRES(dp->port_mutex);
341
342 enum rxq_cycles_counter_type {
343 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
344 processing packets during the current
345 interval. */
346 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
347 during rxq to pmd assignment. */
348 RXQ_N_CYCLES
349 };
350
351 #define XPS_TIMEOUT 500000LL /* In microseconds. */
352
353 /* Contained by struct dp_netdev_port's 'rxqs' member. */
354 struct dp_netdev_rxq {
355 struct dp_netdev_port *port;
356 struct netdev_rxq *rx;
357 unsigned core_id; /* Core to which this queue should be
358 pinned. OVS_CORE_UNSPEC if the
359 queue doesn't need to be pinned to a
360 particular core. */
361 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
362 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
363 bool is_vhost; /* Is rxq of a vhost port. */
364
365 /* Counters of cycles spent successfully polling and processing pkts. */
366 atomic_ullong cycles[RXQ_N_CYCLES];
367 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
368 sum them to yield the cycles used for an rxq. */
369 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
370 };
371
372 /* A port in a netdev-based datapath. */
373 struct dp_netdev_port {
374 odp_port_t port_no;
375 bool dynamic_txqs; /* If true XPS will be used. */
376 bool need_reconfigure; /* True if we should reconfigure netdev. */
377 struct netdev *netdev;
378 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
379 struct netdev_saved_flags *sf;
380 struct dp_netdev_rxq *rxqs;
381 unsigned n_rxq; /* Number of elements in 'rxqs' */
382 unsigned *txq_used; /* Number of threads that use each tx queue. */
383 struct ovs_mutex txq_used_mutex;
384 char *type; /* Port type as requested by user. */
385 char *rxq_affinity_list; /* Requested affinity of rx queues. */
386 };
387
388 /* Contained by struct dp_netdev_flow's 'stats' member. */
389 struct dp_netdev_flow_stats {
390 atomic_llong used; /* Last used time, in monotonic msecs. */
391 atomic_ullong packet_count; /* Number of packets matched. */
392 atomic_ullong byte_count; /* Number of bytes matched. */
393 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
394 };
395
396 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
397 *
398 *
399 * Thread-safety
400 * =============
401 *
402 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
403 * its pmd thread's classifier. The text below calls this classifier 'cls'.
404 *
405 * Motivation
406 * ----------
407 *
408 * The thread safety rules described here for "struct dp_netdev_flow" are
409 * motivated by two goals:
410 *
411 * - Prevent threads that read members of "struct dp_netdev_flow" from
412 * reading bad data due to changes by some thread concurrently modifying
413 * those members.
414 *
415 * - Prevent two threads making changes to members of a given "struct
416 * dp_netdev_flow" from interfering with each other.
417 *
418 *
419 * Rules
420 * -----
421 *
422 * A flow 'flow' may be accessed without a risk of being freed during an RCU
423 * grace period. Code that needs to hold onto a flow for a while
424 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
425 *
426 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
427 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
428 * from modification.
429 *
430 * Some members, marked 'const', are immutable. Accessing other members
431 * requires synchronization, as noted in more detail below.
432 */
433 struct dp_netdev_flow {
434 const struct flow flow; /* Unmasked flow that created this entry. */
435 /* Hash table node, indexed by the unmasked flow. */
436 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
437 /* 'flow_table'. */
438 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
439 const ovs_u128 ufid; /* Unique flow identifier. */
440 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
441 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
442 /* flow. */
443
444 /* Number of references.
445 * The classifier owns one reference.
446 * Any thread trying to keep a rule from being freed should hold its own
447 * reference. */
448 struct ovs_refcount ref_cnt;
449
450 bool dead;
451 uint32_t mark; /* Unique flow mark assigned to a flow */
452
453 /* Statistics. */
454 struct dp_netdev_flow_stats stats;
455
456 /* Actions. */
457 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
458
459 /* While processing a group of input packets, the datapath uses the next
460 * member to store a pointer to the output batch for the flow. It is
461 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
462 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
463 struct packet_batch_per_flow *batch;
464
465 /* Packet classification. */
466 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
467 /* 'cr' must be the last member. */
468 };
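/* Illustrative sketch (an assumption, not part of the upstream file): per the
 * rules above, a thread that wants to use a flow beyond the current RCU grace
 * period takes its own reference first, using dp_netdev_flow_ref() and
 * dp_netdev_flow_unref() declared immediately below. */
#if 0
    if (dp_netdev_flow_ref(flow)) {
        /* Use 'flow' here: it cannot be freed until the matching unref, even
         * if it is concurrently removed from the classifier. */
        dp_netdev_flow_unref(flow);
    }
#endif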
469
470 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
471 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
472 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
473 struct flow *, bool);
474
475 /* A set of datapath actions within a "struct dp_netdev_flow".
476 *
477 *
478 * Thread-safety
479 * =============
480 *
481 * A struct dp_netdev_actions 'actions' is protected with RCU. */
482 struct dp_netdev_actions {
483 /* These members are immutable: they do not change during the struct's
484 * lifetime. */
485 unsigned int size; /* Size of 'actions', in bytes. */
486 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
487 };
488
489 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
490 size_t);
491 struct dp_netdev_actions *dp_netdev_flow_get_actions(
492 const struct dp_netdev_flow *);
493 static void dp_netdev_actions_free(struct dp_netdev_actions *);
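/* Illustrative sketch (an assumption, not part of the upstream file): because
 * a flow's actions are RCU-protected, a reader fetches the current action
 * list with dp_netdev_flow_get_actions() and may walk it without locking for
 * as long as it does not enter a quiescent state.  NL_ATTR_FOR_EACH_UNSAFE
 * comes from "netlink.h", included above; 'flow' here is hypothetical. */
#if 0
    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
    const struct nlattr *a;
    unsigned int left;

    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions->actions, actions->size) {
        /* Inspect or execute each OVS_ACTION_ATTR_* attribute. */
    }
#endif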
494
495 struct polled_queue {
496 struct dp_netdev_rxq *rxq;
497 odp_port_t port_no;
498 };
499
500 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
501 struct rxq_poll {
502 struct dp_netdev_rxq *rxq;
503 struct hmap_node node;
504 };
505
506 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
507 * 'tnl_port_cache' or 'tx_ports'. */
508 struct tx_port {
509 struct dp_netdev_port *port;
510 int qid;
511 long long last_used;
512 struct hmap_node node;
513 long long flush_time;
514 struct dp_packet_batch output_pkts;
515 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
516 };
517
518 /* A set of properties for the current processing loop that is not directly
519 * associated with the pmd thread itself, but with the packets being
520 * processed or the short-term system configuration (for example, time).
521 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
522 struct dp_netdev_pmd_thread_ctx {
523 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
524 long long now;
525 /* RX queue from which last packet was received. */
526 struct dp_netdev_rxq *last_rxq;
527 };
528
529 /* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
530 * the performance overhead of interrupt processing. Therefore netdev
531 * cannot implement rx-wait for these devices. dpif-netdev needs to poll
532 * these devices to check their receive buffers. A pmd thread does the
533 * polling for the devices assigned to it.
534 *
535 * DPDK uses PMDs to access NICs.
536 *
537 * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
538 * I/O of all non-pmd threads. There will be no actual thread created
539 * for that instance.
540 *
541 * Each struct has its own flow cache and one classifier per managed ingress
542 * port. For packets received on an ingress port, a lookup is done in the
543 * corresponding PMD thread's flow cache and, in case of a miss, in that
544 * port's classifier. Packets are executed with the found actions in
545 * either case.
546 */
547 struct dp_netdev_pmd_thread {
548 struct dp_netdev *dp;
549 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
550 struct cmap_node node; /* In 'dp->poll_threads'. */
551
552 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
553 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
554
555 /* Per thread exact-match cache. Note, the instance for cpu core
556 * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
557 * needs to be protected by 'non_pmd_mutex'. Every other instance
558 * will only be accessed by its own pmd thread. */
559 struct emc_cache flow_cache;
560
561 /* Flow-Table and classifiers
562 *
563 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
564 * changes to 'classifiers' must be made while still holding the
565 * 'flow_mutex'.
566 */
567 struct ovs_mutex flow_mutex;
568 struct cmap flow_table OVS_GUARDED; /* Flow table. */
569
570 /* One classifier per in_port polled by the pmd */
571 struct cmap classifiers;
572 /* Periodically sort subtable vectors according to hit frequencies */
573 long long int next_optimization;
574 /* End of the next time interval for which processing cycles
575 are stored for each polled rxq. */
576 long long int rxq_next_cycle_store;
577
578 /* Last interval timestamp. */
579 uint64_t intrvl_tsc_prev;
580 /* Last interval cycles. */
581 atomic_ullong intrvl_cycles;
582
583 /* Current context of the PMD thread. */
584 struct dp_netdev_pmd_thread_ctx ctx;
585
586 struct latch exit_latch; /* For terminating the pmd thread. */
587 struct seq *reload_seq;
588 uint64_t last_reload_seq;
589 atomic_bool reload; /* Do we need to reload ports? */
590 pthread_t thread;
591 unsigned core_id; /* CPU core id of this pmd thread. */
592 int numa_id; /* numa node id of this pmd thread. */
593 bool isolated;
594
595 /* Queue id used by this pmd thread to send packets on all netdevs if
596 * XPS disabled for this netdev. All static_tx_qid's are unique and less
597 * than 'cmap_count(dp->poll_threads)'. */
598 uint32_t static_tx_qid;
599
600 /* Number of filled output batches. */
601 int n_output_batches;
602
603 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
604 /* List of rx queues to poll. */
605 struct hmap poll_list OVS_GUARDED;
606 /* Map of 'tx_port's used for transmission. Written by the main thread,
607 * read by the pmd thread. */
608 struct hmap tx_ports OVS_GUARDED;
609
610 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
611 * ports (that support push_tunnel/pop_tunnel), the other contains ports
612 * with at least one txq (that support send). A port can be in both.
613 *
614 * There are two separate maps to make sure that we don't try to execute
615 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
616 *
617 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
618 * threads, and thus need to be protected by 'non_pmd_mutex'. Every
619 * other instance will only be accessed by its own pmd thread. */
620 struct hmap tnl_port_cache;
621 struct hmap send_port_cache;
622
623 /* Keep track of detailed PMD performance statistics. */
624 struct pmd_perf_stats perf_stats;
625
626 /* Set to true if the pmd thread needs to be reloaded. */
627 bool need_reload;
628 };
629
630 /* Interface to netdev-based datapath. */
631 struct dpif_netdev {
632 struct dpif dpif;
633 struct dp_netdev *dp;
634 uint64_t last_port_seq;
635 };
636
637 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
638 struct dp_netdev_port **portp)
639 OVS_REQUIRES(dp->port_mutex);
640 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
641 struct dp_netdev_port **portp)
642 OVS_REQUIRES(dp->port_mutex);
643 static void dp_netdev_free(struct dp_netdev *)
644 OVS_REQUIRES(dp_netdev_mutex);
645 static int do_add_port(struct dp_netdev *dp, const char *devname,
646 const char *type, odp_port_t port_no)
647 OVS_REQUIRES(dp->port_mutex);
648 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
649 OVS_REQUIRES(dp->port_mutex);
650 static int dpif_netdev_open(const struct dpif_class *, const char *name,
651 bool create, struct dpif **);
652 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
653 struct dp_packet_batch *,
654 bool should_steal,
655 const struct flow *flow,
656 const struct nlattr *actions,
657 size_t actions_len);
658 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
659 struct dp_packet_batch *, odp_port_t port_no);
660 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
661 struct dp_packet_batch *);
662
663 static void dp_netdev_disable_upcall(struct dp_netdev *);
664 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
665 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
666 struct dp_netdev *dp, unsigned core_id,
667 int numa_id);
668 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
669 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
670 OVS_REQUIRES(dp->port_mutex);
671
672 static void *pmd_thread_main(void *);
673 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
674 unsigned core_id);
675 static struct dp_netdev_pmd_thread *
676 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
677 static void dp_netdev_del_pmd(struct dp_netdev *dp,
678 struct dp_netdev_pmd_thread *pmd);
679 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
680 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
681 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
682 struct dp_netdev_port *port)
683 OVS_REQUIRES(pmd->port_mutex);
684 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
685 struct tx_port *tx)
686 OVS_REQUIRES(pmd->port_mutex);
687 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
688 struct dp_netdev_rxq *rxq)
689 OVS_REQUIRES(pmd->port_mutex);
690 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
691 struct rxq_poll *poll)
692 OVS_REQUIRES(pmd->port_mutex);
693 static int
694 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
695 bool force);
696
697 static void reconfigure_datapath(struct dp_netdev *dp)
698 OVS_REQUIRES(dp->port_mutex);
699 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
700 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
701 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
702 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
703 OVS_REQUIRES(pmd->port_mutex);
704 static inline void
705 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
706 struct polled_queue *poll_list, int poll_cnt);
707 static void
708 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
709 enum rxq_cycles_counter_type type,
710 unsigned long long cycles);
711 static uint64_t
712 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
713 enum rxq_cycles_counter_type type);
714 static void
715 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
716 unsigned long long cycles);
717 static uint64_t
718 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
719 static void
720 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
721 bool purge);
722 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
723 struct tx_port *tx);
724
725 static inline bool emc_entry_alive(struct emc_entry *ce);
726 static void emc_clear_entry(struct emc_entry *ce);
727
728 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
729 static inline bool
730 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
731
732 static void
733 emc_cache_init(struct emc_cache *flow_cache)
734 {
735 int i;
736
737 flow_cache->sweep_idx = 0;
738 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
739 flow_cache->entries[i].flow = NULL;
740 flow_cache->entries[i].key.hash = 0;
741 flow_cache->entries[i].key.len = sizeof(struct miniflow);
742 flowmap_init(&flow_cache->entries[i].key.mf.map);
743 }
744 }
745
746 static void
747 emc_cache_uninit(struct emc_cache *flow_cache)
748 {
749 int i;
750
751 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
752 emc_clear_entry(&flow_cache->entries[i]);
753 }
754 }
755
756 /* Check and clear dead flow references slowly (one entry at each
757 * invocation). */
758 static void
759 emc_cache_slow_sweep(struct emc_cache *flow_cache)
760 {
761 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
762
763 if (!emc_entry_alive(entry)) {
764 emc_clear_entry(entry);
765 }
766 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
767 }
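/* Usage note (an assumption, not part of the upstream file): the sweep above
 * clears at most one entry per call, so invoking it once per polling
 * iteration amortizes the cleanup of all EM_FLOW_HASH_ENTRIES (8192 with
 * EM_FLOW_HASH_SHIFT == 13) entries over as many iterations. */
#if 0
    for (;;) {
        /* ... poll rx queues and process packets ... */
        emc_cache_slow_sweep(&pmd->flow_cache);
    }
#endif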
768
769 /* Updates the time in the PMD thread's context and should be called in three cases:
770 *
771 * 1. PMD structure initialization:
772 * - dp_netdev_configure_pmd()
773 *
774 * 2. Before processing of the new packet batch:
775 * - dpif_netdev_execute()
776 * - dp_netdev_process_rxq_port()
777 *
778 * 3. At least once per polling iteration in the main polling threads if no
779 * packets were received in the current iteration:
780 * - dpif_netdev_run()
781 * - pmd_thread_main()
782 *
783 * 'pmd->ctx.now' should be used without update in all other cases if possible.
784 */
785 static inline void
786 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
787 {
788 pmd->ctx.now = time_usec();
789 }
790
791 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
792 bool
793 dpif_is_netdev(const struct dpif *dpif)
794 {
795 return dpif->dpif_class->open == dpif_netdev_open;
796 }
797
798 static struct dpif_netdev *
799 dpif_netdev_cast(const struct dpif *dpif)
800 {
801 ovs_assert(dpif_is_netdev(dpif));
802 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
803 }
804
805 static struct dp_netdev *
806 get_dp_netdev(const struct dpif *dpif)
807 {
808 return dpif_netdev_cast(dpif)->dp;
809 }
810 \f
811 enum pmd_info_type {
812 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
813 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
814 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
815 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
816 };
817
818 static void
819 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
820 {
821 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
822 ? "main thread" : "pmd thread");
823 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
824 ds_put_format(reply, " numa_id %d", pmd->numa_id);
825 }
826 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
827 ds_put_format(reply, " core_id %u", pmd->core_id);
828 }
829 ds_put_cstr(reply, ":\n");
830 }
831
832 static void
833 pmd_info_show_stats(struct ds *reply,
834 struct dp_netdev_pmd_thread *pmd)
835 {
836 uint64_t stats[PMD_N_STATS];
837 uint64_t total_cycles, total_packets;
838 double passes_per_pkt = 0;
839 double lookups_per_hit = 0;
840 double packets_per_batch = 0;
841
842 pmd_perf_read_counters(&pmd->perf_stats, stats);
843 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
844 + stats[PMD_CYCLES_ITER_BUSY];
845 total_packets = stats[PMD_STAT_RECV];
846
847 format_pmd_thread(reply, pmd);
848
849 if (total_packets > 0) {
850 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
851 / (double) total_packets;
852 }
853 if (stats[PMD_STAT_MASKED_HIT] > 0) {
854 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
855 / (double) stats[PMD_STAT_MASKED_HIT];
856 }
857 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
858 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
859 / (double) stats[PMD_STAT_SENT_BATCHES];
860 }
861
862 ds_put_format(reply,
863 " packets received: %"PRIu64"\n"
864 " packet recirculations: %"PRIu64"\n"
865 " avg. datapath passes per packet: %.02f\n"
866 " emc hits: %"PRIu64"\n"
867 " megaflow hits: %"PRIu64"\n"
868 " avg. subtable lookups per megaflow hit: %.02f\n"
869 " miss with success upcall: %"PRIu64"\n"
870 " miss with failed upcall: %"PRIu64"\n"
871 " avg. packets per output batch: %.02f\n",
872 total_packets, stats[PMD_STAT_RECIRC],
873 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
874 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
875 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
876 packets_per_batch);
877
878 if (total_cycles == 0) {
879 return;
880 }
881
882 ds_put_format(reply,
883 " idle cycles: %"PRIu64" (%.02f%%)\n"
884 " processing cycles: %"PRIu64" (%.02f%%)\n",
885 stats[PMD_CYCLES_ITER_IDLE],
886 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
887 stats[PMD_CYCLES_ITER_BUSY],
888 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
889
890 if (total_packets == 0) {
891 return;
892 }
893
894 ds_put_format(reply,
895 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
896 total_cycles / (double) total_packets,
897 total_cycles, total_packets);
898
899 ds_put_format(reply,
900 " avg processing cycles per packet: "
901 "%.02f (%"PRIu64"/%"PRIu64")\n",
902 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
903 stats[PMD_CYCLES_ITER_BUSY], total_packets);
904 }
905
906 static void
907 pmd_info_show_perf(struct ds *reply,
908 struct dp_netdev_pmd_thread *pmd,
909 struct pmd_perf_params *par)
910 {
911 if (pmd->core_id != NON_PMD_CORE_ID) {
912 char *time_str =
913 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
914 long long now = time_msec();
915 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
916
917 ds_put_cstr(reply, "\n");
918 ds_put_format(reply, "Time: %s\n", time_str);
919 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
920 ds_put_cstr(reply, "\n");
921 format_pmd_thread(reply, pmd);
922 ds_put_cstr(reply, "\n");
923 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
924 if (pmd_perf_metrics_enabled(pmd)) {
925 /* Prevent parallel clearing of perf metrics. */
926 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
927 if (par->histograms) {
928 ds_put_cstr(reply, "\n");
929 pmd_perf_format_histograms(reply, &pmd->perf_stats);
930 }
931 if (par->iter_hist_len > 0) {
932 ds_put_cstr(reply, "\n");
933 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
934 par->iter_hist_len);
935 }
936 if (par->ms_hist_len > 0) {
937 ds_put_cstr(reply, "\n");
938 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
939 par->ms_hist_len);
940 }
941 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
942 }
943 free(time_str);
944 }
945 }
946
947 static int
948 compare_poll_list(const void *a_, const void *b_)
949 {
950 const struct rxq_poll *a = a_;
951 const struct rxq_poll *b = b_;
952
953 const char *namea = netdev_rxq_get_name(a->rxq->rx);
954 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
955
956 int cmp = strcmp(namea, nameb);
957 if (!cmp) {
958 return netdev_rxq_get_queue_id(a->rxq->rx)
959 - netdev_rxq_get_queue_id(b->rxq->rx);
960 } else {
961 return cmp;
962 }
963 }
964
965 static void
966 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
967 size_t *n)
968 {
969 struct rxq_poll *ret, *poll;
970 size_t i;
971
972 *n = hmap_count(&pmd->poll_list);
973 if (!*n) {
974 ret = NULL;
975 } else {
976 ret = xcalloc(*n, sizeof *ret);
977 i = 0;
978 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
979 ret[i] = *poll;
980 i++;
981 }
982 ovs_assert(i == *n);
983 qsort(ret, *n, sizeof *ret, compare_poll_list);
984 }
985
986 *list = ret;
987 }
988
989 static void
990 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
991 {
992 if (pmd->core_id != NON_PMD_CORE_ID) {
993 struct rxq_poll *list;
994 size_t n_rxq;
995 uint64_t total_cycles = 0;
996
997 ds_put_format(reply,
998 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
999 pmd->numa_id, pmd->core_id, (pmd->isolated)
1000 ? "true" : "false");
1001
1002 ovs_mutex_lock(&pmd->port_mutex);
1003 sorted_poll_list(pmd, &list, &n_rxq);
1004
1005 /* Get the total pmd cycles for an interval. */
1006 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1007 /* Estimate the cycles to cover all intervals. */
1008 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1009
1010 for (int i = 0; i < n_rxq; i++) {
1011 struct dp_netdev_rxq *rxq = list[i].rxq;
1012 const char *name = netdev_rxq_get_name(rxq->rx);
1013 uint64_t proc_cycles = 0;
1014
1015 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1016 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1017 }
1018 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1019 netdev_rxq_get_queue_id(list[i].rxq->rx));
1020 ds_put_format(reply, " pmd usage: ");
1021 if (total_cycles) {
1022 ds_put_format(reply, "%2"PRIu64"",
1023 proc_cycles * 100 / total_cycles);
1024 ds_put_cstr(reply, " %");
1025 } else {
1026 ds_put_format(reply, "%s", "NOT AVAIL");
1027 }
1028 ds_put_cstr(reply, "\n");
1029 }
1030 ovs_mutex_unlock(&pmd->port_mutex);
1031 free(list);
1032 }
1033 }
1034
1035 static int
1036 compare_poll_thread_list(const void *a_, const void *b_)
1037 {
1038 const struct dp_netdev_pmd_thread *a, *b;
1039
1040 a = *(struct dp_netdev_pmd_thread **)a_;
1041 b = *(struct dp_netdev_pmd_thread **)b_;
1042
1043 if (a->core_id < b->core_id) {
1044 return -1;
1045 }
1046 if (a->core_id > b->core_id) {
1047 return 1;
1048 }
1049 return 0;
1050 }
1051
1052 /* Create a sorted list of pmds from the dp->poll_threads cmap. We can use
1053 * this list, as long as we do not go to quiescent state. */
1054 static void
1055 sorted_poll_thread_list(struct dp_netdev *dp,
1056 struct dp_netdev_pmd_thread ***list,
1057 size_t *n)
1058 {
1059 struct dp_netdev_pmd_thread *pmd;
1060 struct dp_netdev_pmd_thread **pmd_list;
1061 size_t k = 0, n_pmds;
1062
1063 n_pmds = cmap_count(&dp->poll_threads);
1064 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1065
1066 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1067 if (k >= n_pmds) {
1068 break;
1069 }
1070 pmd_list[k++] = pmd;
1071 }
1072
1073 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1074
1075 *list = pmd_list;
1076 *n = k;
1077 }
1078
1079 static void
1080 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1081 const char *argv[], void *aux OVS_UNUSED)
1082 {
1083 struct ds reply = DS_EMPTY_INITIALIZER;
1084 struct dp_netdev *dp = NULL;
1085
1086 ovs_mutex_lock(&dp_netdev_mutex);
1087
1088 if (argc == 2) {
1089 dp = shash_find_data(&dp_netdevs, argv[1]);
1090 } else if (shash_count(&dp_netdevs) == 1) {
1091 /* There's only one datapath */
1092 dp = shash_first(&dp_netdevs)->data;
1093 }
1094
1095 if (!dp) {
1096 ovs_mutex_unlock(&dp_netdev_mutex);
1097 unixctl_command_reply_error(conn,
1098 "please specify an existing datapath");
1099 return;
1100 }
1101
1102 dp_netdev_request_reconfigure(dp);
1103 ovs_mutex_unlock(&dp_netdev_mutex);
1104 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1105 unixctl_command_reply(conn, ds_cstr(&reply));
1106 ds_destroy(&reply);
1107 }
1108
1109 static void
1110 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1111 void *aux)
1112 {
1113 struct ds reply = DS_EMPTY_INITIALIZER;
1114 struct dp_netdev_pmd_thread **pmd_list;
1115 struct dp_netdev *dp = NULL;
1116 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1117 unsigned int core_id;
1118 bool filter_on_pmd = false;
1119 size_t n;
1120
1121 ovs_mutex_lock(&dp_netdev_mutex);
1122
1123 while (argc > 1) {
1124 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1125 if (str_to_uint(argv[2], 10, &core_id)) {
1126 filter_on_pmd = true;
1127 }
1128 argc -= 2;
1129 argv += 2;
1130 } else {
1131 dp = shash_find_data(&dp_netdevs, argv[1]);
1132 argc -= 1;
1133 argv += 1;
1134 }
1135 }
1136
1137 if (!dp) {
1138 if (shash_count(&dp_netdevs) == 1) {
1139 /* There's only one datapath */
1140 dp = shash_first(&dp_netdevs)->data;
1141 } else {
1142 ovs_mutex_unlock(&dp_netdev_mutex);
1143 unixctl_command_reply_error(conn,
1144 "please specify an existing datapath");
1145 return;
1146 }
1147 }
1148
1149 sorted_poll_thread_list(dp, &pmd_list, &n);
1150 for (size_t i = 0; i < n; i++) {
1151 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1152 if (!pmd) {
1153 break;
1154 }
1155 if (filter_on_pmd && pmd->core_id != core_id) {
1156 continue;
1157 }
1158 if (type == PMD_INFO_SHOW_RXQ) {
1159 pmd_info_show_rxq(&reply, pmd);
1160 } else if (type == PMD_INFO_CLEAR_STATS) {
1161 pmd_perf_stats_clear(&pmd->perf_stats);
1162 } else if (type == PMD_INFO_SHOW_STATS) {
1163 pmd_info_show_stats(&reply, pmd);
1164 } else if (type == PMD_INFO_PERF_SHOW) {
1165 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1166 }
1167 }
1168 free(pmd_list);
1169
1170 ovs_mutex_unlock(&dp_netdev_mutex);
1171
1172 unixctl_command_reply(conn, ds_cstr(&reply));
1173 ds_destroy(&reply);
1174 }
1175
1176 static void
1177 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1178 const char *argv[],
1179 void *aux OVS_UNUSED)
1180 {
1181 struct pmd_perf_params par;
1182 long int it_hist = 0, ms_hist = 0;
1183 par.histograms = true;
1184
1185 while (argc > 1) {
1186 if (!strcmp(argv[1], "-nh")) {
1187 par.histograms = false;
1188 argc -= 1;
1189 argv += 1;
1190 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1191 it_hist = strtol(argv[2], NULL, 10);
1192 if (it_hist < 0) {
1193 it_hist = 0;
1194 } else if (it_hist > HISTORY_LEN) {
1195 it_hist = HISTORY_LEN;
1196 }
1197 argc -= 2;
1198 argv += 2;
1199 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1200 ms_hist = strtol(argv[2], NULL, 10);
1201 if (ms_hist < 0) {
1202 ms_hist = 0;
1203 } else if (ms_hist > HISTORY_LEN) {
1204 ms_hist = HISTORY_LEN;
1205 }
1206 argc -= 2;
1207 argv += 2;
1208 } else {
1209 break;
1210 }
1211 }
1212 par.iter_hist_len = it_hist;
1213 par.ms_hist_len = ms_hist;
1214 par.command_type = PMD_INFO_PERF_SHOW;
1215 dpif_netdev_pmd_info(conn, argc, argv, &par);
1216 }
1217 \f
1218 static int
1219 dpif_netdev_init(void)
1220 {
1221 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1222 clear_aux = PMD_INFO_CLEAR_STATS,
1223 poll_aux = PMD_INFO_SHOW_RXQ;
1224
1225 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1226 0, 3, dpif_netdev_pmd_info,
1227 (void *)&show_aux);
1228 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1229 0, 3, dpif_netdev_pmd_info,
1230 (void *)&clear_aux);
1231 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1232 0, 3, dpif_netdev_pmd_info,
1233 (void *)&poll_aux);
1234 unixctl_command_register("dpif-netdev/pmd-perf-show",
1235 "[-nh] [-it iter-history-len]"
1236 " [-ms ms-history-len]"
1237 " [-pmd core] [dp]",
1238 0, 8, pmd_perf_show_cmd,
1239 NULL);
1240 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1241 0, 1, dpif_netdev_pmd_rebalance,
1242 NULL);
1243 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1244 "on|off [-b before] [-a after] [-e|-ne] "
1245 "[-us usec] [-q qlen]",
1246 0, 10, pmd_perf_log_set_cmd,
1247 NULL);
1248 return 0;
1249 }
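/* Usage note (an assumption, not part of the upstream file): the commands
 * registered above are invoked through ovs-appctl, for example:
 *
 *   ovs-appctl dpif-netdev/pmd-stats-show -pmd 3
 *   ovs-appctl dpif-netdev/pmd-rxq-show
 *   ovs-appctl dpif-netdev/pmd-perf-show -nh -it 20 -ms 10
 *   ovs-appctl dpif-netdev/pmd-rxq-rebalance
 *
 * The trailing optional [dp] argument selects a specific datapath when more
 * than one exists; otherwise the single existing datapath is used. */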
1250
1251 static int
1252 dpif_netdev_enumerate(struct sset *all_dps,
1253 const struct dpif_class *dpif_class)
1254 {
1255 struct shash_node *node;
1256
1257 ovs_mutex_lock(&dp_netdev_mutex);
1258 SHASH_FOR_EACH(node, &dp_netdevs) {
1259 struct dp_netdev *dp = node->data;
1260 if (dpif_class != dp->class) {
1261 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1262 * If the class doesn't match, skip this dpif. */
1263 continue;
1264 }
1265 sset_add(all_dps, node->name);
1266 }
1267 ovs_mutex_unlock(&dp_netdev_mutex);
1268
1269 return 0;
1270 }
1271
1272 static bool
1273 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1274 {
1275 return class != &dpif_netdev_class;
1276 }
1277
1278 static const char *
1279 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1280 {
1281 return strcmp(type, "internal") ? type
1282 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1283 : "tap";
1284 }
1285
1286 static struct dpif *
1287 create_dpif_netdev(struct dp_netdev *dp)
1288 {
1289 uint16_t netflow_id = hash_string(dp->name, 0);
1290 struct dpif_netdev *dpif;
1291
1292 ovs_refcount_ref(&dp->ref_cnt);
1293
1294 dpif = xmalloc(sizeof *dpif);
1295 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1296 dpif->dp = dp;
1297 dpif->last_port_seq = seq_read(dp->port_seq);
1298
1299 return &dpif->dpif;
1300 }
1301
1302 /* Choose an unused, non-zero port number and return it on success.
1303 * Return ODPP_NONE on failure. */
1304 static odp_port_t
1305 choose_port(struct dp_netdev *dp, const char *name)
1306 OVS_REQUIRES(dp->port_mutex)
1307 {
1308 uint32_t port_no;
1309
1310 if (dp->class != &dpif_netdev_class) {
1311 const char *p;
1312 int start_no = 0;
1313
1314 /* If the port name begins with "br", start the number search at
1315 * 100 to make writing tests easier. */
1316 if (!strncmp(name, "br", 2)) {
1317 start_no = 100;
1318 }
1319
1320 /* If the port name contains a number, try to assign that port number.
1321 * This can make writing unit tests easier because port numbers are
1322 * predictable. */
1323 for (p = name; *p != '\0'; p++) {
1324 if (isdigit((unsigned char) *p)) {
1325 port_no = start_no + strtol(p, NULL, 10);
1326 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1327 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1328 return u32_to_odp(port_no);
1329 }
1330 break;
1331 }
1332 }
1333 }
1334
1335 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1336 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1337 return u32_to_odp(port_no);
1338 }
1339 }
1340
1341 return ODPP_NONE;
1342 }
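/* Example (an assumption, not part of the upstream file): for a dummy
 * datapath, a port named "p7" is first tried as port 7 and a port named
 * "br3" as port 103, because the "br" prefix starts the number search at
 * 100.  If that number is unavailable, the search falls back to the first
 * free port number starting from 1. */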
1343
1344 static int
1345 create_dp_netdev(const char *name, const struct dpif_class *class,
1346 struct dp_netdev **dpp)
1347 OVS_REQUIRES(dp_netdev_mutex)
1348 {
1349 struct dp_netdev *dp;
1350 int error;
1351
1352 dp = xzalloc(sizeof *dp);
1353 shash_add(&dp_netdevs, name, dp);
1354
1355 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1356 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1357 ovs_refcount_init(&dp->ref_cnt);
1358 atomic_flag_clear(&dp->destroyed);
1359
1360 ovs_mutex_init(&dp->port_mutex);
1361 hmap_init(&dp->ports);
1362 dp->port_seq = seq_create();
1363 fat_rwlock_init(&dp->upcall_rwlock);
1364
1365 dp->reconfigure_seq = seq_create();
1366 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1367
1368 for (int i = 0; i < N_METER_LOCKS; ++i) {
1369 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1370 }
1371
1372 /* Disable upcalls by default. */
1373 dp_netdev_disable_upcall(dp);
1374 dp->upcall_aux = NULL;
1375 dp->upcall_cb = NULL;
1376
1377 conntrack_init(&dp->conntrack);
1378
1379 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1380 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1381
1382 cmap_init(&dp->poll_threads);
1383
1384 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1385 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1386 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1387
1388 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1389 ovsthread_key_create(&dp->per_pmd_key, NULL);
1390
1391 ovs_mutex_lock(&dp->port_mutex);
1392 /* non-PMD will be created before all other threads and will
1393 * allocate static_tx_qid = 0. */
1394 dp_netdev_set_nonpmd(dp);
1395
1396 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1397 "internal"),
1398 ODPP_LOCAL);
1399 ovs_mutex_unlock(&dp->port_mutex);
1400 if (error) {
1401 dp_netdev_free(dp);
1402 return error;
1403 }
1404
1405 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1406 *dpp = dp;
1407 return 0;
1408 }
1409
1410 static void
1411 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1412 {
1413 seq_change(dp->reconfigure_seq);
1414 }
1415
1416 static bool
1417 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1418 {
1419 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1420 }
1421
1422 static int
1423 dpif_netdev_open(const struct dpif_class *class, const char *name,
1424 bool create, struct dpif **dpifp)
1425 {
1426 struct dp_netdev *dp;
1427 int error;
1428
1429 ovs_mutex_lock(&dp_netdev_mutex);
1430 dp = shash_find_data(&dp_netdevs, name);
1431 if (!dp) {
1432 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1433 } else {
1434 error = (dp->class != class ? EINVAL
1435 : create ? EEXIST
1436 : 0);
1437 }
1438 if (!error) {
1439 *dpifp = create_dpif_netdev(dp);
1440 dp->dpif = *dpifp;
1441 }
1442 ovs_mutex_unlock(&dp_netdev_mutex);
1443
1444 return error;
1445 }
1446
1447 static void
1448 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1449 OVS_NO_THREAD_SAFETY_ANALYSIS
1450 {
1451 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1452 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1453
1454 /* Before freeing a lock we should release it */
1455 fat_rwlock_unlock(&dp->upcall_rwlock);
1456 fat_rwlock_destroy(&dp->upcall_rwlock);
1457 }
1458
1459 static void
1460 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1461 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1462 {
1463 if (dp->meters[meter_id]) {
1464 free(dp->meters[meter_id]);
1465 dp->meters[meter_id] = NULL;
1466 }
1467 }
1468
1469 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1470 * through the 'dp_netdevs' shash while freeing 'dp'. */
1471 static void
1472 dp_netdev_free(struct dp_netdev *dp)
1473 OVS_REQUIRES(dp_netdev_mutex)
1474 {
1475 struct dp_netdev_port *port, *next;
1476
1477 shash_find_and_delete(&dp_netdevs, dp->name);
1478
1479 ovs_mutex_lock(&dp->port_mutex);
1480 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1481 do_del_port(dp, port);
1482 }
1483 ovs_mutex_unlock(&dp->port_mutex);
1484
1485 dp_netdev_destroy_all_pmds(dp, true);
1486 cmap_destroy(&dp->poll_threads);
1487
1488 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1489 id_pool_destroy(dp->tx_qid_pool);
1490
1491 ovs_mutex_destroy(&dp->non_pmd_mutex);
1492 ovsthread_key_delete(dp->per_pmd_key);
1493
1494 conntrack_destroy(&dp->conntrack);
1495
1496
1497 seq_destroy(dp->reconfigure_seq);
1498
1499 seq_destroy(dp->port_seq);
1500 hmap_destroy(&dp->ports);
1501 ovs_mutex_destroy(&dp->port_mutex);
1502
1503 /* Upcalls must be disabled at this point */
1504 dp_netdev_destroy_upcall_lock(dp);
1505
1506 int i;
1507
1508 for (i = 0; i < MAX_METERS; ++i) {
1509 meter_lock(dp, i);
1510 dp_delete_meter(dp, i);
1511 meter_unlock(dp, i);
1512 }
1513 for (i = 0; i < N_METER_LOCKS; ++i) {
1514 ovs_mutex_destroy(&dp->meter_locks[i]);
1515 }
1516
1517 free(dp->pmd_cmask);
1518 free(CONST_CAST(char *, dp->name));
1519 free(dp);
1520 }
1521
1522 static void
1523 dp_netdev_unref(struct dp_netdev *dp)
1524 {
1525 if (dp) {
1526 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1527 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1528 ovs_mutex_lock(&dp_netdev_mutex);
1529 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1530 dp_netdev_free(dp);
1531 }
1532 ovs_mutex_unlock(&dp_netdev_mutex);
1533 }
1534 }
1535
1536 static void
1537 dpif_netdev_close(struct dpif *dpif)
1538 {
1539 struct dp_netdev *dp = get_dp_netdev(dpif);
1540
1541 dp_netdev_unref(dp);
1542 free(dpif);
1543 }
1544
1545 static int
1546 dpif_netdev_destroy(struct dpif *dpif)
1547 {
1548 struct dp_netdev *dp = get_dp_netdev(dpif);
1549
1550 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1551 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1552 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1553 OVS_NOT_REACHED();
1554 }
1555 }
1556
1557 return 0;
1558 }
1559
1560 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1561 * load/store semantics. While the increment is not atomic, the load and
1562 * store operations are, making it impossible to read inconsistent values.
1563 *
1564 * This is used to update thread local stats counters. */
1565 static void
1566 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1567 {
1568 unsigned long long tmp;
1569
1570 atomic_read_relaxed(var, &tmp);
1571 tmp += n;
1572 atomic_store_relaxed(var, tmp);
1573 }
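/* Usage sketch (an assumption, not part of the upstream file): a typical
 * caller bumps a counter that only its own thread writes, such as the
 * per-rxq cycle counters in struct dp_netdev_rxq. */
#if 0
    non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], cycles);
#endif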
1574
1575 static int
1576 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1577 {
1578 struct dp_netdev *dp = get_dp_netdev(dpif);
1579 struct dp_netdev_pmd_thread *pmd;
1580 uint64_t pmd_stats[PMD_N_STATS];
1581
1582 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1583 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1584 stats->n_flows += cmap_count(&pmd->flow_table);
1585 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1586 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1587 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1588 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1589 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1590 }
1591 stats->n_masks = UINT32_MAX;
1592 stats->n_mask_hit = UINT64_MAX;
1593
1594 return 0;
1595 }
1596
1597 static void
1598 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1599 {
1600 if (pmd->core_id == NON_PMD_CORE_ID) {
1601 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1602 ovs_mutex_lock(&pmd->port_mutex);
1603 pmd_load_cached_ports(pmd);
1604 ovs_mutex_unlock(&pmd->port_mutex);
1605 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1606 return;
1607 }
1608
1609 ovs_mutex_lock(&pmd->cond_mutex);
1610 seq_change(pmd->reload_seq);
1611 atomic_store_relaxed(&pmd->reload, true);
1612 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1613 ovs_mutex_unlock(&pmd->cond_mutex);
1614 }
1615
1616 static uint32_t
1617 hash_port_no(odp_port_t port_no)
1618 {
1619 return hash_int(odp_to_u32(port_no), 0);
1620 }
1621
1622 static int
1623 port_create(const char *devname, const char *type,
1624 odp_port_t port_no, struct dp_netdev_port **portp)
1625 {
1626 struct netdev_saved_flags *sf;
1627 struct dp_netdev_port *port;
1628 enum netdev_flags flags;
1629 struct netdev *netdev;
1630 int error;
1631
1632 *portp = NULL;
1633
1634 /* Open and validate network device. */
1635 error = netdev_open(devname, type, &netdev);
1636 if (error) {
1637 return error;
1638 }
1639 /* XXX reject non-Ethernet devices */
1640
1641 netdev_get_flags(netdev, &flags);
1642 if (flags & NETDEV_LOOPBACK) {
1643 VLOG_ERR("%s: cannot add a loopback device", devname);
1644 error = EINVAL;
1645 goto out;
1646 }
1647
1648 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1649 if (error) {
1650 VLOG_ERR("%s: cannot set promisc flag", devname);
1651 goto out;
1652 }
1653
1654 port = xzalloc(sizeof *port);
1655 port->port_no = port_no;
1656 port->netdev = netdev;
1657 port->type = xstrdup(type);
1658 port->sf = sf;
1659 port->need_reconfigure = true;
1660 ovs_mutex_init(&port->txq_used_mutex);
1661
1662 *portp = port;
1663
1664 return 0;
1665
1666 out:
1667 netdev_close(netdev);
1668 return error;
1669 }
1670
1671 static int
1672 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1673 odp_port_t port_no)
1674 OVS_REQUIRES(dp->port_mutex)
1675 {
1676 struct dp_netdev_port *port;
1677 int error;
1678
1679 /* Reject devices already in 'dp'. */
1680 if (!get_port_by_name(dp, devname, &port)) {
1681 return EEXIST;
1682 }
1683
1684 error = port_create(devname, type, port_no, &port);
1685 if (error) {
1686 return error;
1687 }
1688
1689 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1690 seq_change(dp->port_seq);
1691
1692 reconfigure_datapath(dp);
1693
1694 return 0;
1695 }
1696
1697 static int
1698 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1699 odp_port_t *port_nop)
1700 {
1701 struct dp_netdev *dp = get_dp_netdev(dpif);
1702 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1703 const char *dpif_port;
1704 odp_port_t port_no;
1705 int error;
1706
1707 ovs_mutex_lock(&dp->port_mutex);
1708 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1709 if (*port_nop != ODPP_NONE) {
1710 port_no = *port_nop;
1711 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1712 } else {
1713 port_no = choose_port(dp, dpif_port);
1714 error = port_no == ODPP_NONE ? EFBIG : 0;
1715 }
1716 if (!error) {
1717 *port_nop = port_no;
1718 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1719 }
1720 ovs_mutex_unlock(&dp->port_mutex);
1721
1722 return error;
1723 }
1724
1725 static int
1726 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1727 {
1728 struct dp_netdev *dp = get_dp_netdev(dpif);
1729 int error;
1730
1731 ovs_mutex_lock(&dp->port_mutex);
1732 if (port_no == ODPP_LOCAL) {
1733 error = EINVAL;
1734 } else {
1735 struct dp_netdev_port *port;
1736
1737 error = get_port_by_number(dp, port_no, &port);
1738 if (!error) {
1739 do_del_port(dp, port);
1740 }
1741 }
1742 ovs_mutex_unlock(&dp->port_mutex);
1743
1744 return error;
1745 }
1746
1747 static bool
1748 is_valid_port_number(odp_port_t port_no)
1749 {
1750 return port_no != ODPP_NONE;
1751 }
1752
1753 static struct dp_netdev_port *
1754 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1755 OVS_REQUIRES(dp->port_mutex)
1756 {
1757 struct dp_netdev_port *port;
1758
1759 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1760 if (port->port_no == port_no) {
1761 return port;
1762 }
1763 }
1764 return NULL;
1765 }
1766
1767 static int
1768 get_port_by_number(struct dp_netdev *dp,
1769 odp_port_t port_no, struct dp_netdev_port **portp)
1770 OVS_REQUIRES(dp->port_mutex)
1771 {
1772 if (!is_valid_port_number(port_no)) {
1773 *portp = NULL;
1774 return EINVAL;
1775 } else {
1776 *portp = dp_netdev_lookup_port(dp, port_no);
1777 return *portp ? 0 : ENODEV;
1778 }
1779 }
1780
1781 static void
1782 port_destroy(struct dp_netdev_port *port)
1783 {
1784 if (!port) {
1785 return;
1786 }
1787
1788 netdev_close(port->netdev);
1789 netdev_restore_flags(port->sf);
1790
1791 for (unsigned i = 0; i < port->n_rxq; i++) {
1792 netdev_rxq_close(port->rxqs[i].rx);
1793 }
1794 ovs_mutex_destroy(&port->txq_used_mutex);
1795 free(port->rxq_affinity_list);
1796 free(port->txq_used);
1797 free(port->rxqs);
1798 free(port->type);
1799 free(port);
1800 }
1801
1802 static int
1803 get_port_by_name(struct dp_netdev *dp,
1804 const char *devname, struct dp_netdev_port **portp)
1805 OVS_REQUIRES(dp->port_mutex)
1806 {
1807 struct dp_netdev_port *port;
1808
1809 HMAP_FOR_EACH (port, node, &dp->ports) {
1810 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1811 *portp = port;
1812 return 0;
1813 }
1814 }
1815
1816 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
1817 * nonexistent port. */
1818 return ENODEV;
1819 }
1820
1821 /* Returns 'true' if there is a port with a pmd netdev. */
1822 static bool
1823 has_pmd_port(struct dp_netdev *dp)
1824 OVS_REQUIRES(dp->port_mutex)
1825 {
1826 struct dp_netdev_port *port;
1827
1828 HMAP_FOR_EACH (port, node, &dp->ports) {
1829 if (netdev_is_pmd(port->netdev)) {
1830 return true;
1831 }
1832 }
1833
1834 return false;
1835 }
1836
1837 static void
1838 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1839 OVS_REQUIRES(dp->port_mutex)
1840 {
1841 hmap_remove(&dp->ports, &port->node);
1842 seq_change(dp->port_seq);
1843
1844 reconfigure_datapath(dp);
1845
1846 port_destroy(port);
1847 }
1848
1849 static void
1850 answer_port_query(const struct dp_netdev_port *port,
1851 struct dpif_port *dpif_port)
1852 {
1853 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1854 dpif_port->type = xstrdup(port->type);
1855 dpif_port->port_no = port->port_no;
1856 }
1857
1858 static int
1859 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1860 struct dpif_port *dpif_port)
1861 {
1862 struct dp_netdev *dp = get_dp_netdev(dpif);
1863 struct dp_netdev_port *port;
1864 int error;
1865
1866 ovs_mutex_lock(&dp->port_mutex);
1867 error = get_port_by_number(dp, port_no, &port);
1868 if (!error && dpif_port) {
1869 answer_port_query(port, dpif_port);
1870 }
1871 ovs_mutex_unlock(&dp->port_mutex);
1872
1873 return error;
1874 }
1875
1876 static int
1877 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1878 struct dpif_port *dpif_port)
1879 {
1880 struct dp_netdev *dp = get_dp_netdev(dpif);
1881 struct dp_netdev_port *port;
1882 int error;
1883
1884 ovs_mutex_lock(&dp->port_mutex);
1885 error = get_port_by_name(dp, devname, &port);
1886 if (!error && dpif_port) {
1887 answer_port_query(port, dpif_port);
1888 }
1889 ovs_mutex_unlock(&dp->port_mutex);
1890
1891 return error;
1892 }
1893
1894 static void
1895 dp_netdev_flow_free(struct dp_netdev_flow *flow)
1896 {
1897 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1898 free(flow);
1899 }
1900
1901 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1902 {
1903 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1904 ovsrcu_postpone(dp_netdev_flow_free, flow);
1905 }
1906 }
1907
1908 static uint32_t
1909 dp_netdev_flow_hash(const ovs_u128 *ufid)
1910 {
1911 return ufid->u32[0];
1912 }
1913
1914 static inline struct dpcls *
1915 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1916 odp_port_t in_port)
1917 {
1918 struct dpcls *cls;
1919 uint32_t hash = hash_port_no(in_port);
1920 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1921 if (cls->in_port == in_port) {
1922 /* Port classifier exists already */
1923 return cls;
1924 }
1925 }
1926 return NULL;
1927 }
1928
1929 static inline struct dpcls *
1930 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1931 odp_port_t in_port)
1932 OVS_REQUIRES(pmd->flow_mutex)
1933 {
1934 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1935 uint32_t hash = hash_port_no(in_port);
1936
1937 if (!cls) {
1938 /* Create new classifier for in_port */
1939 cls = xmalloc(sizeof(*cls));
1940 dpcls_init(cls);
1941 cls->in_port = in_port;
1942 cmap_insert(&pmd->classifiers, &cls->node, hash);
1943 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1944 }
1945 return cls;
1946 }
1947
1948 #define MAX_FLOW_MARK (UINT32_MAX - 1)
1949 #define INVALID_FLOW_MARK (UINT32_MAX)
1950
1951 struct megaflow_to_mark_data {
1952 const struct cmap_node node;
1953 ovs_u128 mega_ufid;
1954 uint32_t mark;
1955 };
1956
1957 struct flow_mark {
1958 struct cmap megaflow_to_mark;
1959 struct cmap mark_to_flow;
1960 struct id_pool *pool;
1961 struct ovs_mutex mutex;
1962 };
1963
1964 static struct flow_mark flow_mark = {
1965 .megaflow_to_mark = CMAP_INITIALIZER,
1966 .mark_to_flow = CMAP_INITIALIZER,
1967 .mutex = OVS_MUTEX_INITIALIZER,
1968 };
1969
1970 static uint32_t
1971 flow_mark_alloc(void)
1972 {
1973 uint32_t mark;
1974
1975 if (!flow_mark.pool) {
1976 /* Hasn't been initialized yet; do it here. */
1977 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
1978 }
1979
1980 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
1981 return mark;
1982 }
1983
1984 return INVALID_FLOW_MARK;
1985 }
1986
1987 static void
1988 flow_mark_free(uint32_t mark)
1989 {
1990 id_pool_free_id(flow_mark.pool, mark);
1991 }
1992
1993 /* Associates a megaflow with a mark, which is a 1:1 mapping. */
1994 static void
1995 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
1996 {
1997 size_t hash = dp_netdev_flow_hash(mega_ufid);
1998 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
1999
2000 data->mega_ufid = *mega_ufid;
2001 data->mark = mark;
2002
2003 cmap_insert(&flow_mark.megaflow_to_mark,
2004 CONST_CAST(struct cmap_node *, &data->node), hash);
2005 }
2006
2007 /* Disassociates a megaflow from its mark. */
2008 static void
2009 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2010 {
2011 size_t hash = dp_netdev_flow_hash(mega_ufid);
2012 struct megaflow_to_mark_data *data;
2013
2014 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2015 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2016 cmap_remove(&flow_mark.megaflow_to_mark,
2017 CONST_CAST(struct cmap_node *, &data->node), hash);
2018 free(data);
2019 return;
2020 }
2021 }
2022
2023 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2024 UUID_ARGS((struct uuid *)mega_ufid));
2025 }
2026
2027 static inline uint32_t
2028 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2029 {
2030 size_t hash = dp_netdev_flow_hash(mega_ufid);
2031 struct megaflow_to_mark_data *data;
2032
2033 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2034 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2035 return data->mark;
2036 }
2037 }
2038
2039 VLOG_WARN("Mark id for ufid "UUID_FMT" was not found\n",
2040 UUID_ARGS((struct uuid *)mega_ufid));
2041 return INVALID_FLOW_MARK;
2042 }
2043
2044 /* Associates a mark with a flow, which is a 1:N mapping. */
2045 static void
2046 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2047 {
2048 dp_netdev_flow_ref(flow);
2049
2050 cmap_insert(&flow_mark.mark_to_flow,
2051 CONST_CAST(struct cmap_node *, &flow->mark_node),
2052 hash_int(mark, 0));
2053 flow->mark = mark;
2054
2055 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2056 }
2057
2058 static bool
2059 flow_mark_has_no_ref(uint32_t mark)
2060 {
2061 struct dp_netdev_flow *flow;
2062
2063 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2064 &flow_mark.mark_to_flow) {
2065 if (flow->mark == mark) {
2066 return false;
2067 }
2068 }
2069
2070 return true;
2071 }
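
/* Illustrative sketch (hypothetical helper, not called by the datapath):
 * the 1:N mark_to_flow map above can also be walked in the other
 * direction, resolving a mark delivered with a packet back to the
 * dp_netdev_flow owned by a given PMD thread.  It reuses only fields and
 * macros already used in this file. */
static struct dp_netdev_flow *
mark_to_flow_find_example(const struct dp_netdev_pmd_thread *pmd,
                          uint32_t mark)
{
    struct dp_netdev_flow *flow;

    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
                             &flow_mark.mark_to_flow) {
        if (flow->mark == mark && flow->pmd_id == pmd->core_id
            && !flow->dead) {
            return flow;
        }
    }

    return NULL;
}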
2072
2073 static int
2074 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2075 struct dp_netdev_flow *flow)
2076 {
2077 int ret = 0;
2078 uint32_t mark = flow->mark;
2079 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2080 &flow->mark_node);
2081
2082 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2083 flow->mark = INVALID_FLOW_MARK;
2084
2085 /*
2086 * If no flow references the mark any more, remove the flow from
2087 * hardware and free the mark.
2088 */
2089 if (flow_mark_has_no_ref(mark)) {
2090 struct dp_netdev_port *port;
2091 odp_port_t in_port = flow->flow.in_port.odp_port;
2092
2093 ovs_mutex_lock(&pmd->dp->port_mutex);
2094 port = dp_netdev_lookup_port(pmd->dp, in_port);
2095 if (port) {
2096 ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2097 }
2098 ovs_mutex_unlock(&pmd->dp->port_mutex);
2099
2100 flow_mark_free(mark);
2101 VLOG_DBG("Freed flow mark %u\n", mark);
2102
2103 megaflow_to_mark_disassociate(&flow->mega_ufid);
2104 }
2105 dp_netdev_flow_unref(flow);
2106
2107 return ret;
2108 }
2109
2110 static void
2111 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2112 {
2113 struct dp_netdev_flow *flow;
2114
2115 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2116 if (flow->pmd_id == pmd->core_id) {
2117 mark_to_flow_disassociate(pmd, flow);
2118 }
2119 }
2120 }
2121
2122 static void
2123 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2124 struct dp_netdev_flow *flow)
2125 OVS_REQUIRES(pmd->flow_mutex)
2126 {
2127 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2128 struct dpcls *cls;
2129 odp_port_t in_port = flow->flow.in_port.odp_port;
2130
2131 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2132 ovs_assert(cls != NULL);
2133 dpcls_remove(cls, &flow->cr);
2134 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2135 if (flow->mark != INVALID_FLOW_MARK) {
2136 mark_to_flow_disassociate(pmd, flow);
2137 }
2138 flow->dead = true;
2139
2140 dp_netdev_flow_unref(flow);
2141 }
2142
2143 static void
2144 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2145 {
2146 struct dp_netdev_flow *netdev_flow;
2147
2148 ovs_mutex_lock(&pmd->flow_mutex);
2149 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2150 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2151 }
2152 ovs_mutex_unlock(&pmd->flow_mutex);
2153 }
2154
2155 static int
2156 dpif_netdev_flow_flush(struct dpif *dpif)
2157 {
2158 struct dp_netdev *dp = get_dp_netdev(dpif);
2159 struct dp_netdev_pmd_thread *pmd;
2160
2161 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2162 dp_netdev_pmd_flow_flush(pmd);
2163 }
2164
2165 return 0;
2166 }
2167
2168 struct dp_netdev_port_state {
2169 struct hmap_position position;
2170 char *name;
2171 };
2172
2173 static int
2174 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2175 {
2176 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2177 return 0;
2178 }
2179
2180 static int
2181 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2182 struct dpif_port *dpif_port)
2183 {
2184 struct dp_netdev_port_state *state = state_;
2185 struct dp_netdev *dp = get_dp_netdev(dpif);
2186 struct hmap_node *node;
2187 int retval;
2188
2189 ovs_mutex_lock(&dp->port_mutex);
2190 node = hmap_at_position(&dp->ports, &state->position);
2191 if (node) {
2192 struct dp_netdev_port *port;
2193
2194 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2195
2196 free(state->name);
2197 state->name = xstrdup(netdev_get_name(port->netdev));
2198 dpif_port->name = state->name;
2199 dpif_port->type = port->type;
2200 dpif_port->port_no = port->port_no;
2201
2202 retval = 0;
2203 } else {
2204 retval = EOF;
2205 }
2206 ovs_mutex_unlock(&dp->port_mutex);
2207
2208 return retval;
2209 }
2210
2211 static int
2212 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2213 {
2214 struct dp_netdev_port_state *state = state_;
2215 free(state->name);
2216 free(state);
2217 return 0;
2218 }
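
/* Illustrative usage sketch of the port dump protocol implemented above:
 * dump_start() allocates the iteration state, dump_next() fills one
 * 'struct dpif_port' per call until it returns EOF, and dump_done()
 * releases the state.  Real callers go through the generic dpif layer;
 * this helper is hypothetical. */
static void
port_dump_usage_example(const struct dpif *dpif)
{
    struct dpif_port dpif_port;
    void *state;

    dpif_netdev_port_dump_start(dpif, &state);
    while (!dpif_netdev_port_dump_next(dpif, state, &dpif_port)) {
        /* 'dpif_port.name' points into storage owned by 'state' and is
         * reused on the next call, so consume it before iterating. */
    }
    dpif_netdev_port_dump_done(dpif, state);
}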
2219
2220 static int
2221 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2222 {
2223 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2224 uint64_t new_port_seq;
2225 int error;
2226
2227 new_port_seq = seq_read(dpif->dp->port_seq);
2228 if (dpif->last_port_seq != new_port_seq) {
2229 dpif->last_port_seq = new_port_seq;
2230 error = ENOBUFS;
2231 } else {
2232 error = EAGAIN;
2233 }
2234
2235 return error;
2236 }
2237
2238 static void
2239 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2240 {
2241 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2242
2243 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2244 }
2245
2246 static struct dp_netdev_flow *
2247 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2248 {
2249 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2250 }
2251
2252 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2253 {
2254 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2255 }
2256
2257 /* netdev_flow_key utilities.
2258 *
2259 * netdev_flow_key is basically a miniflow. We use these functions
2260 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2261 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2262 *
2263 * - Since we are dealing exclusively with miniflows created by
2264 * miniflow_extract(), if the map is different the miniflow is different.
2265 * Therefore we can be faster by comparing the map and the miniflow in a
2266 * single memcmp().
2267 * - These functions can be inlined by the compiler. */
2268
2269 /* Given the number of bits set in miniflow's maps, returns the size of the
2270 * 'netdev_flow_key.mf' */
2271 static inline size_t
2272 netdev_flow_key_size(size_t flow_u64s)
2273 {
2274 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2275 }
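
/* Worked example for netdev_flow_key_size() (hedged: assumes
 * MINIFLOW_VALUES_SIZE(n) expands to n * sizeof(uint64_t), as it does
 * where 'struct miniflow' is defined): a key whose miniflow carries six
 * populated 64-bit values occupies sizeof(struct miniflow) + 48 bytes,
 * and that value becomes the 'len' used by the memcmp()-based
 * comparisons below.  This helper is purely illustrative. */
static inline size_t
netdev_flow_key_size_example(void)
{
    return netdev_flow_key_size(6);   /* == sizeof(struct miniflow) + 48. */
}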
2276
2277 static inline bool
2278 netdev_flow_key_equal(const struct netdev_flow_key *a,
2279 const struct netdev_flow_key *b)
2280 {
2281 /* 'b->len' may not be set yet. */
2282 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2283 }
2284
2285 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2286 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2287 * generated by miniflow_extract. */
2288 static inline bool
2289 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2290 const struct miniflow *mf)
2291 {
2292 return !memcmp(&key->mf, mf, key->len);
2293 }
2294
2295 static inline void
2296 netdev_flow_key_clone(struct netdev_flow_key *dst,
2297 const struct netdev_flow_key *src)
2298 {
2299 memcpy(dst, src,
2300 offsetof(struct netdev_flow_key, mf) + src->len);
2301 }
2302
2303 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2304 static inline void
2305 netdev_flow_mask_init(struct netdev_flow_key *mask,
2306 const struct match *match)
2307 {
2308 uint64_t *dst = miniflow_values(&mask->mf);
2309 struct flowmap fmap;
2310 uint32_t hash = 0;
2311 size_t idx;
2312
2313 /* Only check masks that make sense for the flow. */
2314 flow_wc_map(&match->flow, &fmap);
2315 flowmap_init(&mask->mf.map);
2316
2317 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2318 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2319
2320 if (mask_u64) {
2321 flowmap_set(&mask->mf.map, idx, 1);
2322 *dst++ = mask_u64;
2323 hash = hash_add64(hash, mask_u64);
2324 }
2325 }
2326
2327 map_t map;
2328
2329 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2330 hash = hash_add64(hash, map);
2331 }
2332
2333 size_t n = dst - miniflow_get_values(&mask->mf);
2334
2335 mask->hash = hash_finish(hash, n * 8);
2336 mask->len = netdev_flow_key_size(n);
2337 }
2338
2339 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2340 static inline void
2341 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2342 const struct flow *flow,
2343 const struct netdev_flow_key *mask)
2344 {
2345 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2346 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2347 uint32_t hash = 0;
2348 uint64_t value;
2349
2350 dst->len = mask->len;
2351 dst->mf = mask->mf; /* Copy maps. */
2352
2353 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2354 *dst_u64 = value & *mask_u64++;
2355 hash = hash_add64(hash, *dst_u64++);
2356 }
2357 dst->hash = hash_finish(hash,
2358 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2359 }
2360
2361 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
2362 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
2363 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
2364
2365 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2366 * 'mask'. */
2367 static inline uint32_t
2368 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2369 const struct netdev_flow_key *mask)
2370 {
2371 const uint64_t *p = miniflow_get_values(&mask->mf);
2372 uint32_t hash = 0;
2373 uint64_t value;
2374
2375 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2376 hash = hash_add64(hash, value & *p++);
2377 }
2378
2379 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2380 }
2381
2382 static inline bool
2383 emc_entry_alive(struct emc_entry *ce)
2384 {
2385 return ce->flow && !ce->flow->dead;
2386 }
2387
2388 static void
2389 emc_clear_entry(struct emc_entry *ce)
2390 {
2391 if (ce->flow) {
2392 dp_netdev_flow_unref(ce->flow);
2393 ce->flow = NULL;
2394 }
2395 }
2396
2397 static inline void
2398 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2399 const struct netdev_flow_key *key)
2400 {
2401 if (ce->flow != flow) {
2402 if (ce->flow) {
2403 dp_netdev_flow_unref(ce->flow);
2404 }
2405
2406 if (dp_netdev_flow_ref(flow)) {
2407 ce->flow = flow;
2408 } else {
2409 ce->flow = NULL;
2410 }
2411 }
2412 if (key) {
2413 netdev_flow_key_clone(&ce->key, key);
2414 }
2415 }
2416
2417 static inline void
2418 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2419 struct dp_netdev_flow *flow)
2420 {
2421 struct emc_entry *to_be_replaced = NULL;
2422 struct emc_entry *current_entry;
2423
2424 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2425 if (netdev_flow_key_equal(&current_entry->key, key)) {
2426 /* We found the entry with the 'mf' miniflow */
2427 emc_change_entry(current_entry, flow, NULL);
2428 return;
2429 }
2430
2431 /* Replacement policy: put the flow in an empty (not alive) entry, or,
2432 * failing that, in the candidate entry with the lowest key hash. */
2433 if (!to_be_replaced
2434 || (emc_entry_alive(to_be_replaced)
2435 && !emc_entry_alive(current_entry))
2436 || current_entry->key.hash < to_be_replaced->key.hash) {
2437 to_be_replaced = current_entry;
2438 }
2439 }
2440 /* We didn't find the miniflow in the cache.
2441 * The 'to_be_replaced' entry is where the new flow will be stored */
2442
2443 emc_change_entry(to_be_replaced, flow, key);
2444 }
2445
2446 static inline void
2447 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2448 const struct netdev_flow_key *key,
2449 struct dp_netdev_flow *flow)
2450 {
2451 /* Insert an entry into the EMC based on probability value 'min'. By
2452 * default the value is UINT32_MAX / 100, which yields an insertion
2453 * probability of 1/100, i.e. 1%. */
2454
2455 uint32_t min;
2456 atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
2457
2458 if (min && random_uint32() <= min) {
2459 emc_insert(&pmd->flow_cache, key, flow);
2460 }
2461 }
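
/* Illustrative sketch (hypothetical helper, not used by the datapath):
 * with the default emc-insert-inv-prob of 100, 'min' is UINT32_MAX / 100,
 * so a uniformly distributed 32-bit random value is <= 'min' for roughly
 * one packet in a hundred, while 'min' == 0 disables insertion entirely.
 * This is exactly the check performed by emc_probabilistic_insert()
 * above, factored out for clarity. */
static inline bool
emc_should_insert_example(uint32_t min, uint32_t random_value)
{
    return min && random_value <= min;
}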
2462
2463 static inline struct dp_netdev_flow *
2464 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2465 {
2466 struct emc_entry *current_entry;
2467
2468 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2469 if (current_entry->key.hash == key->hash
2470 && emc_entry_alive(current_entry)
2471 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2472
2473 /* We found the entry with the 'key->mf' miniflow */
2474 return current_entry->flow;
2475 }
2476 }
2477
2478 return NULL;
2479 }
2480
2481 static struct dp_netdev_flow *
2482 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2483 const struct netdev_flow_key *key,
2484 int *lookup_num_p)
2485 {
2486 struct dpcls *cls;
2487 struct dpcls_rule *rule;
2488 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2489 in_port.odp_port));
2490 struct dp_netdev_flow *netdev_flow = NULL;
2491
2492 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2493 if (OVS_LIKELY(cls)) {
2494 dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
2495 netdev_flow = dp_netdev_flow_cast(rule);
2496 }
2497 return netdev_flow;
2498 }
2499
2500 static struct dp_netdev_flow *
2501 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2502 const ovs_u128 *ufidp, const struct nlattr *key,
2503 size_t key_len)
2504 {
2505 struct dp_netdev_flow *netdev_flow;
2506 struct flow flow;
2507 ovs_u128 ufid;
2508
2509 /* If a UFID is not provided, determine one based on the key. */
2510 if (!ufidp && key && key_len
2511 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2512 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2513 ufidp = &ufid;
2514 }
2515
2516 if (ufidp) {
2517 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2518 &pmd->flow_table) {
2519 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2520 return netdev_flow;
2521 }
2522 }
2523 }
2524
2525 return NULL;
2526 }
2527
2528 static void
2529 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
2530 struct dpif_flow_stats *stats)
2531 {
2532 struct dp_netdev_flow *netdev_flow;
2533 unsigned long long n;
2534 long long used;
2535 uint16_t flags;
2536
2537 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2538
2539 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2540 stats->n_packets = n;
2541 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2542 stats->n_bytes = n;
2543 atomic_read_relaxed(&netdev_flow->stats.used, &used);
2544 stats->used = used;
2545 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2546 stats->tcp_flags = flags;
2547 }
2548
2549 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2550 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2551 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2552 * protect them. */
2553 static void
2554 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
2555 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2556 struct dpif_flow *flow, bool terse)
2557 {
2558 if (terse) {
2559 memset(flow, 0, sizeof *flow);
2560 } else {
2561 struct flow_wildcards wc;
2562 struct dp_netdev_actions *actions;
2563 size_t offset;
2564 struct odp_flow_key_parms odp_parms = {
2565 .flow = &netdev_flow->flow,
2566 .mask = &wc.masks,
2567 .support = dp_netdev_support,
2568 };
2569
2570 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2571 /* in_port is exact matched, but we have left it out of the mask for
2572 * optimization reasons. Add in_port back to the mask. */
2573 wc.masks.in_port.odp_port = ODPP_NONE;
2574
2575 /* Key */
2576 offset = key_buf->size;
2577 flow->key = ofpbuf_tail(key_buf);
2578 odp_flow_key_from_flow(&odp_parms, key_buf);
2579 flow->key_len = key_buf->size - offset;
2580
2581 /* Mask */
2582 offset = mask_buf->size;
2583 flow->mask = ofpbuf_tail(mask_buf);
2584 odp_parms.key_buf = key_buf;
2585 odp_flow_key_from_mask(&odp_parms, mask_buf);
2586 flow->mask_len = mask_buf->size - offset;
2587
2588 /* Actions */
2589 actions = dp_netdev_flow_get_actions(netdev_flow);
2590 flow->actions = actions->actions;
2591 flow->actions_len = actions->size;
2592 }
2593
2594 flow->ufid = netdev_flow->ufid;
2595 flow->ufid_present = true;
2596 flow->pmd_id = netdev_flow->pmd_id;
2597 get_dpif_flow_stats(netdev_flow, &flow->stats);
2598 }
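
/* Illustrative usage sketch for dp_netdev_flow_to_dpif_flow(): callers
 * typically back 'key_buf' and 'mask_buf' with stack storage, as the
 * flow dump code later in this file does.  The helper and its name are
 * hypothetical. */
static void
flow_to_dpif_flow_usage_example(const struct dp_netdev_flow *netdev_flow,
                                struct dpif_flow *f)
{
    struct odputil_keybuf keybuf, maskbuf;
    struct ofpbuf key, mask;

    ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
    ofpbuf_use_stack(&mask, &maskbuf, sizeof maskbuf);
    dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f, false);
}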
2599
2600 static int
2601 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2602 const struct nlattr *mask_key,
2603 uint32_t mask_key_len, const struct flow *flow,
2604 struct flow_wildcards *wc, bool probe)
2605 {
2606 enum odp_key_fitness fitness;
2607
2608 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
2609 if (fitness) {
2610 if (!probe) {
2611 /* This should not happen: it indicates that
2612 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2613 * disagree on the acceptable form of a mask. Log the problem
2614 * as an error, with enough details to enable debugging. */
2615 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2616
2617 if (!VLOG_DROP_ERR(&rl)) {
2618 struct ds s;
2619
2620 ds_init(&s);
2621 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2622 true);
2623 VLOG_ERR("internal error parsing flow mask %s (%s)",
2624 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2625 ds_destroy(&s);
2626 }
2627 }
2628
2629 return EINVAL;
2630 }
2631
2632 return 0;
2633 }
2634
2635 static int
2636 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2637 struct flow *flow, bool probe)
2638 {
2639 if (odp_flow_key_to_flow(key, key_len, flow)) {
2640 if (!probe) {
2641 /* This should not happen: it indicates that
2642 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
2643 * the acceptable form of a flow. Log the problem as an error,
2644 * with enough details to enable debugging. */
2645 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2646
2647 if (!VLOG_DROP_ERR(&rl)) {
2648 struct ds s;
2649
2650 ds_init(&s);
2651 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2652 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2653 ds_destroy(&s);
2654 }
2655 }
2656
2657 return EINVAL;
2658 }
2659
2660 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2661 return EINVAL;
2662 }
2663
2664 return 0;
2665 }
2666
2667 static int
2668 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2669 {
2670 struct dp_netdev *dp = get_dp_netdev(dpif);
2671 struct dp_netdev_flow *netdev_flow;
2672 struct dp_netdev_pmd_thread *pmd;
2673 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2674 struct hmapx_node *node;
2675 int error = EINVAL;
2676
2677 if (get->pmd_id == PMD_ID_NULL) {
2678 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2679 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2680 dp_netdev_pmd_unref(pmd);
2681 }
2682 }
2683 } else {
2684 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2685 if (!pmd) {
2686 goto out;
2687 }
2688 hmapx_add(&to_find, pmd);
2689 }
2690
2691 if (!hmapx_count(&to_find)) {
2692 goto out;
2693 }
2694
2695 HMAPX_FOR_EACH (node, &to_find) {
2696 pmd = (struct dp_netdev_pmd_thread *) node->data;
2697 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2698 get->key_len);
2699 if (netdev_flow) {
2700 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2701 get->flow, false);
2702 error = 0;
2703 break;
2704 } else {
2705 error = ENOENT;
2706 }
2707 }
2708
2709 HMAPX_FOR_EACH (node, &to_find) {
2710 pmd = (struct dp_netdev_pmd_thread *) node->data;
2711 dp_netdev_pmd_unref(pmd);
2712 }
2713 out:
2714 hmapx_destroy(&to_find);
2715 return error;
2716 }
2717
2718 /*
2719 * There are two flow offload operations here: addition and modification.
2720 *
2721 * For flow addition, this function does:
2722 * - allocate a new flow mark id
2723 * - perform hardware flow offload
2724 * - associate the flow mark with the flow and the megaflow
2725 *
2726 * For flow modification, both the flow mark and the associations are still
2727 * valid, thus only item 2 is needed (see the sketch after this function).
2728 */
2729 static void
2730 try_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, odp_port_t in_port,
2731 struct dp_netdev_flow *flow, struct match *match,
2732 const struct nlattr *actions, size_t actions_len)
2733 {
2734 struct offload_info info;
2735 struct dp_netdev_port *port;
2736 bool modification = flow->mark != INVALID_FLOW_MARK;
2737 const char *op = modification ? "modify" : "add";
2738 uint32_t mark;
2739 int ret;
2740
2741 ovs_mutex_lock(&flow_mark.mutex);
2742
2743 if (modification) {
2744 mark = flow->mark;
2745 } else {
2746 if (!netdev_is_flow_api_enabled()) {
2747 goto out;
2748 }
2749
2750 /*
2751 * If a mega flow has already been offloaded (from other PMD
2752 * instances), do not offload it again.
2753 */
2754 mark = megaflow_to_mark_find(&flow->mega_ufid);
2755 if (mark != INVALID_FLOW_MARK) {
2756 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2757 mark_to_flow_associate(mark, flow);
2758 goto out;
2759 }
2760
2761 mark = flow_mark_alloc();
2762 if (mark == INVALID_FLOW_MARK) {
2763 VLOG_ERR("Failed to allocate flow mark!\n");
2764 goto out;
2765 }
2766 }
2767 info.flow_mark = mark;
2768
2769 ovs_mutex_lock(&pmd->dp->port_mutex);
2770 port = dp_netdev_lookup_port(pmd->dp, in_port);
2771 if (!port) {
2772 ovs_mutex_unlock(&pmd->dp->port_mutex);
2773 goto out;
2774 }
2775 ret = netdev_flow_put(port->netdev, match,
2776 CONST_CAST(struct nlattr *, actions),
2777 actions_len, &flow->mega_ufid, &info, NULL);
2778 ovs_mutex_unlock(&pmd->dp->port_mutex);
2779
2780 if (ret) {
2781 VLOG_ERR("Failed to %s netdev flow with mark %u\n", op, mark);
2782 if (!modification) {
2783 flow_mark_free(mark);
2784 } else {
2785 mark_to_flow_disassociate(pmd, flow);
2786 }
2787 goto out;
2788 }
2789
2790 if (!modification) {
2791 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2792 mark_to_flow_associate(mark, flow);
2793 }
2794 VLOG_DBG("Succeed to %s netdev flow with mark %u\n", op, mark);
2795
2796 out:
2797 ovs_mutex_unlock(&flow_mark.mutex);
2798 }
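
/* Illustrative sketch of the "add" path described above, with locking,
 * the duplicate-offload check and most error handling elided: allocate a
 * mark, hand it to the netdev through 'struct offload_info', then record
 * both mappings.  This helper is hypothetical and not called anywhere. */
static void
flow_offload_add_sketch(struct dp_netdev_port *port,
                        struct dp_netdev_flow *flow, struct match *match,
                        const struct nlattr *actions, size_t actions_len)
{
    struct offload_info info;
    uint32_t mark = flow_mark_alloc();

    if (mark == INVALID_FLOW_MARK) {
        return;
    }

    info.flow_mark = mark;
    if (!netdev_flow_put(port->netdev, match,
                         CONST_CAST(struct nlattr *, actions), actions_len,
                         &flow->mega_ufid, &info, NULL)) {
        megaflow_to_mark_associate(&flow->mega_ufid, mark);
        mark_to_flow_associate(mark, flow);
    } else {
        flow_mark_free(mark);
    }
}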
2799
2800 static void
2801 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
2802 {
2803 struct flow masked_flow;
2804 size_t i;
2805
2806 for (i = 0; i < sizeof(struct flow); i++) {
2807 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
2808 ((uint8_t *)&match->wc)[i];
2809 }
2810 dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
2811 }
2812
2813 static struct dp_netdev_flow *
2814 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2815 struct match *match, const ovs_u128 *ufid,
2816 const struct nlattr *actions, size_t actions_len)
2817 OVS_REQUIRES(pmd->flow_mutex)
2818 {
2819 struct dp_netdev_flow *flow;
2820 struct netdev_flow_key mask;
2821 struct dpcls *cls;
2822
2823 /* Make sure in_port is exact-matched before we read it. */
2824 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
2825 odp_port_t in_port = match->flow.in_port.odp_port;
2826
2827 /* As we select the dpcls based on the port number, each netdev flow
2828 * belonging to the same dpcls will have the same odp_port value.
2829 * For performance reasons we wildcard odp_port here in the mask. In the
2830 * typical case dp_hash is also wildcarded, and the resulting 8-byte
2831 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
2832 * will not be part of the subtable mask.
2833 * This will speed up the hash computation during dpcls_lookup() because
2834 * there is one less call to hash_add64() in this case. */
2835 match->wc.masks.in_port.odp_port = 0;
2836 netdev_flow_mask_init(&mask, match);
2837 match->wc.masks.in_port.odp_port = ODPP_NONE;
2838
2839 /* Make sure wc does not have metadata. */
2840 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2841 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2842
2843 /* Do not allocate extra space. */
2844 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2845 memset(&flow->stats, 0, sizeof flow->stats);
2846 flow->dead = false;
2847 flow->batch = NULL;
2848 flow->mark = INVALID_FLOW_MARK;
2849 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2850 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2851 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2852 ovs_refcount_init(&flow->ref_cnt);
2853 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2854
2855 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
2856 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2857
2858 /* Select dpcls for in_port. Relies on in_port to be exact match. */
2859 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
2860 dpcls_insert(cls, &flow->cr, &mask);
2861
2862 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2863 dp_netdev_flow_hash(&flow->ufid));
2864
2865 try_netdev_flow_put(pmd, in_port, flow, match, actions, actions_len);
2866
2867 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
2868 struct ds ds = DS_EMPTY_INITIALIZER;
2869 struct ofpbuf key_buf, mask_buf;
2870 struct odp_flow_key_parms odp_parms = {
2871 .flow = &match->flow,
2872 .mask = &match->wc.masks,
2873 .support = dp_netdev_support,
2874 };
2875
2876 ofpbuf_init(&key_buf, 0);
2877 ofpbuf_init(&mask_buf, 0);
2878
2879 odp_flow_key_from_flow(&odp_parms, &key_buf);
2880 odp_parms.key_buf = &key_buf;
2881 odp_flow_key_from_mask(&odp_parms, &mask_buf);
2882
2883 ds_put_cstr(&ds, "flow_add: ");
2884 odp_format_ufid(ufid, &ds);
2885 ds_put_cstr(&ds, " ");
2886 odp_flow_format(key_buf.data, key_buf.size,
2887 mask_buf.data, mask_buf.size,
2888 NULL, &ds, false);
2889 ds_put_cstr(&ds, ", actions:");
2890 format_odp_actions(&ds, actions, actions_len, NULL);
2891
2892 VLOG_DBG("%s", ds_cstr(&ds));
2893
2894 ofpbuf_uninit(&key_buf);
2895 ofpbuf_uninit(&mask_buf);
2896
2897 /* Add a printout of the actual match installed. */
2898 struct match m;
2899 ds_clear(&ds);
2900 ds_put_cstr(&ds, "flow match: ");
2901 miniflow_expand(&flow->cr.flow.mf, &m.flow);
2902 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
2903 memset(&m.tun_md, 0, sizeof m.tun_md);
2904 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
2905
2906 VLOG_DBG("%s", ds_cstr(&ds));
2907
2908 ds_destroy(&ds);
2909 }
2910
2911 return flow;
2912 }
2913
2914 static int
2915 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
2916 struct netdev_flow_key *key,
2917 struct match *match,
2918 ovs_u128 *ufid,
2919 const struct dpif_flow_put *put,
2920 struct dpif_flow_stats *stats)
2921 {
2922 struct dp_netdev_flow *netdev_flow;
2923 int error = 0;
2924
2925 if (stats) {
2926 memset(stats, 0, sizeof *stats);
2927 }
2928
2929 ovs_mutex_lock(&pmd->flow_mutex);
2930 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
2931 if (!netdev_flow) {
2932 if (put->flags & DPIF_FP_CREATE) {
2933 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2934 dp_netdev_flow_add(pmd, match, ufid, put->actions,
2935 put->actions_len);
2936 error = 0;
2937 } else {
2938 error = EFBIG;
2939 }
2940 } else {
2941 error = ENOENT;
2942 }
2943 } else {
2944 if (put->flags & DPIF_FP_MODIFY) {
2945 struct dp_netdev_actions *new_actions;
2946 struct dp_netdev_actions *old_actions;
2947 odp_port_t in_port = netdev_flow->flow.in_port.odp_port;
2948
2949 new_actions = dp_netdev_actions_create(put->actions,
2950 put->actions_len);
2951
2952 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2953 ovsrcu_set(&netdev_flow->actions, new_actions);
2954
2955 try_netdev_flow_put(pmd, in_port, netdev_flow, match,
2956 put->actions, put->actions_len);
2957
2958 if (stats) {
2959 get_dpif_flow_stats(netdev_flow, stats);
2960 }
2961 if (put->flags & DPIF_FP_ZERO_STATS) {
2962 /* XXX: The userspace datapath uses thread local statistics
2963 * (for flows), which should be updated only by the owning
2964 * thread. Since we cannot write on stats memory here,
2965 * we choose not to support this flag. Please note:
2966 * - This feature is currently used only by dpctl commands with
2967 * option --clear.
2968 * - Should the need arise, this operation can be implemented
2969 * by keeping a base value (to be updated here) for each
2970 * counter, and subtracting it before outputting the stats */
2971 error = EOPNOTSUPP;
2972 }
2973
2974 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2975 } else if (put->flags & DPIF_FP_CREATE) {
2976 error = EEXIST;
2977 } else {
2978 /* Overlapping flow. */
2979 error = EINVAL;
2980 }
2981 }
2982 ovs_mutex_unlock(&pmd->flow_mutex);
2983 return error;
2984 }
2985
2986 static int
2987 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2988 {
2989 struct dp_netdev *dp = get_dp_netdev(dpif);
2990 struct netdev_flow_key key, mask;
2991 struct dp_netdev_pmd_thread *pmd;
2992 struct match match;
2993 ovs_u128 ufid;
2994 int error;
2995 bool probe = put->flags & DPIF_FP_PROBE;
2996
2997 if (put->stats) {
2998 memset(put->stats, 0, sizeof *put->stats);
2999 }
3000 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3001 probe);
3002 if (error) {
3003 return error;
3004 }
3005 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3006 put->mask, put->mask_len,
3007 &match.flow, &match.wc, probe);
3008 if (error) {
3009 return error;
3010 }
3011
3012 if (put->ufid) {
3013 ufid = *put->ufid;
3014 } else {
3015 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3016 }
3017
3018 /* Must produce a netdev_flow_key for lookup.
3019 * Use the same method as employed to create the key when adding
3020 * the flow to the dpcls to make sure they match. */
3021 netdev_flow_mask_init(&mask, &match);
3022 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3023
3024 if (put->pmd_id == PMD_ID_NULL) {
3025 if (cmap_count(&dp->poll_threads) == 0) {
3026 return EINVAL;
3027 }
3028 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3029 struct dpif_flow_stats pmd_stats;
3030 int pmd_error;
3031
3032 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3033 &pmd_stats);
3034 if (pmd_error) {
3035 error = pmd_error;
3036 } else if (put->stats) {
3037 put->stats->n_packets += pmd_stats.n_packets;
3038 put->stats->n_bytes += pmd_stats.n_bytes;
3039 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3040 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3041 }
3042 }
3043 } else {
3044 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3045 if (!pmd) {
3046 return EINVAL;
3047 }
3048 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3049 dp_netdev_pmd_unref(pmd);
3050 }
3051
3052 return error;
3053 }
3054
3055 static int
3056 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3057 struct dpif_flow_stats *stats,
3058 const struct dpif_flow_del *del)
3059 {
3060 struct dp_netdev_flow *netdev_flow;
3061 int error = 0;
3062
3063 ovs_mutex_lock(&pmd->flow_mutex);
3064 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3065 del->key_len);
3066 if (netdev_flow) {
3067 if (stats) {
3068 get_dpif_flow_stats(netdev_flow, stats);
3069 }
3070 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3071 } else {
3072 error = ENOENT;
3073 }
3074 ovs_mutex_unlock(&pmd->flow_mutex);
3075
3076 return error;
3077 }
3078
3079 static int
3080 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3081 {
3082 struct dp_netdev *dp = get_dp_netdev(dpif);
3083 struct dp_netdev_pmd_thread *pmd;
3084 int error = 0;
3085
3086 if (del->stats) {
3087 memset(del->stats, 0, sizeof *del->stats);
3088 }
3089
3090 if (del->pmd_id == PMD_ID_NULL) {
3091 if (cmap_count(&dp->poll_threads) == 0) {
3092 return EINVAL;
3093 }
3094 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3095 struct dpif_flow_stats pmd_stats;
3096 int pmd_error;
3097
3098 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3099 if (pmd_error) {
3100 error = pmd_error;
3101 } else if (del->stats) {
3102 del->stats->n_packets += pmd_stats.n_packets;
3103 del->stats->n_bytes += pmd_stats.n_bytes;
3104 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3105 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3106 }
3107 }
3108 } else {
3109 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3110 if (!pmd) {
3111 return EINVAL;
3112 }
3113 error = flow_del_on_pmd(pmd, del->stats, del);
3114 dp_netdev_pmd_unref(pmd);
3115 }
3116
3117
3118 return error;
3119 }
3120
3121 struct dpif_netdev_flow_dump {
3122 struct dpif_flow_dump up;
3123 struct cmap_position poll_thread_pos;
3124 struct cmap_position flow_pos;
3125 struct dp_netdev_pmd_thread *cur_pmd;
3126 int status;
3127 struct ovs_mutex mutex;
3128 };
3129
3130 static struct dpif_netdev_flow_dump *
3131 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3132 {
3133 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3134 }
3135
3136 static struct dpif_flow_dump *
3137 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3138 char *type OVS_UNUSED)
3139 {
3140 struct dpif_netdev_flow_dump *dump;
3141
3142 dump = xzalloc(sizeof *dump);
3143 dpif_flow_dump_init(&dump->up, dpif_);
3144 dump->up.terse = terse;
3145 ovs_mutex_init(&dump->mutex);
3146
3147 return &dump->up;
3148 }
3149
3150 static int
3151 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3152 {
3153 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3154
3155 ovs_mutex_destroy(&dump->mutex);
3156 free(dump);
3157 return 0;
3158 }
3159
3160 struct dpif_netdev_flow_dump_thread {
3161 struct dpif_flow_dump_thread up;
3162 struct dpif_netdev_flow_dump *dump;
3163 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3164 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3165 };
3166
3167 static struct dpif_netdev_flow_dump_thread *
3168 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3169 {
3170 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3171 }
3172
3173 static struct dpif_flow_dump_thread *
3174 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3175 {
3176 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3177 struct dpif_netdev_flow_dump_thread *thread;
3178
3179 thread = xmalloc(sizeof *thread);
3180 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3181 thread->dump = dump;
3182 return &thread->up;
3183 }
3184
3185 static void
3186 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3187 {
3188 struct dpif_netdev_flow_dump_thread *thread
3189 = dpif_netdev_flow_dump_thread_cast(thread_);
3190
3191 free(thread);
3192 }
3193
3194 static int
3195 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3196 struct dpif_flow *flows, int max_flows)
3197 {
3198 struct dpif_netdev_flow_dump_thread *thread
3199 = dpif_netdev_flow_dump_thread_cast(thread_);
3200 struct dpif_netdev_flow_dump *dump = thread->dump;
3201 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3202 int n_flows = 0;
3203 int i;
3204
3205 ovs_mutex_lock(&dump->mutex);
3206 if (!dump->status) {
3207 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3208 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3209 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3210 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3211
3212 /* On the first call to dump_next(), extract the first pmd thread.
3213 * If there is no pmd thread, return immediately. */
3214 if (!pmd) {
3215 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3216 if (!pmd) {
3217 ovs_mutex_unlock(&dump->mutex);
3218 return n_flows;
3220 }
3221 }
3222
3223 do {
3224 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3225 struct cmap_node *node;
3226
3227 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3228 if (!node) {
3229 break;
3230 }
3231 netdev_flows[n_flows] = CONTAINER_OF(node,
3232 struct dp_netdev_flow,
3233 node);
3234 }
3235 /* When we finish dumping the current pmd thread, move on to
3236 * the next one. */
3237 if (n_flows < flow_limit) {
3238 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3239 dp_netdev_pmd_unref(pmd);
3240 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3241 if (!pmd) {
3242 dump->status = EOF;
3243 break;
3244 }
3245 }
3246 /* Keeps the reference for the next caller. */
3247 dump->cur_pmd = pmd;
3248
3249 /* If the current dump is empty, do not exit the loop, since the
3250 * remaining pmds could have flows to be dumped. Just dump again
3251 * on the new 'pmd'. */
3252 } while (!n_flows);
3253 }
3254 ovs_mutex_unlock(&dump->mutex);
3255
3256 for (i = 0; i < n_flows; i++) {
3257 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3258 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3259 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3260 struct dpif_flow *f = &flows[i];
3261 struct ofpbuf key, mask;
3262
3263 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3264 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3265 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
3266 dump->up.terse);
3267 }
3268
3269 return n_flows;
3270 }
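
/* Illustrative usage sketch of the flow dump interface implemented above
 * (normally driven through the generic dpif layer): create a dump, create
 * a per-thread cursor, and call dump_next() until it reports zero flows.
 * The functions called are the real ones from this file; the helper
 * itself is hypothetical. */
static void
flow_dump_usage_example(struct dpif *dpif)
{
    struct dpif_flow flows[FLOW_DUMP_MAX_BATCH];
    struct dpif_flow_dump *dump;
    struct dpif_flow_dump_thread *thread;

    dump = dpif_netdev_flow_dump_create(dpif, false, NULL);
    thread = dpif_netdev_flow_dump_thread_create(dump);
    while (dpif_netdev_flow_dump_next(thread, flows, FLOW_DUMP_MAX_BATCH)) {
        /* Each 'flows[i]' borrows the thread's key/mask buffers, so it
         * must be consumed before the next dump_next() call. */
    }
    dpif_netdev_flow_dump_thread_destroy(thread);
    dpif_netdev_flow_dump_destroy(dump);
}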
3271
3272 static int
3273 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3274 OVS_NO_THREAD_SAFETY_ANALYSIS
3275 {
3276 struct dp_netdev *dp = get_dp_netdev(dpif);
3277 struct dp_netdev_pmd_thread *pmd;
3278 struct dp_packet_batch pp;
3279
3280 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3281 dp_packet_size(execute->packet) > UINT16_MAX) {
3282 return EINVAL;
3283 }
3284
3285 /* Tries finding the 'pmd'. If NULL is returned, that means
3286 * the current thread is a non-pmd thread and should use
3287 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3288 pmd = ovsthread_getspecific(dp->per_pmd_key);
3289 if (!pmd) {
3290 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3291 if (!pmd) {
3292 return EBUSY;
3293 }
3294 }
3295
3296 if (execute->probe) {
3297 /* If this is part of a probe, drop the packet, since executing
3298 * the action may actually cause spurious packets to be sent into
3299 * the network. */
3300 if (pmd->core_id == NON_PMD_CORE_ID) {
3301 dp_netdev_pmd_unref(pmd);
3302 }
3303 return 0;
3304 }
3305
3306 /* If the current thread is a non-pmd thread, acquire
3307 * the 'non_pmd_mutex'. */
3308 if (pmd->core_id == NON_PMD_CORE_ID) {
3309 ovs_mutex_lock(&dp->non_pmd_mutex);
3310 }
3311
3312 /* Update current time in PMD context. */
3313 pmd_thread_ctx_time_update(pmd);
3314
3315 /* The action processing expects the RSS hash to be valid, because
3316 * it's always initialized at the beginning of datapath processing.
3317 * In this case, though, 'execute->packet' may not have gone through
3318 * the datapath at all, it may have been generated by the upper layer
3319 * (OpenFlow packet-out, BFD frame, ...). */
3320 if (!dp_packet_rss_valid(execute->packet)) {
3321 dp_packet_set_rss_hash(execute->packet,
3322 flow_hash_5tuple(execute->flow, 0));
3323 }
3324
3325 dp_packet_batch_init_packet(&pp, execute->packet);
3326 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3327 execute->actions, execute->actions_len);
3328 dp_netdev_pmd_flush_output_packets(pmd, true);
3329
3330 if (pmd->core_id == NON_PMD_CORE_ID) {
3331 ovs_mutex_unlock(&dp->non_pmd_mutex);
3332 dp_netdev_pmd_unref(pmd);
3333 }
3334
3335 return 0;
3336 }
3337
3338 static void
3339 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
3340 {
3341 size_t i;
3342
3343 for (i = 0; i < n_ops; i++) {
3344 struct dpif_op *op = ops[i];
3345
3346 switch (op->type) {
3347 case DPIF_OP_FLOW_PUT:
3348 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3349 break;
3350
3351 case DPIF_OP_FLOW_DEL:
3352 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3353 break;
3354
3355 case DPIF_OP_EXECUTE:
3356 op->error = dpif_netdev_execute(dpif, &op->execute);
3357 break;
3358
3359 case DPIF_OP_FLOW_GET:
3360 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3361 break;
3362 }
3363 }
3364 }
3365
3366 /* Applies datapath configuration from the database. Some of the changes are
3367 * actually applied in dpif_netdev_run(). */
3368 static int
3369 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3370 {
3371 struct dp_netdev *dp = get_dp_netdev(dpif);
3372 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3373 unsigned long long insert_prob =
3374 smap_get_ullong(other_config, "emc-insert-inv-prob",
3375 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3376 uint32_t insert_min, cur_min;
3377 uint32_t tx_flush_interval, cur_tx_flush_interval;
3378
3379 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3380 DEFAULT_TX_FLUSH_INTERVAL);
3381 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3382 if (tx_flush_interval != cur_tx_flush_interval) {
3383 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3384 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3385 tx_flush_interval);
3386 }
3387
3388 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3389 free(dp->pmd_cmask);
3390 dp->pmd_cmask = nullable_xstrdup(cmask);
3391 dp_netdev_request_reconfigure(dp);
3392 }
3393
3394 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3395 if (insert_prob <= UINT32_MAX) {
3396 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3397 } else {
3398 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3399 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3400 }
3401
3402 if (insert_min != cur_min) {
3403 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3404 if (insert_min == 0) {
3405 VLOG_INFO("EMC has been disabled");
3406 } else {
3407 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3408 insert_prob, (100 / (float)insert_prob));
3409 }
3410 }
3411
3412 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3413 bool cur_perf_enabled;
3414 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3415 if (perf_enabled != cur_perf_enabled) {
3416 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3417 if (perf_enabled) {
3418 VLOG_INFO("PMD performance metrics collection enabled");
3419 } else {
3420 VLOG_INFO("PMD performance metrics collection disabled");
3421 }
3422 }
3423
3424 return 0;
3425 }
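
/* Illustrative sketch: the keys read above normally come from the
 * other_config column of the Open_vSwitch table.  A hand-built smap with
 * the same keys would look like this; the values are arbitrary examples,
 * and the helper is hypothetical. */
static void
set_config_usage_example(struct dpif *dpif)
{
    struct smap other_config = SMAP_INITIALIZER(&other_config);

    smap_add(&other_config, "pmd-cpu-mask", "0x6");
    smap_add(&other_config, "emc-insert-inv-prob", "50");
    smap_add(&other_config, "tx-flush-interval", "50");
    smap_add(&other_config, "pmd-perf-metrics", "true");
    dpif_netdev_set_config(dpif, &other_config);
    smap_destroy(&other_config);
}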
3426
3427 /* Parses affinity list and returns result in 'core_ids'. */
3428 static int
3429 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3430 {
3431 unsigned i;
3432 char *list, *copy, *key, *value;
3433 int error = 0;
3434
3435 for (i = 0; i < n_rxq; i++) {
3436 core_ids[i] = OVS_CORE_UNSPEC;
3437 }
3438
3439 if (!affinity_list) {
3440 return 0;
3441 }
3442
3443 list = copy = xstrdup(affinity_list);
3444
3445 while (ofputil_parse_key_value(&list, &key, &value)) {
3446 int rxq_id, core_id;
3447
3448 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3449 || !str_to_int(value, 0, &core_id) || core_id < 0) {
3450 error = EINVAL;
3451 break;
3452 }
3453
3454 if (rxq_id < n_rxq) {
3455 core_ids[rxq_id] = core_id;
3456 }
3457 }
3458
3459 free(copy);
3460 return error;
3461 }
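
/* Illustrative usage of parse_affinity_list(): the list uses the same
 * "<rxq>:<core>" pairs as the pmd-rxq-affinity option, e.g. "0:3,1:7"
 * pins rx queue 0 to core 3 and rx queue 1 to core 7, and any unlisted
 * queue is left as OVS_CORE_UNSPEC.  The helper is hypothetical. */
static void
affinity_list_example(void)
{
    unsigned core_ids[2];

    if (!parse_affinity_list("0:3,1:7", core_ids, 2)) {
        ovs_assert(core_ids[0] == 3 && core_ids[1] == 7);
    }
}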
3462
3463 /* Parses 'affinity_list' and applies configuration if it is valid. */
3464 static int
3465 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3466 const char *affinity_list)
3467 {
3468 unsigned *core_ids, i;
3469 int error = 0;
3470
3471 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
3472 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
3473 error = EINVAL;
3474 goto exit;
3475 }
3476
3477 for (i = 0; i < port->n_rxq; i++) {
3478 port->rxqs[i].core_id = core_ids[i];
3479 }
3480
3481 exit:
3482 free(core_ids);
3483 return error;
3484 }
3485
3486 /* Changes the affinity of port's rx queues. The changes are actually applied
3487 * in dpif_netdev_run(). */
3488 static int
3489 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
3490 const struct smap *cfg)
3491 {
3492 struct dp_netdev *dp = get_dp_netdev(dpif);
3493 struct dp_netdev_port *port;
3494 int error = 0;
3495 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
3496
3497 ovs_mutex_lock(&dp->port_mutex);
3498 error = get_port_by_number(dp, port_no, &port);
3499 if (error || !netdev_is_pmd(port->netdev)
3500 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
3501 goto unlock;
3502 }
3503
3504 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
3505 if (error) {
3506 goto unlock;
3507 }
3508 free(port->rxq_affinity_list);
3509 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
3510
3511 dp_netdev_request_reconfigure(dp);
3512 unlock:
3513 ovs_mutex_unlock(&dp->port_mutex);
3514 return error;
3515 }
3516
3517 static int
3518 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
3519 uint32_t queue_id, uint32_t *priority)
3520 {
3521 *priority = queue_id;
3522 return 0;
3523 }
3524
3525 \f
3526 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
3527 * a copy of the 'size' bytes of the 'actions' input parameter. */
3528 struct dp_netdev_actions *
3529 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
3530 {
3531 struct dp_netdev_actions *netdev_actions;
3532
3533 netdev_actions = xmalloc(sizeof *netdev_actions + size);
3534 memcpy(netdev_actions->actions, actions, size);
3535 netdev_actions->size = size;
3536
3537 return netdev_actions;
3538 }
3539
3540 struct dp_netdev_actions *
3541 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
3542 {
3543 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
3544 }
3545
3546 static void
3547 dp_netdev_actions_free(struct dp_netdev_actions *actions)
3548 {
3549 free(actions);
3550 }
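
/* Illustrative sketch of the RCU update pattern used for a flow's actions
 * (see flow_put_on_pmd() earlier in this file): readers fetch the current
 * actions with dp_netdev_flow_get_actions() without any lock, while a
 * writer publishes a new copy and defers freeing the old one until all
 * current readers have quiesced.  The helper below is hypothetical. */
static void
flow_actions_update_example(struct dp_netdev_flow *flow,
                            const struct nlattr *actions, size_t size)
{
    struct dp_netdev_actions *new_actions;
    struct dp_netdev_actions *old_actions;

    new_actions = dp_netdev_actions_create(actions, size);
    old_actions = dp_netdev_flow_get_actions(flow);
    ovsrcu_set(&flow->actions, new_actions);
    ovsrcu_postpone(dp_netdev_actions_free, old_actions);
}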
3551 \f
3552 static void
3553 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
3554 enum rxq_cycles_counter_type type,
3555 unsigned long long cycles)
3556 {
3557 atomic_store_relaxed(&rx->cycles[type], cycles);
3558 }
3559
3560 static void
3561 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
3562 enum rxq_cycles_counter_type type,
3563 unsigned long long cycles)
3564 {
3565 non_atomic_ullong_add(&rx->cycles[type], cycles);
3566 }
3567
3568 static uint64_t
3569 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
3570 enum rxq_cycles_counter_type type)
3571 {
3572 unsigned long long processing_cycles;
3573 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
3574 return processing_cycles;
3575 }
3576
3577 static void
3578 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
3579 unsigned long long cycles)
3580 {
3581 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
3582 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
3583 }
3584
3585 static uint64_t
3586 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
3587 {
3588 unsigned long long processing_cycles;
3589 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
3590 return processing_cycles;
3591 }
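
/* Illustrative sketch (hypothetical helper): a consumer of the
 * per-interval samples stored above, such as rx queue to PMD assignment,
 * can estimate an rx queue's recent load by summing the recorded
 * intervals.  PMD_RXQ_INTERVAL_MAX is the number of intervals kept,
 * defined earlier in this file. */
static uint64_t
rxq_window_cycles_example(struct dp_netdev_rxq *rxq)
{
    uint64_t total = 0;
    unsigned i;

    for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
        total += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
    }

    return total;
}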
3592
3593 #if ATOMIC_ALWAYS_LOCK_FREE_8B
3594 static inline bool
3595 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
3596 {
3597 bool pmd_perf_enabled;
3598 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
3599 return pmd_perf_enabled;
3600 }
3601 #else
3602 /* If stores and reads of 64-bit integers are not atomic, the full PMD
3603 * performance metrics are not available as locked access to 64 bit
3604 * integers would be prohibitively expensive. */
3605 static inline bool
3606 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
3607 {
3608 return false;
3609 }
3610 #endif
3611
3612 static int
3613 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
3614 struct tx_port *p)
3615 {
3616 int i;
3617 int tx_qid;
3618 int output_cnt;
3619 bool dynamic_txqs;
3620 struct cycle_timer timer;
3621 uint64_t cycles;
3622 uint32_t tx_flush_interval;
3623
3624 cycle_timer_start(&pmd->perf_stats, &timer);
3625
3626 dynamic_txqs = p->port->dynamic_txqs;
3627 if (dynamic_txqs) {
3628 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
3629 } else {
3630 tx_qid = pmd->static_tx_qid;
3631 }
3632
3633 output_cnt = dp_packet_batch_size(&p->output_pkts);
3634 ovs_assert(output_cnt > 0);
3635
3636 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
3637 dp_packet_batch_init(&p->output_pkts);
3638
3639 /* Update time of the next flush. */
3640 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
3641 p->flush_time = pmd->ctx.now + tx_flush_interval;
3642
3643 ovs_assert(pmd->n_output_batches > 0);
3644 pmd->n_output_batches--;
3645
3646 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
3647 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
3648
3649 /* Distribute send cycles evenly among transmitted packets and assign to
3650 * their respective rx queues. */
3651 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
3652 for (i = 0; i < output_cnt; i++) {
3653 if (p->output_pkts_rxqs[i]) {
3654 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
3655 RXQ_CYCLES_PROC_CURR, cycles);
3656 }
3657 }
3658
3659 return output_cnt;
3660 }
3661
3662 static int
3663 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
3664 bool force)
3665 {
3666 struct tx_port *p;
3667 int output_cnt = 0;
3668
3669 if (!pmd->n_output_batches) {
3670 return 0;
3671 }
3672
3673 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
3674 if (!dp_packet_batch_is_empty(&p->output_pkts)
3675 && (force || pmd->ctx.now >= p->flush_time)) {
3676 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
3677 }
3678 }
3679 return output_cnt;
3680 }
3681
3682 static int
3683 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
3684 struct dp_netdev_rxq *rxq,
3685 odp_port_t port_no)
3686 {
3687 struct pmd_perf_stats *s = &pmd->perf_stats;
3688 struct dp_packet_batch batch;
3689 struct cycle_timer timer;
3690 int error;
3691 int batch_cnt = 0;
3692 int rem_qlen = 0, *qlen_p = NULL;
3693 uint64_t cycles;
3694
3695 /* Measure duration for polling and processing rx burst. */
3696 cycle_timer_start(&pmd->perf_stats, &timer);
3697
3698 pmd->ctx.last_rxq = rxq;
3699 dp_packet_batch_init(&batch);
3700
3701 /* Fetch the rx queue length only for vhostuser ports. */
3702 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
3703 qlen_p = &rem_qlen;
3704 }
3705
3706 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
3707 if (!error) {
3708 /* At least one packet received. */
3709 *recirc_depth_get() = 0;
3710 pmd_thread_ctx_time_update(pmd);
3711 batch_cnt = batch.count;
3712 if (pmd_perf_metrics_enabled(pmd)) {
3713 /* Update batch histogram. */
3714 s->current.batches++;
3715 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
3716 /* Update the maximum vhost rx queue fill level. */
3717 if (rxq->is_vhost && rem_qlen >= 0) {
3718 uint32_t qfill = batch_cnt + rem_qlen;
3719 if (qfill > s->current.max_vhost_qfill) {
3720 s->current.max_vhost_qfill = qfill;
3721 }
3722 }
3723 }
3724 /* Process packet batch. */
3725 dp_netdev_input(pmd, &batch, port_no);
3726
3727 /* Assign processing cycles to rx queue. */
3728 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
3729 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
3730
3731 dp_netdev_pmd_flush_output_packets(pmd, false);
3732 } else {
3733 /* Discard cycles. */
3734 cycle_timer_stop(&pmd->perf_stats, &timer);
3735 if (error != EAGAIN && error != EOPNOTSUPP) {
3736 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3737
3738 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3739 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
3740 }
3741 }
3742
3743 pmd->ctx.last_rxq = NULL;
3744
3745 return batch_cnt;
3746 }
3747
3748 static struct tx_port *
3749 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3750 {
3751 struct tx_port *tx;
3752
3753 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
3754 if (tx->port->port_no == port_no) {
3755 return tx;
3756 }
3757 }
3758
3759 return NULL;
3760 }
3761
3762 static int
3763 port_reconfigure(struct dp_netdev_port *port)
3764 {
3765 struct netdev *netdev = port->netdev;
3766 int i, err;
3767
3768 /* Closes the existing 'rxq's. */
3769 for (i = 0; i < port->n_rxq; i++) {
3770 netdev_rxq_close(port->rxqs[i].rx);
3771 port->rxqs[i].rx = NULL;
3772 }
3773 unsigned last_nrxq = port->n_rxq;
3774 port->n_rxq = 0;
3775
3776 /* Allows 'netdev' to apply the pending configuration changes. */
3777 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
3778 err = netdev_reconfigure(netdev);
3779 if (err && (err != EOPNOTSUPP)) {
3780 VLOG_ERR("Failed to set interface %s new configuration",
3781 netdev_get_name(netdev));
3782 return err;
3783 }
3784 }
3785 /* If the netdev_reconfigure() above succeeded, reopen the 'rxq's. */
3786 port->rxqs = xrealloc(port->rxqs,
3787 sizeof *port->rxqs * netdev_n_rxq(netdev));
3788 /* Realloc 'used' counters for tx queues. */
3789 free(port->txq_used);
3790 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
3791
3792 for (i = 0; i < netdev_n_rxq(netdev); i++) {
3793 bool new_queue = i >= last_nrxq;
3794 if (new_queue) {
3795 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
3796 }
3797
3798 port->rxqs[i].port = port;
3799 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
3800
3801 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
3802 if (err) {
3803 return err;
3804 }
3805 port->n_rxq++;
3806 }
3807
3808 /* Parse affinity list to apply configuration for new queues. */
3809 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
3810
3811 /* Reconfiguration succeeded: mark the port as such so it can be used. */
3812 port->need_reconfigure = false;
3813
3814 return 0;
3815 }
3816
3817 struct rr_numa_list {
3818 struct hmap numas; /* Contains 'struct rr_numa' */
3819 };
3820
3821 struct rr_numa {
3822 struct hmap_node node;
3823
3824 int numa_id;
3825
3826 /* Non isolated pmds on numa node 'numa_id' */
3827 struct dp_netdev_pmd_thread **pmds;
3828 int n_pmds;
3829
3830 int cur_index;
3831 bool idx_inc;
3832 };
3833
3834 static struct rr_numa *
3835 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
3836 {
3837 struct rr_numa *numa;
3838
3839 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
3840 if (numa->numa_id == numa_id) {
3841 return numa;
3842 }
3843 }
3844
3845 return NULL;
3846 }
3847
3848 /* Returns the next node in the numa list following 'numa' in round-robin
3849 * fashion. Returns the first node if 'numa' is a null pointer or the last
3850 * node in 'rr'. Returns NULL if the 'rr' numa list is empty. */
3851 static struct rr_numa *
3852 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
3853 {
3854 struct hmap_node *node = NULL;
3855
3856 if (numa) {
3857 node = hmap_next(&rr->numas, &numa->node);
3858 }
3859 if (!node) {
3860 node = hmap_first(&rr->numas);
3861 }
3862
3863 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
3864 }
3865
3866 static void
3867 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
3868 {
3869 struct dp_netdev_pmd_thread *pmd;
3870 struct rr_numa *numa;
3871
3872 hmap_init(&rr->numas);
3873
3874 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3875 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3876 continue;
3877 }
3878
3879 numa = rr_numa_list_lookup(rr, pmd->numa_id);
3880 if (!numa) {
3881 numa = xzalloc(sizeof *numa);
3882 numa->numa_id = pmd->numa_id;
3883 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
3884 }
3885 numa->n_pmds++;
3886 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
3887 numa->pmds[numa->n_pmds - 1] = pmd;
3888 /* At least one pmd, so initialize 'cur_index' and 'idx_inc'. */
3889 numa->cur_index = 0;
3890 numa->idx_inc = true;
3891 }
3892 }
3893
3894 /* Returns the next pmd from the numa node in
3895 * incrementing or decrementing order. */
3896 static struct dp_netdev_pmd_thread *
3897 rr_numa_get_pmd(struct rr_numa *numa)
3898 {
3899 int numa_idx = numa->cur_index;
3900
3901 if (numa->idx_inc == true) {
3902 /* Incrementing through list of pmds. */
3903 if (numa->cur_index == numa->n_pmds-1) {
3904 /* Reached the last pmd. */
3905 numa->idx_inc = false;
3906 } else {
3907 numa->cur_index++;
3908 }
3909 } else {
3910 /* Decrementing through list of pmds. */
3911 if (numa->cur_index == 0) {
3912 /* Reached the first pmd. */
3913 numa->idx_inc = true;
3914 } else {
3915 numa->cur_index--;
3916 }
3917 }
3918 return numa->pmds[numa_idx];
3919 }
3920
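/* Editor's note (illustrative sketch, not part of the original file):
 * rr_numa_get_pmd() walks the pmd array back and forth rather than wrapping,
 * so with 3 pmds the returned indexes are 0, 1, 2, 2, 1, 0, 0, 1, ...  The
 * standalone (hypothetical) helper below reproduces that index pattern
 * without the surrounding structures; 'out_idx' must hold 'n_queues' ints. */
static inline void
example_rr_index_pattern(int n_pmds, int n_queues, int *out_idx)
{
    int cur = 0;
    bool inc = true;

    for (int q = 0; q < n_queues; q++) {
        out_idx[q] = cur;            /* Record the index used for queue 'q'. */
        if (inc) {
            if (cur == n_pmds - 1) {
                inc = false;         /* Reached the last pmd; turn around. */
            } else {
                cur++;
            }
        } else {
            if (cur == 0) {
                inc = true;          /* Reached the first pmd; turn around. */
            } else {
                cur--;
            }
        }
    }
}
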
3921 static void
3922 rr_numa_list_destroy(struct rr_numa_list *rr)
3923 {
3924 struct rr_numa *numa;
3925
3926 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
3927 free(numa->pmds);
3928 free(numa);
3929 }
3930 hmap_destroy(&rr->numas);
3931 }
3932
3933 /* Sort Rx Queues by the processing cycles they are consuming. */
3934 static int
3935 compare_rxq_cycles(const void *a, const void *b)
3936 {
3937 struct dp_netdev_rxq *qa;
3938 struct dp_netdev_rxq *qb;
3939 uint64_t cycles_qa, cycles_qb;
3940
3941 qa = *(struct dp_netdev_rxq **) a;
3942 qb = *(struct dp_netdev_rxq **) b;
3943
3944 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
3945 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
3946
3947 if (cycles_qa != cycles_qb) {
3948 return (cycles_qa < cycles_qb) ? 1 : -1;
3949 } else {
3950 /* Cycles are the same so tiebreak on port/queue id.
3951 * Tiebreaking (as opposed to return 0) ensures consistent
3952 * sort results across multiple OS's. */
3953 uint32_t port_qa = odp_to_u32(qa->port->port_no);
3954 uint32_t port_qb = odp_to_u32(qb->port->port_no);
3955 if (port_qa != port_qb) {
3956 return port_qa > port_qb ? 1 : -1;
3957 } else {
3958 return netdev_rxq_get_queue_id(qa->rx)
3959 - netdev_rxq_get_queue_id(qb->rx);
3960 }
3961 }
3962 }
3963
3964 /* Assigns pmds to queues. If 'pinned' is true, assigns pmds to pinned
3965 * queues and marks those pmds as isolated. Otherwise, assigns non-isolated
3966 * pmds to unpinned queues.
3967 *
3968 * If 'pinned' is false queues will be sorted by processing cycles they are
3969 * consuming and then assigned to pmds in round robin order.
3970 *
3971 * The function doesn't touch the pmd threads, it just stores the assignment
3972 * in the 'pmd' member of each rxq. */
3973 static void
3974 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
3975 {
3976 struct dp_netdev_port *port;
3977 struct rr_numa_list rr;
3978 struct rr_numa *non_local_numa = NULL;
3979 struct dp_netdev_rxq ** rxqs = NULL;
3980 int n_rxqs = 0;
3981 struct rr_numa *numa = NULL;
3982 int numa_id;
3983
3984 HMAP_FOR_EACH (port, node, &dp->ports) {
3985 if (!netdev_is_pmd(port->netdev)) {
3986 continue;
3987 }
3988
3989 for (int qid = 0; qid < port->n_rxq; qid++) {
3990 struct dp_netdev_rxq *q = &port->rxqs[qid];
3991
3992 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
3993 struct dp_netdev_pmd_thread *pmd;
3994
3995 pmd = dp_netdev_get_pmd(dp, q->core_id);
3996 if (!pmd) {
3997 VLOG_WARN("There is no PMD thread on core %d. Queue "
3998 "%d on port \'%s\' will not be polled.",
3999 q->core_id, qid, netdev_get_name(port->netdev));
4000 } else {
4001 q->pmd = pmd;
4002 pmd->isolated = true;
4003 dp_netdev_pmd_unref(pmd);
4004 }
4005 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4006 uint64_t cycle_hist = 0;
4007
4008 if (n_rxqs == 0) {
4009 rxqs = xmalloc(sizeof *rxqs);
4010 } else {
4011 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4012 }
4013 /* Sum the queue intervals and store the cycle history. */
4014 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4015 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4016 }
4017 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist);
4018
4019 /* Store the queue. */
4020 rxqs[n_rxqs++] = q;
4021 }
4022 }
4023 }
4024
4025 if (n_rxqs > 1) {
4026 /* Sort the queues in order of the processing cycles
4027 * they consumed during their last pmd interval. */
4028 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4029 }
4030
4031 rr_numa_list_populate(dp, &rr);
4032 /* Assign the sorted queues to pmds in round robin. */
4033 for (int i = 0; i < n_rxqs; i++) {
4034 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4035 numa = rr_numa_list_lookup(&rr, numa_id);
4036 if (!numa) {
4037 /* There are no pmds on the queue's local NUMA node.
4038 Round robin on the NUMA nodes that do have pmds. */
4039 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4040 if (!non_local_numa) {
4041 VLOG_ERR("There is no available (non-isolated) pmd "
4042 "thread for port \'%s\' queue %d. This queue "
4043 "will not be polled. Is pmd-cpu-mask set to "
4044 "zero? Or are all PMDs isolated to other "
4045 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4046 netdev_rxq_get_queue_id(rxqs[i]->rx));
4047 continue;
4048 }
4049 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa);
4050 VLOG_WARN("There's no available (non-isolated) pmd thread "
4051 "on numa node %d. Queue %d on port \'%s\' will "
4052 "be assigned to the pmd on core %d "
4053 "(numa node %d). Expect reduced performance.",
4054 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4055 netdev_rxq_get_name(rxqs[i]->rx),
4056 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4057 } else {
4058 rxqs[i]->pmd = rr_numa_get_pmd(numa);
4059 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4060 "rx queue %d (measured processing cycles %"PRIu64").",
4061 rxqs[i]->pmd->core_id, numa_id,
4062 netdev_rxq_get_name(rxqs[i]->rx),
4063 netdev_rxq_get_queue_id(rxqs[i]->rx),
4064 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
4065 }
4066 }
4067
4068 rr_numa_list_destroy(&rr);
4069 free(rxqs);
4070 }
4071
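/* Editor's note (illustrative worked example, not part of the original file):
 * With two non-isolated pmds on the local NUMA node and four unpinned queues
 * whose measured cycle histories sort (descending) as q3, q0, q2, q1, the
 * round robin above yields q3 -> pmd A, q0 -> pmd B, q2 -> pmd B, q1 -> pmd A
 * (the back-and-forth order produced by rr_numa_get_pmd()), which keeps the
 * two busiest queues on different pmds. */
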
4072 static void
4073 reload_affected_pmds(struct dp_netdev *dp)
4074 {
4075 struct dp_netdev_pmd_thread *pmd;
4076
4077 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4078 if (pmd->need_reload) {
4079 flow_mark_flush(pmd);
4080 dp_netdev_reload_pmd__(pmd);
4081 pmd->need_reload = false;
4082 }
4083 }
4084 }
4085
4086 static void
4087 reconfigure_pmd_threads(struct dp_netdev *dp)
4088 OVS_REQUIRES(dp->port_mutex)
4089 {
4090 struct dp_netdev_pmd_thread *pmd;
4091 struct ovs_numa_dump *pmd_cores;
4092 struct ovs_numa_info_core *core;
4093 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4094 struct hmapx_node *node;
4095 bool changed = false;
4096 bool need_to_adjust_static_tx_qids = false;
4097
4098 /* The pmd threads should be started only if there's a pmd port in the
4099 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4100 * NR_PMD_THREADS per numa node. */
4101 if (!has_pmd_port(dp)) {
4102 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4103 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4104 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4105 } else {
4106 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4107 }
4108
4109 /* We need to adjust 'static_tx_qid's only if we're reducing the number of
4110 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4111 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4112 /* Adjustment is required to keep 'static_tx_qid's sequential and
4113 * avoid possible issues, for example, imbalanced tx queue usage
4114 * and unnecessary locking caused by remapping on netdev level. */
4115 need_to_adjust_static_tx_qids = true;
4116 }
4117
4118 /* Check for unwanted pmd threads */
4119 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4120 if (pmd->core_id == NON_PMD_CORE_ID) {
4121 continue;
4122 }
4123 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4124 pmd->core_id)) {
4125 hmapx_add(&to_delete, pmd);
4126 } else if (need_to_adjust_static_tx_qids) {
4127 pmd->need_reload = true;
4128 }
4129 }
4130
4131 HMAPX_FOR_EACH (node, &to_delete) {
4132 pmd = (struct dp_netdev_pmd_thread *) node->data;
4133 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4134 pmd->numa_id, pmd->core_id);
4135 dp_netdev_del_pmd(dp, pmd);
4136 }
4137 changed = !hmapx_is_empty(&to_delete);
4138 hmapx_destroy(&to_delete);
4139
4140 if (need_to_adjust_static_tx_qids) {
4141 /* 'static_tx_qid's are not sequential now.
4142 * Reload remaining threads to fix this. */
4143 reload_affected_pmds(dp);
4144 }
4145
4146 /* Check for required new pmd threads */
4147 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4148 pmd = dp_netdev_get_pmd(dp, core->core_id);
4149 if (!pmd) {
4150 pmd = xzalloc(sizeof *pmd);
4151 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4152 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
4153 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4154 pmd->numa_id, pmd->core_id);
4155 changed = true;
4156 } else {
4157 dp_netdev_pmd_unref(pmd);
4158 }
4159 }
4160
4161 if (changed) {
4162 struct ovs_numa_info_numa *numa;
4163
4164 /* Log the number of pmd threads per numa node. */
4165 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4166 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4167 numa->n_cores, numa->numa_id);
4168 }
4169 }
4170
4171 ovs_numa_dump_destroy(pmd_cores);
4172 }
4173
4174 static void
4175 pmd_remove_stale_ports(struct dp_netdev *dp,
4176 struct dp_netdev_pmd_thread *pmd)
4177 OVS_EXCLUDED(pmd->port_mutex)
4178 OVS_REQUIRES(dp->port_mutex)
4179 {
4180 struct rxq_poll *poll, *poll_next;
4181 struct tx_port *tx, *tx_next;
4182
4183 ovs_mutex_lock(&pmd->port_mutex);
4184 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4185 struct dp_netdev_port *port = poll->rxq->port;
4186
4187 if (port->need_reconfigure
4188 || !hmap_contains(&dp->ports, &port->node)) {
4189 dp_netdev_del_rxq_from_pmd(pmd, poll);
4190 }
4191 }
4192 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4193 struct dp_netdev_port *port = tx->port;
4194
4195 if (port->need_reconfigure
4196 || !hmap_contains(&dp->ports, &port->node)) {
4197 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4198 }
4199 }
4200 ovs_mutex_unlock(&pmd->port_mutex);
4201 }
4202
4203 /* Must be called each time a port is added/removed or the cmask changes.
4204 * This creates and destroys pmd threads, reconfigures ports, opens their
4205 * rxqs and assigns all rxqs/txqs to pmd threads. */
4206 static void
4207 reconfigure_datapath(struct dp_netdev *dp)
4208 OVS_REQUIRES(dp->port_mutex)
4209 {
4210 struct dp_netdev_pmd_thread *pmd;
4211 struct dp_netdev_port *port;
4212 int wanted_txqs;
4213
4214 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4215
4216 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4217 * on the system and the user configuration. */
4218 reconfigure_pmd_threads(dp);
4219
4220 wanted_txqs = cmap_count(&dp->poll_threads);
4221
4222 /* The number of pmd threads might have changed, or a port can be new:
4223 * adjust the txqs. */
4224 HMAP_FOR_EACH (port, node, &dp->ports) {
4225 netdev_set_tx_multiq(port->netdev, wanted_txqs);
4226 }
4227
4228 /* Step 2: Remove from the pmd threads ports that have been removed or
4229 * need reconfiguration. */
4230
4231 /* Check for all the ports that need reconfiguration. We cache this in
4232 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4233 * change at any time. */
4234 HMAP_FOR_EACH (port, node, &dp->ports) {
4235 if (netdev_is_reconf_required(port->netdev)) {
4236 port->need_reconfigure = true;
4237 }
4238 }
4239
4240 /* Remove from the pmd threads all the ports that have been deleted or
4241 * need reconfiguration. */
4242 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4243 pmd_remove_stale_ports(dp, pmd);
4244 }
4245
4246 /* Reload affected pmd threads. We must wait for the pmd threads before
4247 * reconfiguring the ports, because a port cannot be reconfigured while
4248 * it's being used. */
4249 reload_affected_pmds(dp);
4250
4251 /* Step 3: Reconfigure ports. */
4252
4253 /* We only reconfigure the ports that we determined above, because they're
4254 * not being used by any pmd thread at the moment. If a port fails to
4255 * reconfigure we remove it from the datapath. */
4256 struct dp_netdev_port *next_port;
4257 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
4258 int err;
4259
4260 if (!port->need_reconfigure) {
4261 continue;
4262 }
4263
4264 err = port_reconfigure(port);
4265 if (err) {
4266 hmap_remove(&dp->ports, &port->node);
4267 seq_change(dp->port_seq);
4268 port_destroy(port);
4269 } else {
4270 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
4271 }
4272 }
4273
4274 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4275 * for now, we just update the 'pmd' pointer in each rxq to point to the
4276 * wanted thread according to the scheduling policy. */
4277
4278 /* Reset all the pmd threads to non isolated. */
4279 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4280 pmd->isolated = false;
4281 }
4282
4283 /* Reset all the queues to unassigned */
4284 HMAP_FOR_EACH (port, node, &dp->ports) {
4285 for (int i = 0; i < port->n_rxq; i++) {
4286 port->rxqs[i].pmd = NULL;
4287 }
4288 }
4289
4290 /* Add pinned queues and mark pmd threads isolated. */
4291 rxq_scheduling(dp, true);
4292
4293 /* Add non-pinned queues. */
4294 rxq_scheduling(dp, false);
4295
4296 /* Step 5: Remove queues not compliant with new scheduling. */
4297 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4298 struct rxq_poll *poll, *poll_next;
4299
4300 ovs_mutex_lock(&pmd->port_mutex);
4301 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4302 if (poll->rxq->pmd != pmd) {
4303 dp_netdev_del_rxq_from_pmd(pmd, poll);
4304 }
4305 }
4306 ovs_mutex_unlock(&pmd->port_mutex);
4307 }
4308
4309 /* Reload affected pmd threads. We must wait for the pmd threads to remove
4310 * the old queues before re-adding them, otherwise a queue can be polled by
4311 * two threads at the same time. */
4312 reload_affected_pmds(dp);
4313
4314 /* Step 6: Add queues from scheduling, if they're not there already. */
4315 HMAP_FOR_EACH (port, node, &dp->ports) {
4316 if (!netdev_is_pmd(port->netdev)) {
4317 continue;
4318 }
4319
4320 for (int qid = 0; qid < port->n_rxq; qid++) {
4321 struct dp_netdev_rxq *q = &port->rxqs[qid];
4322
4323 if (q->pmd) {
4324 ovs_mutex_lock(&q->pmd->port_mutex);
4325 dp_netdev_add_rxq_to_pmd(q->pmd, q);
4326 ovs_mutex_unlock(&q->pmd->port_mutex);
4327 }
4328 }
4329 }
4330
4331 /* Add every port to the tx cache of every pmd thread, if it's not
4332 * there already and if this pmd has at least one rxq to poll. */
4333 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4334 ovs_mutex_lock(&pmd->port_mutex);
4335 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
4336 HMAP_FOR_EACH (port, node, &dp->ports) {
4337 dp_netdev_add_port_tx_to_pmd(pmd, port);
4338 }
4339 }
4340 ovs_mutex_unlock(&pmd->port_mutex);
4341 }
4342
4343 /* Reload affected pmd threads. */
4344 reload_affected_pmds(dp);
4345 }
4346
4347 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
4348 static bool
4349 ports_require_restart(const struct dp_netdev *dp)
4350 OVS_REQUIRES(dp->port_mutex)
4351 {
4352 struct dp_netdev_port *port;
4353
4354 HMAP_FOR_EACH (port, node, &dp->ports) {
4355 if (netdev_is_reconf_required(port->netdev)) {
4356 return true;
4357 }
4358 }
4359
4360 return false;
4361 }
4362
4363 /* Returns true if the datapath flows need to be revalidated. */
4364 static bool
4365 dpif_netdev_run(struct dpif *dpif)
4366 {
4367 struct dp_netdev_port *port;
4368 struct dp_netdev *dp = get_dp_netdev(dpif);
4369 struct dp_netdev_pmd_thread *non_pmd;
4370 uint64_t new_tnl_seq;
4371 bool need_to_flush = true;
4372
4373 ovs_mutex_lock(&dp->port_mutex);
4374 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4375 if (non_pmd) {
4376 ovs_mutex_lock(&dp->non_pmd_mutex);
4377 HMAP_FOR_EACH (port, node, &dp->ports) {
4378 if (!netdev_is_pmd(port->netdev)) {
4379 int i;
4380
4381 for (i = 0; i < port->n_rxq; i++) {
4382 if (dp_netdev_process_rxq_port(non_pmd,
4383 &port->rxqs[i],
4384 port->port_no)) {
4385 need_to_flush = false;
4386 }
4387 }
4388 }
4389 }
4390 if (need_to_flush) {
4391 /* We didn't receive anything in the process loop.
4392 * Check if we need to send something.
4393 * There were no time updates during the current iteration. */
4394 pmd_thread_ctx_time_update(non_pmd);
4395 dp_netdev_pmd_flush_output_packets(non_pmd, false);
4396 }
4397
4398 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
4399 ovs_mutex_unlock(&dp->non_pmd_mutex);
4400
4401 dp_netdev_pmd_unref(non_pmd);
4402 }
4403
4404 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
4405 reconfigure_datapath(dp);
4406 }
4407 ovs_mutex_unlock(&dp->port_mutex);
4408
4409 tnl_neigh_cache_run();
4410 tnl_port_map_run();
4411 new_tnl_seq = seq_read(tnl_conf_seq);
4412
4413 if (dp->last_tnl_conf_seq != new_tnl_seq) {
4414 dp->last_tnl_conf_seq = new_tnl_seq;
4415 return true;
4416 }
4417 return false;
4418 }
4419
4420 static void
4421 dpif_netdev_wait(struct dpif *dpif)
4422 {
4423 struct dp_netdev_port *port;
4424 struct dp_netdev *dp = get_dp_netdev(dpif);
4425
4426 ovs_mutex_lock(&dp_netdev_mutex);
4427 ovs_mutex_lock(&dp->port_mutex);
4428 HMAP_FOR_EACH (port, node, &dp->ports) {
4429 netdev_wait_reconf_required(port->netdev);
4430 if (!netdev_is_pmd(port->netdev)) {
4431 int i;
4432
4433 for (i = 0; i < port->n_rxq; i++) {
4434 netdev_rxq_wait(port->rxqs[i].rx);
4435 }
4436 }
4437 }
4438 ovs_mutex_unlock(&dp->port_mutex);
4439 ovs_mutex_unlock(&dp_netdev_mutex);
4440 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
4441 }
4442
4443 static void
4444 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
4445 {
4446 struct tx_port *tx_port_cached;
4447
4448 /* Flush all the queued packets. */
4449 dp_netdev_pmd_flush_output_packets(pmd, true);
4450 /* Free all used tx queue ids. */
4451 dpif_netdev_xps_revalidate_pmd(pmd, true);
4452
4453 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
4454 free(tx_port_cached);
4455 }
4456 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
4457 free(tx_port_cached);
4458 }
4459 }
4460
4461 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
4462 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
4463 * device, otherwise to 'pmd->send_port_cache' if the port has at least
4464 * one txq. */
4465 static void
4466 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
4467 OVS_REQUIRES(pmd->port_mutex)
4468 {
4469 struct tx_port *tx_port, *tx_port_cached;
4470
4471 pmd_free_cached_ports(pmd);
4472 hmap_shrink(&pmd->send_port_cache);
4473 hmap_shrink(&pmd->tnl_port_cache);
4474
4475 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
4476 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
4477 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4478 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
4479 hash_port_no(tx_port_cached->port->port_no));
4480 }
4481
4482 if (netdev_n_txq(tx_port->port->netdev)) {
4483 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
4484 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
4485 hash_port_no(tx_port_cached->port->port_no));
4486 }
4487 }
4488 }
4489
4490 static void
4491 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4492 {
4493 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4494 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
4495 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
4496 ", numa_id %d.", pmd->core_id, pmd->numa_id);
4497 }
4498 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4499
4500 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
4501 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
4502 }
4503
4504 static void
4505 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
4506 {
4507 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
4508 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
4509 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
4510 }
4511
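/* Editor's note (illustrative sketch, not part of the original file):
 * static tx queue ids come from the datapath-wide 'tx_qid_pool', which keeps
 * the ids dense as pmd threads come and go.  The (hypothetical) fragment
 * below shows the bare id-pool pattern assumed here: id_pool_alloc_id()
 * hands out a free id and id_pool_free_id() returns it to the pool.  The
 * id_pool_create()/id_pool_destroy() signatures are assumed from
 * lib/id-pool.h. */
static inline void
example_id_pool_usage(void)
{
    struct id_pool *pool = id_pool_create(0, 4);   /* Assumed: ids 0..3. */
    uint32_t id;

    if (id_pool_alloc_id(pool, &id)) {
        /* 'id' is now reserved, e.g. usable as a static_tx_qid. */
        id_pool_free_id(pool, id);                 /* Return it when done. */
    }
    id_pool_destroy(pool);
}
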
4512 static int
4513 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
4514 struct polled_queue **ppoll_list)
4515 {
4516 struct polled_queue *poll_list = *ppoll_list;
4517 struct rxq_poll *poll;
4518 int i;
4519
4520 ovs_mutex_lock(&pmd->port_mutex);
4521 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
4522 * sizeof *poll_list);
4523
4524 i = 0;
4525 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4526 poll_list[i].rxq = poll->rxq;
4527 poll_list[i].port_no = poll->rxq->port->port_no;
4528 i++;
4529 }
4530
4531 pmd_load_cached_ports(pmd);
4532
4533 ovs_mutex_unlock(&pmd->port_mutex);
4534
4535 *ppoll_list = poll_list;
4536 return i;
4537 }
4538
4539 static void *
4540 pmd_thread_main(void *f_)
4541 {
4542 struct dp_netdev_pmd_thread *pmd = f_;
4543 struct pmd_perf_stats *s = &pmd->perf_stats;
4544 unsigned int lc = 0;
4545 struct polled_queue *poll_list;
4546 bool exiting;
4547 int poll_cnt;
4548 int i;
4549 int process_packets = 0;
4550
4551 poll_list = NULL;
4552
4553 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
4554 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
4555 ovs_numa_thread_setaffinity_core(pmd->core_id);
4556 dpdk_set_lcore_id(pmd->core_id);
4557 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
4558 emc_cache_init(&pmd->flow_cache);
4559 reload:
4560 pmd_alloc_static_tx_qid(pmd);
4561
4562 /* List port/core affinity */
4563 for (i = 0; i < poll_cnt; i++) {
4564 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
4565 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
4566 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
4567 /* Reset the rxq current cycles counter. */
4568 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
4569 }
4570
4571 if (!poll_cnt) {
4572 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
4573 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
4574 poll_block();
4575 }
4576 lc = UINT_MAX;
4577 }
4578
4579 pmd->intrvl_tsc_prev = 0;
4580 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
4581 cycles_counter_update(s);
4582 /* Protect pmd stats from external clearing while polling. */
4583 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
4584 for (;;) {
4585 uint64_t rx_packets = 0, tx_packets = 0;
4586
4587 pmd_perf_start_iteration(s);
4588
4589 for (i = 0; i < poll_cnt; i++) {
4590 process_packets =
4591 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
4592 poll_list[i].port_no);
4593 rx_packets += process_packets;
4594 }
4595
4596 if (!rx_packets) {
4597 /* We didn't receive anything in the process loop.
4598 * Check if we need to send something.
4599 * There were no time updates during the current iteration. */
4600 pmd_thread_ctx_time_update(pmd);
4601 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
4602 }
4603
4604 if (lc++ > 1024) {
4605 bool reload;
4606
4607 lc = 0;
4608
4609 coverage_try_clear();
4610 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
4611 if (!ovsrcu_try_quiesce()) {
4612 emc_cache_slow_sweep(&pmd->flow_cache);
4613 }
4614
4615 atomic_read_relaxed(&pmd->reload, &reload);
4616 if (reload) {
4617 break;
4618 }
4619 }
4620 pmd_perf_end_iteration(s, rx_packets, tx_packets,
4621 pmd_perf_metrics_enabled(pmd));
4622 }
4623 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
4624
4625 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
4626 exiting = latch_is_set(&pmd->exit_latch);
4627 /* Signal here to make sure the pmd finishes
4628 * reloading the updated configuration. */
4629 dp_netdev_pmd_reload_done(pmd);
4630
4631 pmd_free_static_tx_qid(pmd);
4632
4633 if (!exiting) {
4634 goto reload;
4635 }
4636
4637 emc_cache_uninit(&pmd->flow_cache);
4638 free(poll_list);
4639 pmd_free_cached_ports(pmd);
4640 return NULL;
4641 }
4642
4643 static void
4644 dp_netdev_disable_upcall(struct dp_netdev *dp)
4645 OVS_ACQUIRES(dp->upcall_rwlock)
4646 {
4647 fat_rwlock_wrlock(&dp->upcall_rwlock);
4648 }
4649
4650 \f
4651 /* Meters */
4652 static void
4653 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
4654 struct ofputil_meter_features *features)
4655 {
4656 features->max_meters = MAX_METERS;
4657 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
4658 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
4659 features->max_bands = MAX_BANDS;
4660 features->max_color = 0;
4661 }
4662
4663 /* Applies meter 'meter_id' to 'packets_', deleting packets that exceed a band. */
4664 static void
4665 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
4666 uint32_t meter_id, long long int now)
4667 {
4668 struct dp_meter *meter;
4669 struct dp_meter_band *band;
4670 struct dp_packet *packet;
4671 long long int long_delta_t; /* msec */
4672 uint32_t delta_t; /* msec */
4673 const size_t cnt = dp_packet_batch_size(packets_);
4674 uint32_t bytes, volume;
4675 int exceeded_band[NETDEV_MAX_BURST];
4676 uint32_t exceeded_rate[NETDEV_MAX_BURST];
4677 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
4678
4679 if (meter_id >= MAX_METERS) {
4680 return;
4681 }
4682
4683 meter_lock(dp, meter_id);
4684 meter = dp->meters[meter_id];
4685 if (!meter) {
4686 goto out;
4687 }
4688
4689 /* Initialize as negative values. */
4690 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
4691 /* Initialize as zeroes. */
4692 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
4693
4694 /* All packets will hit the meter at the same time. */
4695 long_delta_t = (now - meter->used) / 1000; /* msec */
4696
4697 /* Make sure delta_t will not be too large, so that bucket will not
4698 * wrap around below. */
4699 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
4700 ? meter->max_delta_t : (uint32_t)long_delta_t;
4701
4702 /* Update meter stats. */
4703 meter->used = now;
4704 meter->packet_count += cnt;
4705 bytes = 0;
4706 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
4707 bytes += dp_packet_size(packet);
4708 }
4709 meter->byte_count += bytes;
4710
4711 /* Meters can operate in terms of packets per second or kilobits per
4712 * second. */
4713 if (meter->flags & OFPMF13_PKTPS) {
4714 /* Rate in packets/second, bucket 1/1000 packets. */
4715 /* msec * packets/sec = 1/1000 packets. */
4716 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
4717 } else {
4718 /* Rate in kbps, bucket in bits. */
4719 /* msec * kbps = bits */
4720 volume = bytes * 8;
4721 }
4722
4723 /* Update all bands and find the one hit with the highest rate for each
4724 * packet (if any). */
4725 for (int m = 0; m < meter->n_bands; ++m) {
4726 band = &meter->bands[m];
4727
4728 /* Update band's bucket. */
4729 band->bucket += delta_t * band->up.rate;
4730 if (band->bucket > band->up.burst_size) {
4731 band->bucket = band->up.burst_size;
4732 }
4733
4734 /* Drain the bucket for all the packets, if possible. */
4735 if (band->bucket >= volume) {
4736 band->bucket -= volume;
4737 } else {
4738 int band_exceeded_pkt;
4739
4740 /* Band limit hit, must process packet-by-packet. */
4741 if (meter->flags & OFPMF13_PKTPS) {
4742 band_exceeded_pkt = band->bucket / 1000;
4743 band->bucket %= 1000; /* Remainder stays in bucket. */
4744
4745 /* Update the exceeding band for each exceeding packet.
4746 * (Only one band will be fired by a packet, and that
4747 * can be different for each packet.) */
4748 for (int i = band_exceeded_pkt; i < cnt; i++) {
4749 if (band->up.rate > exceeded_rate[i]) {
4750 exceeded_rate[i] = band->up.rate;
4751 exceeded_band[i] = m;
4752 }
4753 }
4754 } else {
4755 /* Packet sizes differ, must process one-by-one. */
4756 band_exceeded_pkt = cnt;
4757 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
4758 uint32_t bits = dp_packet_size(packet) * 8;
4759
4760 if (band->bucket >= bits) {
4761 band->bucket -= bits;
4762 } else {
4763 if (i < band_exceeded_pkt) {
4764 band_exceeded_pkt = i;
4765 }
4766 /* Update the exceeding band for the exceeding packet.
4767 * (Only one band will be fired by a packet, and that
4768 * can be different for each packet.) */
4769 if (band->up.rate > exceeded_rate[i]) {
4770 exceeded_rate[i] = band->up.rate;
4771 exceeded_band[i] = m;
4772 }
4773 }
4774 }
4775 }
4776 /* Remember the first exceeding packet. */
4777 if (exceeded_pkt > band_exceeded_pkt) {
4778 exceeded_pkt = band_exceeded_pkt;
4779 }
4780 }
4781 }
4782
4783 /* Fire the highest rate band exceeded by each packet.
4784 * Packets that exceeded a band are deleted here; the remaining packets are
4785 * refilled into the batch in their original relative order. */
4786 size_t j;
4787 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
4788 if (exceeded_band[j] >= 0) {
4789 /* Meter drop packet. */
4790 band = &meter->bands[exceeded_band[j]];
4791 band->packet_count += 1;
4792 band->byte_count += dp_packet_size(packet);
4793
4794 dp_packet_delete(packet);
4795 } else {
4796 /* Meter accepts packet. */
4797 dp_packet_batch_refill(packets_, packet, j);
4798 }
4799 }
4800 out:
4801 meter_unlock(dp, meter_id);
4802 }
4803
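/* Editor's note (illustrative sketch, not part of the original file):
 * In kbps mode the bucket is kept in bits, which makes the refill a plain
 * multiplication: msec * kbps = bits.  The standalone (hypothetical) helper
 * below mirrors the per-band refill/drain above, simplified to one packet:
 * refill by 'delta_t_msec', cap at the burst size, then try to drain the
 * packet's size in bits.  Returns true if the packet fits (is not dropped
 * by this band).  E.g. rate = 1000 kbps and delta_t = 10 ms refill
 * 10000 bits, so a 1500-byte (12000-bit) packet still needs credit left in
 * the bucket. */
static inline bool
example_kbps_band_admit(uint64_t *bucket_bits, uint32_t rate_kbps,
                        uint32_t burst_bits, uint32_t delta_t_msec,
                        uint32_t packet_bytes)
{
    uint64_t bits = (uint64_t) packet_bytes * 8;

    *bucket_bits += (uint64_t) delta_t_msec * rate_kbps;  /* msec * kbps = bits. */
    if (*bucket_bits > burst_bits) {
        *bucket_bits = burst_bits;                        /* Cap at burst size. */
    }
    if (*bucket_bits >= bits) {
        *bucket_bits -= bits;                             /* Drain and admit. */
        return true;
    }
    return false;                                         /* Band exceeded. */
}
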
4804 /* Meter set/get/del processing is still single-threaded. */
4805 static int
4806 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
4807 struct ofputil_meter_config *config)
4808 {
4809 struct dp_netdev *dp = get_dp_netdev(dpif);
4810 uint32_t mid = meter_id->uint32;
4811 struct dp_meter *meter;
4812 int i;
4813
4814 if (mid >= MAX_METERS) {
4815 return EFBIG; /* Meter_id out of range. */
4816 }
4817
4818 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK ||
4819 !(config->flags & (OFPMF13_KBPS | OFPMF13_PKTPS))) {
4820 return EBADF; /* Unsupported flags set */
4821 }
4822
4823 /* Validate bands */
4824 if (config->n_bands == 0 || config->n_bands > MAX_BANDS) {
4825 return EINVAL; /* Zero or too many bands. */
4826 }
4827
4828 /* Validate rates */
4829 for (i = 0; i < config->n_bands; i++) {
4830 if (config->bands[i].rate == 0) {
4831 return EDOM; /* rate must be non-zero */
4832 }
4833 }
4834
4835 for (i = 0; i < config->n_bands; ++i) {
4836 switch (config->bands[i].type) {
4837 case OFPMBT13_DROP:
4838 break;
4839 default:
4840 return ENODEV; /* Unsupported band type */
4841 }
4842 }
4843
4844 /* Allocate meter */
4845 meter = xzalloc(sizeof *meter
4846 + config->n_bands * sizeof(struct dp_meter_band));
4847 if (meter) {
4848 meter->flags = config->flags;
4849 meter->n_bands = config->n_bands;
4850 meter->max_delta_t = 0;
4851 meter->used = time_usec();
4852
4853 /* set up bands */
4854 for (i = 0; i < config->n_bands; ++i) {
4855 uint32_t band_max_delta_t;
4856
4857 /* Set burst size to a workable value if none specified. */
4858 if (config->bands[i].burst_size == 0) {
4859 config->bands[i].burst_size = config->bands[i].rate;
4860 }
4861
4862 meter->bands[i].up = config->bands[i];
4863 /* Convert burst size to the bucket units: */
4864 /* pkts => 1/1000 packets, kilobits => bits. */
4865 meter->bands[i].up.burst_size *= 1000;
4866 /* Initialize bucket to empty. */
4867 meter->bands[i].bucket = 0;
4868
4869 /* Figure out max delta_t that is enough to fill any bucket. */
4870 band_max_delta_t
4871 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
4872 if (band_max_delta_t > meter->max_delta_t) {
4873 meter->max_delta_t = band_max_delta_t;
4874 }
4875 }
4876
4877 meter_lock(dp, mid);
4878 dp_delete_meter(dp, mid); /* Free existing meter, if any */
4879 dp->meters[mid] = meter;
4880 meter_unlock(dp, mid);
4881
4882 return 0;
4883 }
4884 return ENOMEM;
4885 }
4886
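/* Editor's note (illustrative worked example, not part of the original file):
 * For a band configured as rate = 1000 kbps with no burst size, the code
 * above defaults burst_size to the rate (1000), converts it to bucket units
 * (1000 * 1000 = 1,000,000 bits), and derives band_max_delta_t =
 * 1,000,000 / 1000 = 1000 msec, i.e. one second of idle time is enough to
 * refill that bucket completely.  'max_delta_t' caps the refill at the
 * largest such value across all bands. */
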
4887 static int
4888 dpif_netdev_meter_get(const struct dpif *dpif,
4889 ofproto_meter_id meter_id_,
4890 struct ofputil_meter_stats *stats, uint16_t n_bands)
4891 {
4892 const struct dp_netdev *dp = get_dp_netdev(dpif);
4893 const struct dp_meter *meter;
4894 uint32_t meter_id = meter_id_.uint32;
4895
4896 if (meter_id >= MAX_METERS) {
4897 return EFBIG;
4898 }
4899 meter = dp->meters[meter_id];
4900 if (!meter) {
4901 return ENOENT;
4902 }
4903 if (stats) {
4904 int i = 0;
4905
4906 meter_lock(dp, meter_id);
4907 stats->packet_in_count = meter->packet_count;
4908 stats->byte_in_count = meter->byte_count;
4909
4910 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
4911 stats->bands[i].packet_count = meter->bands[i].packet_count;
4912 stats->bands[i].byte_count = meter->bands[i].byte_count;
4913 }
4914 meter_unlock(dp, meter_id);
4915
4916 stats->n_bands = i;
4917 }
4918 return 0;
4919 }
4920
4921 static int
4922 dpif_netdev_meter_del(struct dpif *dpif,
4923 ofproto_meter_id meter_id_,
4924 struct ofputil_meter_stats *stats, uint16_t n_bands)
4925 {
4926 struct dp_netdev *dp = get_dp_netdev(dpif);
4927 int error;
4928
4929 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
4930 if (!error) {
4931 uint32_t meter_id = meter_id_.uint32;
4932
4933 meter_lock(dp, meter_id);
4934 dp_delete_meter(dp, meter_id);
4935 meter_unlock(dp, meter_id);
4936 }
4937 return error;
4938 }
4939
4940 \f
4941 static void
4942 dpif_netdev_disable_upcall(struct dpif *dpif)
4943 OVS_NO_THREAD_SAFETY_ANALYSIS
4944 {
4945 struct dp_netdev *dp = get_dp_netdev(dpif);
4946 dp_netdev_disable_upcall(dp);
4947 }
4948
4949 static void
4950 dp_netdev_enable_upcall(struct dp_netdev *dp)
4951 OVS_RELEASES(dp->upcall_rwlock)
4952 {
4953 fat_rwlock_unlock(&dp->upcall_rwlock);
4954 }
4955
4956 static void
4957 dpif_netdev_enable_upcall(struct dpif *dpif)
4958 OVS_NO_THREAD_SAFETY_ANALYSIS
4959 {
4960 struct dp_netdev *dp = get_dp_netdev(dpif);
4961 dp_netdev_enable_upcall(dp);
4962 }
4963
4964 static void
4965 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
4966 {
4967 ovs_mutex_lock(&pmd->cond_mutex);
4968 atomic_store_relaxed(&pmd->reload, false);
4969 pmd->last_reload_seq = seq_read(pmd->reload_seq);
4970 xpthread_cond_signal(&pmd->cond);
4971 ovs_mutex_unlock(&pmd->cond_mutex);
4972 }
4973
4974 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
4975 * the pointer on success, otherwise NULL (it can return NULL even if
4976 * 'core_id' is NON_PMD_CORE_ID).
4977 *
4978 * The caller must unref the returned reference. */
4979 static struct dp_netdev_pmd_thread *
4980 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
4981 {
4982 struct dp_netdev_pmd_thread *pmd;
4983 const struct cmap_node *pnode;
4984
4985 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
4986 if (!pnode) {
4987 return NULL;
4988 }
4989 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
4990
4991 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
4992 }
4993
4994 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
4995 static void
4996 dp_netdev_set_nonpmd(struct dp_netdev *dp)
4997 OVS_REQUIRES(dp->port_mutex)
4998 {
4999 struct dp_netdev_pmd_thread *non_pmd;
5000
5001 non_pmd = xzalloc(sizeof *non_pmd);
5002 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
5003 }
5004
5005 /* Caller must have valid pointer to 'pmd'. */
5006 static bool
5007 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5008 {
5009 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5010 }
5011
5012 static void
5013 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5014 {
5015 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5016 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5017 }
5018 }
5019
5020 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
5021 * fails, keeps checking the next nodes until reaching the end of the cmap.
5022 *
5023 * The caller must unref the returned reference. */
5024 static struct dp_netdev_pmd_thread *
5025 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5026 {
5027 struct dp_netdev_pmd_thread *next;
5028
5029 do {
5030 struct cmap_node *node;
5031
5032 node = cmap_next_position(&dp->poll_threads, pos);
5033 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5034 : NULL;
5035 } while (next && !dp_netdev_pmd_try_ref(next));
5036
5037 return next;
5038 }
5039
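/* Editor's note (illustrative sketch, not part of the original file):
 * dp_netdev_pmd_get_next() is driven by a caller-held 'struct cmap_position'
 * cursor.  A (hypothetical) walk over every pmd thread, dropping each
 * reference once done with it, could look like this; zero-initializing the
 * cursor to start from the beginning is assumed. */
static inline void
example_walk_all_pmds(struct dp_netdev *dp)
{
    struct cmap_position pos;
    struct dp_netdev_pmd_thread *pmd;

    memset(&pos, 0, sizeof pos);                   /* Start of the cmap. */
    while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
        /* ... inspect 'pmd' here ... */
        dp_netdev_pmd_unref(pmd);                  /* Drop the taken ref. */
    }
}
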
5040 /* Configures the 'pmd' based on the input argument. */
5041 static void
5042 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
5043 unsigned core_id, int numa_id)
5044 {
5045 pmd->dp = dp;
5046 pmd->core_id = core_id;
5047 pmd->numa_id = numa_id;
5048 pmd->need_reload = false;
5049 pmd->n_output_batches = 0;
5050
5051 ovs_refcount_init(&pmd->ref_cnt);
5052 latch_init(&pmd->exit_latch);
5053 pmd->reload_seq = seq_create();
5054 pmd->last_reload_seq = seq_read(pmd->reload_seq);
5055 atomic_init(&pmd->reload, false);
5056 xpthread_cond_init(&pmd->cond, NULL);
5057 ovs_mutex_init(&pmd->cond_mutex);
5058 ovs_mutex_init(&pmd->flow_mutex);
5059 ovs_mutex_init(&pmd->port_mutex);
5060 cmap_init(&pmd->flow_table);
5061 cmap_init(&pmd->classifiers);
5062 pmd->ctx.last_rxq = NULL;
5063 pmd_thread_ctx_time_update(pmd);
5064 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
5065 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
5066 hmap_init(&pmd->poll_list);
5067 hmap_init(&pmd->tx_ports);
5068 hmap_init(&pmd->tnl_port_cache);
5069 hmap_init(&pmd->send_port_cache);
5070 /* Initialize the 'flow_cache' here, since there is no
5071 * actual thread created for NON_PMD_CORE_ID. */
5072 if (core_id == NON_PMD_CORE_ID) {
5073 emc_cache_init(&pmd->flow_cache);
5074 pmd_alloc_static_tx_qid(pmd);
5075 }
5076 pmd_perf_stats_init(&pmd->perf_stats);
5077 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
5078 hash_int(core_id, 0));
5079 }
5080
5081 static void
5082 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
5083 {
5084 struct dpcls *cls;
5085
5086 dp_netdev_pmd_flow_flush(pmd);
5087 hmap_destroy(&pmd->send_port_cache);
5088 hmap_destroy(&pmd->tnl_port_cache);
5089 hmap_destroy(&pmd->tx_ports);
5090 hmap_destroy(&pmd->poll_list);
5091 /* All flows (including their dpcls_rules) have been deleted already */
5092 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
5093 dpcls_destroy(cls);
5094 ovsrcu_postpone(free, cls);
5095 }
5096 cmap_destroy(&pmd->classifiers);
5097 cmap_destroy(&pmd->flow_table);
5098 ovs_mutex_destroy(&pmd->flow_mutex);
5099 latch_destroy(&pmd->exit_latch);
5100 seq_destroy(pmd->reload_seq);
5101 xpthread_cond_destroy(&pmd->cond);
5102 ovs_mutex_destroy(&pmd->cond_mutex);
5103 ovs_mutex_destroy(&pmd->port_mutex);
5104 free(pmd);
5105 }
5106
5107 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
5108 * and unrefs the struct. */
5109 static void
5110 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
5111 {
5112 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
5113 * but extra cleanup is necessary */
5114 if (pmd->core_id == NON_PMD_CORE_ID) {
5115 ovs_mutex_lock(&dp->non_pmd_mutex);
5116 emc_cache_uninit(&pmd->flow_cache);
5117 pmd_free_cached_ports(pmd);
5118 pmd_free_static_tx_qid(pmd);
5119 ovs_mutex_unlock(&dp->non_pmd_mutex);
5120 } else {
5121 latch_set(&pmd->exit_latch);
5122 dp_netdev_reload_pmd__(pmd);
5123 xpthread_join(pmd->thread, NULL);
5124 }
5125
5126 dp_netdev_pmd_clear_ports(pmd);
5127
5128 /* Purges the 'pmd''s flows after stopping the thread, but before
5129 * destroying the flows, so that the flow stats can be collected. */
5130 if (dp->dp_purge_cb) {
5131 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
5132 }
5133 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
5134 dp_netdev_pmd_unref(pmd);
5135 }
5136
5137 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
5138 * thread. */
5139 static void
5140 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
5141 {
5142 struct dp_netdev_pmd_thread *pmd;
5143 struct dp_netdev_pmd_thread **pmd_list;
5144 size_t k = 0, n_pmds;
5145
5146 n_pmds = cmap_count(&dp->poll_threads);
5147 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
5148
5149 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5150 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
5151 continue;
5152 }
5153 /* We cannot call dp_netdev_del_pmd(), since it alters
5154 * 'dp->poll_threads' (while we're iterating it) and it
5155 * might quiesce. */
5156 ovs_assert(k < n_pmds);
5157 pmd_list[k++] = pmd;
5158 }
5159
5160 for (size_t i = 0; i < k; i++) {
5161 dp_netdev_del_pmd(dp, pmd_list[i]);
5162 }
5163 free(pmd_list);
5164 }
5165
5166 /* Deletes all rx queues from pmd->poll_list and all the ports from
5167 * pmd->tx_ports. */
5168 static void
5169 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
5170 {
5171 struct rxq_poll *poll;
5172 struct tx_port *port;
5173
5174 ovs_mutex_lock(&pmd->port_mutex);
5175 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
5176 free(poll);
5177 }
5178 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
5179 free(port);
5180 }
5181 ovs_mutex_unlock(&pmd->port_mutex);
5182 }
5183
5184 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
5185 static void
5186 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
5187 struct dp_netdev_rxq *rxq)
5188 OVS_REQUIRES(pmd->port_mutex)
5189 {
5190 int qid = netdev_rxq_get_queue_id(rxq->rx);
5191 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
5192 struct rxq_poll *poll;
5193
5194 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
5195 if (poll->rxq == rxq) {
5196 /* 'rxq' is already polled by this thread. Do nothing. */
5197 return;
5198 }
5199 }
5200
5201 poll = xmalloc(sizeof *poll);
5202 poll->rxq = rxq;
5203 hmap_insert(&pmd->poll_list, &poll->node, hash);
5204
5205 pmd->need_reload = true;
5206 }
5207
5208 /* Delete 'poll' from poll_list of PMD thread. */
5209 static void
5210 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
5211 struct rxq_poll *poll)
5212 OVS_REQUIRES(pmd->port_mutex)
5213 {
5214 hmap_remove(&pmd->poll_list, &poll->node);
5215 free(poll);
5216
5217 pmd->need_reload = true;
5218 }
5219
5220 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
5221 * changes to take effect. */
5222 static void
5223 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
5224 struct dp_netdev_port *port)
5225 OVS_REQUIRES(pmd->port_mutex)
5226 {
5227 struct tx_port *tx;
5228
5229 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
5230 if (tx) {
5231 /* 'port' is already on this thread tx cache. Do nothing. */
5232 return;
5233 }
5234
5235 tx = xzalloc(sizeof *tx);
5236
5237 tx->port = port;
5238 tx->qid = -1;
5239 tx->flush_time = 0LL;
5240 dp_packet_batch_init(&tx->output_pkts);
5241
5242 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
5243 pmd->need_reload = true;
5244 }
5245
5246 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
5247 * changes to take effect. */
5248 static void
5249 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
5250 struct tx_port *tx)
5251 OVS_REQUIRES(pmd->port_mutex)
5252 {
5253 hmap_remove(&pmd->tx_ports, &tx->node);
5254 free(tx);
5255 pmd->need_reload = true;
5256 }
5257 \f
5258 static char *
5259 dpif_netdev_get_datapath_version(void)
5260 {
5261 return xstrdup("<built-in>");
5262 }
5263
5264 static void
5265 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
5266 uint16_t tcp_flags, long long now)
5267 {
5268 uint16_t flags;
5269
5270 atomic_store_relaxed(&netdev_flow->stats.used, now);
5271 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
5272 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
5273 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
5274 flags |= tcp_flags;
5275 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
5276 }
5277
5278 static int
5279 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
5280 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
5281 enum dpif_upcall_type type, const struct nlattr *userdata,
5282 struct ofpbuf *actions, struct ofpbuf *put_actions)
5283 {
5284 struct dp_netdev *dp = pmd->dp;
5285
5286 if (OVS_UNLIKELY(!dp->upcall_cb)) {
5287 return ENODEV;
5288 }
5289
5290 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
5291 struct ds ds = DS_EMPTY_INITIALIZER;
5292 char *packet_str;
5293 struct ofpbuf key;
5294 struct odp_flow_key_parms odp_parms = {
5295 .flow = flow,
5296 .mask = wc ? &wc->masks : NULL,
5297 .support = dp_netdev_support,
5298 };
5299
5300 ofpbuf_init(&key, 0);
5301 odp_flow_key_from_flow(&odp_parms, &key);
5302 packet_str = ofp_dp_packet_to_string(packet_);
5303
5304 odp_flow_key_format(key.data, key.size, &ds);
5305
5306 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
5307 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
5308
5309 ofpbuf_uninit(&key);
5310 free(packet_str);
5311
5312 ds_destroy(&ds);
5313 }
5314
5315 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
5316 actions, wc, put_actions, dp->upcall_aux);
5317 }
5318
5319 static inline uint32_t
5320 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
5321 const struct miniflow *mf)
5322 {
5323 uint32_t hash;
5324
5325 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5326 hash = dp_packet_get_rss_hash(packet);
5327 } else {
5328 hash = miniflow_hash_5tuple(mf, 0);
5329 dp_packet_set_rss_hash(packet, hash);
5330 }
5331
5332 return hash;
5333 }
5334
5335 static inline uint32_t
5336 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
5337 const struct miniflow *mf)
5338 {
5339 uint32_t hash, recirc_depth;
5340
5341 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
5342 hash = dp_packet_get_rss_hash(packet);
5343 } else {
5344 hash = miniflow_hash_5tuple(mf, 0);
5345 dp_packet_set_rss_hash(packet, hash);
5346 }
5347
5348 /* The RSS hash must account for the recirculation depth to avoid
5349 * collisions in the exact match cache */
5350 recirc_depth = *recirc_depth_get_unsafe();
5351 if (OVS_UNLIKELY(recirc_depth)) {
5352 hash = hash_finish(hash, recirc_depth);
5353 dp_packet_set_rss_hash(packet, hash);
5354 }
5355 return hash;
5356 }
5357
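/* Editor's note (descriptive comment, not part of the original file):
 * A recirculated packet carries the same NIC RSS hash as before
 * recirculation, while its datapath flow (and thus its EMC entry) differs
 * at each recirculation depth.  Folding the depth into the hash with
 * hash_finish() keeps those lookups from aliasing the same EMC buckets; the
 * *_orig_pkt variant above skips this step because the depth is always zero
 * on first reception. */
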
5358 struct packet_batch_per_flow {
5359 unsigned int byte_count;
5360 uint16_t tcp_flags;
5361 struct dp_netdev_flow *flow;
5362
5363 struct dp_packet_batch array;
5364 };
5365
5366 static inline void
5367 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
5368 struct dp_packet *packet,
5369 const struct miniflow *mf)
5370 {
5371 batch->byte_count += dp_packet_size(packet);
5372 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
5373 batch->array.packets[batch->array.count++] = packet;
5374 }
5375
5376 static inline void
5377 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
5378 struct dp_netdev_flow *flow)
5379 {
5380 flow->batch = batch;
5381
5382 batch->flow = flow;
5383 dp_packet_batch_init(&batch->array);
5384 batch->byte_count = 0;
5385 batch->tcp_flags = 0;
5386 }
5387
5388 static inline void
5389 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
5390 struct dp_netdev_pmd_thread *pmd)
5391 {
5392 struct dp_netdev_actions *actions;
5393 struct dp_netdev_flow *flow = batch->flow;
5394
5395 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
5396 batch->tcp_flags, pmd->ctx.now / 1000);
5397
5398 actions = dp_netdev_flow_get_actions(flow);
5399
5400 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
5401 actions->actions, actions->size);
5402 }
5403
5404 static inline void
5405 dp_netdev_queue_batches(struct dp_packet *pkt,
5406 struct dp_netdev_flow *flow, const struct miniflow *mf,
5407 struct packet_batch_per_flow *batches,
5408 size_t *n_batches)
5409 {
5410 struct packet_batch_per_flow *batch = flow->batch;
5411
5412 if (OVS_UNLIKELY(!batch)) {
5413 batch = &batches[(*n_batches)++];
5414 packet_batch_per_flow_init(batch, flow);
5415 }
5416
5417 packet_batch_per_flow_update(batch, pkt, mf);
5418 }
5419
5420 /* Tries to process all of the ('cnt') 'packets' using only the exact match
5421 * cache 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]',
5422 * its miniflow is copied into 'keys' and the packet pointer is moved to the
5423 * beginning of the 'packets' array.
5424 *
5425 * The function returns the number of packets that need to be processed in the
5426 * 'packets' array (they have been moved to the beginning of the vector).
5427 *
5428 * For performance reasons a caller may choose not to initialize the metadata
5429 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
5430 * is not valid and must be initialized by this function using 'port_no'.
5431 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
5432 * will be ignored.
5433 */
5434 static inline size_t
5435 emc_processing(struct dp_netdev_pmd_thread *pmd,
5436 struct dp_packet_batch *packets_,
5437 struct netdev_flow_key *keys,
5438 struct packet_batch_per_flow batches[], size_t *n_batches,
5439 bool md_is_valid, odp_port_t port_no)
5440 {
5441 struct emc_cache *flow_cache = &pmd->flow_cache;
5442 struct netdev_flow_key *key = &keys[0];
5443 size_t n_missed = 0, n_dropped = 0;
5444 struct dp_packet *packet;
5445 const size_t cnt = dp_packet_batch_size(packets_);
5446 uint32_t cur_min;
5447 int i;
5448
5449 atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
5450 pmd_perf_update_counter(&pmd->perf_stats,
5451 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
5452 cnt);
5453
5454 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
5455 struct dp_netdev_flow *flow;
5456
5457 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
5458 dp_packet_delete(packet);
5459 n_dropped++;
5460 continue;
5461 }
5462
5463 if (i != cnt - 1) {
5464 struct dp_packet **packets = packets_->packets;
5465 /* Prefetch next packet data and metadata. */
5466 OVS_PREFETCH(dp_packet_data(packets[i+1]));
5467 pkt_metadata_prefetch_init(&packets[i+1]->md);
5468 }
5469
5470 if (!md_is_valid) {
5471 pkt_metadata_init(&packet->md, port_no);
5472 }
5473 miniflow_extract(packet, &key->mf);
5474 key->len = 0; /* Not computed yet. */
5475 /* If EMC is disabled skip hash computation and emc_lookup */
5476 if (cur_min) {
5477 if (!md_is_valid) {
5478 key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet,
5479 &key->mf);
5480 } else {
5481 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
5482 }
5483 flow = emc_lookup(flow_cache, key);
5484 } else {
5485 flow = NULL;
5486 }
5487 if (OVS_LIKELY(flow)) {
5488 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
5489 n_batches);
5490 } else {
5491 /* Exact match cache missed. Group missed packets together at
5492 * the beginning of the 'packets' array. */
5493 dp_packet_batch_refill(packets_, packet, i);
5494 /* 'key[n_missed]' contains the key of the current packet and it
5495 * must be returned to the caller. The next key should be extracted
5496 * to 'keys[n_missed + 1]'. */
5497 key = &keys[++n_missed];
5498 }
5499 }
5500
5501 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT,
5502 cnt - n_dropped - n_missed);
5503
5504 return dp_packet_batch_size(packets_);
5505 }
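/* For example, given a burst of four packets where packets 0 and 2 hit the
 * EMC and packets 1 and 3 miss, emc_processing() queues packets 0 and 2 into
 * their flows' batches, refills 'packets_' so that packets 1 and 3 occupy
 * indices 0 and 1, leaves their extracted miniflows in keys[0] and keys[1],
 * and returns 2, the number of packets left for fast_path_processing(). */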
5506
5507 static inline int
5508 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
5509 struct dp_packet *packet,
5510 const struct netdev_flow_key *key,
5511 struct ofpbuf *actions, struct ofpbuf *put_actions)
5512 {
5513 struct ofpbuf *add_actions;
5514 struct dp_packet_batch b;
5515 struct match match;
5516 ovs_u128 ufid;
5517 int error;
5518 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
5519
5520 match.tun_md.valid = false;
5521 miniflow_expand(&key->mf, &match.flow);
5522
5523 ofpbuf_clear(actions);
5524 ofpbuf_clear(put_actions);
5525
5526 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
5527 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
5528 &ufid, DPIF_UC_MISS, NULL, actions,
5529 put_actions);
5530 if (OVS_UNLIKELY(error && error != ENOSPC)) {
5531 dp_packet_delete(packet);
5532 return error;
5533 }
5534
5535 /* The Netlink encoding of datapath flow keys cannot express
5536 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
5537 * tag is interpreted as exact match on the fact that there is no
5538 * VLAN. Unless we refactor a lot of code that translates between
5539 * Netlink and struct flow representations, we have to do the same
5540 * here. */
5541 if (!match.wc.masks.vlans[0].tci) {
5542 match.wc.masks.vlans[0].tci = htons(0xffff);
5543 }
5544
5545 /* We can't allow the packet batching in the next loop to execute
5546 * the actions. Otherwise, if there are any slow path actions,
5547 * we'll send the packet up twice. */
5548 dp_packet_batch_init_packet(&b, packet);
5549 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
5550 actions->data, actions->size);
5551
5552 add_actions = put_actions->size ? put_actions : actions;
5553 if (OVS_LIKELY(error != ENOSPC)) {
5554 struct dp_netdev_flow *netdev_flow;
5555
5556 /* XXX: There's a race window where a flow covering this packet
5557 * could have already been installed since we last did the flow
5558 * lookup before upcall. This could be solved by moving the
5559 * mutex lock outside the loop, but that's an awful long time
5560 * to be locking everyone out of making flow installs. If we
5561 * move to a per-core classifier, it would be reasonable. */
5562 ovs_mutex_lock(&pmd->flow_mutex);
5563 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
5564 if (OVS_LIKELY(!netdev_flow)) {
5565 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
5566 add_actions->data,
5567 add_actions->size);
5568 }
5569 ovs_mutex_unlock(&pmd->flow_mutex);
5570 emc_probabilistic_insert(pmd, key, netdev_flow);
5571 }
5572 if (pmd_perf_metrics_enabled(pmd)) {
5573 /* Update upcall stats. */
5574 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
5575 struct pmd_perf_stats *s = &pmd->perf_stats;
5576 s->current.upcalls++;
5577 s->current.upcall_cycles += cycles;
5578 histogram_add_sample(&s->cycles_per_upcall, cycles);
5579 }
5580 return error;
5581 }
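/* In short, a miss upcall is handled in three steps: the returned actions are
 * executed immediately on the packet that triggered the upcall; then, unless
 * the upcall returned ENOSPC, the flow is installed under 'pmd->flow_mutex'
 * (skipped if a concurrent install already added it) and the key may be
 * inserted into the EMC by emc_probabilistic_insert(). */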
5582
5583 static inline void
5584 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
5585 struct dp_packet_batch *packets_,
5586 struct netdev_flow_key *keys,
5587 struct packet_batch_per_flow batches[],
5588 size_t *n_batches,
5589 odp_port_t in_port)
5590 {
5591 const size_t cnt = dp_packet_batch_size(packets_);
5592 #if !defined(__CHECKER__) && !defined(_WIN32)
5593 const size_t PKT_ARRAY_SIZE = cnt;
5594 #else
5595     /* Neither Sparse nor MSVC likes variable length arrays. */
5596 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
5597 #endif
5598 struct dp_packet *packet;
5599 struct dpcls *cls;
5600 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
5601 struct dp_netdev *dp = pmd->dp;
5602 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
5603 int lookup_cnt = 0, add_lookup_cnt;
5604 bool any_miss;
5605
5606 for (size_t i = 0; i < cnt; i++) {
5607         /* The key length is needed in all cases; the hash is computed on demand. */
5608 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
5609 }
5610 /* Get the classifier for the in_port */
5611 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
5612 if (OVS_LIKELY(cls)) {
5613 any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
5614 } else {
5615 any_miss = true;
5616 memset(rules, 0, sizeof(rules));
5617 }
5618 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
5619 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
5620 struct ofpbuf actions, put_actions;
5621
5622 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
5623 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
5624
5625 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5626 struct dp_netdev_flow *netdev_flow;
5627
5628 if (OVS_LIKELY(rules[i])) {
5629 continue;
5630 }
5631
5632 /* It's possible that an earlier slow path execution installed
5633 * a rule covering this flow. In this case, it's a lot cheaper
5634              * to catch it here than to execute a miss. */
5635 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
5636 &add_lookup_cnt);
5637 if (netdev_flow) {
5638 lookup_cnt += add_lookup_cnt;
5639 rules[i] = &netdev_flow->cr;
5640 continue;
5641 }
5642
5643 int error = handle_packet_upcall(pmd, packet, &keys[i],
5644 &actions, &put_actions);
5645
5646 if (OVS_UNLIKELY(error)) {
5647 upcall_fail_cnt++;
5648 } else {
5649 upcall_ok_cnt++;
5650 }
5651 }
5652
5653 ofpbuf_uninit(&actions);
5654 ofpbuf_uninit(&put_actions);
5655 fat_rwlock_unlock(&dp->upcall_rwlock);
5656 } else if (OVS_UNLIKELY(any_miss)) {
5657 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5658 if (OVS_UNLIKELY(!rules[i])) {
5659 dp_packet_delete(packet);
5660 upcall_fail_cnt++;
5661 }
5662 }
5663 }
5664
5665 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5666 struct dp_netdev_flow *flow;
5667
5668 if (OVS_UNLIKELY(!rules[i])) {
5669 continue;
5670 }
5671
5672 flow = dp_netdev_flow_cast(rules[i]);
5673
5674 emc_probabilistic_insert(pmd, &keys[i], flow);
5675 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
5676 }
5677
5678 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
5679 cnt - upcall_ok_cnt - upcall_fail_cnt);
5680 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
5681 lookup_cnt);
5682 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
5683 upcall_ok_cnt);
5684 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
5685 upcall_fail_cnt);
5686 }
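/* At this point every packet of the batch has been accounted for: it either
 * matched a dpcls rule (PMD_STAT_MASKED_HIT), was handled by a successful
 * upcall (PMD_STAT_MISS), or was dropped because the upcall failed or
 * upcalls are disabled (PMD_STAT_LOST). */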
5687
5688 /* Packets enter the datapath from a port (or from recirculation) here.
5689 *
5690  * When 'md_is_valid' is true, the metadata in 'packets' is already valid.
5691  * When it is false, the metadata in 'packets' needs to be initialized. */
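/* Processing happens in two passes: emc_processing() consumes the packets
 * that hit the exact match cache and leaves the misses, together with their
 * extracted keys, at the front of the batch; fast_path_processing() then
 * resolves the remainder through the per-port dpcls and, if necessary, the
 * upcall path.  The resulting per-flow batches are executed at the end. */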
5692 static void
5693 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
5694 struct dp_packet_batch *packets,
5695 bool md_is_valid, odp_port_t port_no)
5696 {
5697 #if !defined(__CHECKER__) && !defined(_WIN32)
5698 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
5699 #else
5700     /* Neither Sparse nor MSVC likes variable length arrays. */
5701 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
5702 #endif
5703 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
5704 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
5705 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
5706 size_t n_batches;
5707 odp_port_t in_port;
5708
5709 n_batches = 0;
5710 emc_processing(pmd, packets, keys, batches, &n_batches,
5711 md_is_valid, port_no);
5712 if (!dp_packet_batch_is_empty(packets)) {
5713 /* Get ingress port from first packet's metadata. */
5714 in_port = packets->packets[0]->md.in_port.odp_port;
5715 fast_path_processing(pmd, packets, keys,
5716 batches, &n_batches, in_port);
5717 }
5718
5719     /* All the flow batches need to be reset before any call to
5720      * packet_batch_per_flow_execute(), as it could potentially trigger
5721      * recirculation. When a packet matching flow 'j' happens to be
5722      * recirculated, the nested call to dp_netdev_input__() could classify
5723      * the packet as matching another flow, say 'k'. It could happen that
5724      * flow 'k' already has its own batches[k] from the previous call to
5725      * dp_netdev_input__() still waiting to be served. So if its 'batch'
5726      * member is not reset, the recirculated packet would wrongly be
5727      * appended to batches[k] of the first call to dp_netdev_input__(). */
5728 size_t i;
5729 for (i = 0; i < n_batches; i++) {
5730 batches[i].flow->batch = NULL;
5731 }
5732
5733 for (i = 0; i < n_batches; i++) {
5734 packet_batch_per_flow_execute(&batches[i], pmd);
5735 }
5736 }
5737
5738 static void
5739 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
5740 struct dp_packet_batch *packets,
5741 odp_port_t port_no)
5742 {
5743 dp_netdev_input__(pmd, packets, false, port_no);
5744 }
5745
5746 static void
5747 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
5748 struct dp_packet_batch *packets)
5749 {
5750 dp_netdev_input__(pmd, packets, true, 0);
5751 }
5752
5753 struct dp_netdev_execute_aux {
5754 struct dp_netdev_pmd_thread *pmd;
5755 const struct flow *flow;
5756 };
5757
5758 static void
5759 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
5760 void *aux)
5761 {
5762 struct dp_netdev *dp = get_dp_netdev(dpif);
5763 dp->dp_purge_aux = aux;
5764 dp->dp_purge_cb = cb;
5765 }
5766
5767 static void
5768 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
5769 void *aux)
5770 {
5771 struct dp_netdev *dp = get_dp_netdev(dpif);
5772 dp->upcall_aux = aux;
5773 dp->upcall_cb = cb;
5774 }
5775
5776 static void
5777 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
5778 bool purge)
5779 {
5780 struct tx_port *tx;
5781 struct dp_netdev_port *port;
5782 long long interval;
5783
5784 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
5785 if (!tx->port->dynamic_txqs) {
5786 continue;
5787 }
5788 interval = pmd->ctx.now - tx->last_used;
5789 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
5790 port = tx->port;
5791 ovs_mutex_lock(&port->txq_used_mutex);
5792 port->txq_used[tx->qid]--;
5793 ovs_mutex_unlock(&port->txq_used_mutex);
5794 tx->qid = -1;
5795 }
5796 }
5797 }
5798
5799 static int
5800 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
5801 struct tx_port *tx)
5802 {
5803 struct dp_netdev_port *port;
5804 long long interval;
5805 int i, min_cnt, min_qid;
5806
5807 interval = pmd->ctx.now - tx->last_used;
5808 tx->last_used = pmd->ctx.now;
5809
5810 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
5811 return tx->qid;
5812 }
5813
5814 port = tx->port;
5815
5816 ovs_mutex_lock(&port->txq_used_mutex);
5817 if (tx->qid >= 0) {
5818 port->txq_used[tx->qid]--;
5819 tx->qid = -1;
5820 }
5821
5822 min_cnt = -1;
5823 min_qid = 0;
5824 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
5825 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
5826 min_cnt = port->txq_used[i];
5827 min_qid = i;
5828 }
5829 }
5830
5831 port->txq_used[min_qid]++;
5832 tx->qid = min_qid;
5833
5834 ovs_mutex_unlock(&port->txq_used_mutex);
5835
5836 dpif_netdev_xps_revalidate_pmd(pmd, false);
5837
5838 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
5839 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
5840 return min_qid;
5841 }
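/* Example: for a port with four tx queues whose 'txq_used' counters are
 * {2, 0, 1, 0}, a thread whose cached qid has expired picks queue 1 (the
 * first queue with the lowest usage count), bumps its counter and caches the
 * new qid; subsequent sends reuse it as long as they arrive within
 * XPS_TIMEOUT of each other. */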
5842
5843 static struct tx_port *
5844 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5845 odp_port_t port_no)
5846 {
5847 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
5848 }
5849
5850 static struct tx_port *
5851 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5852 odp_port_t port_no)
5853 {
5854 return tx_port_lookup(&pmd->send_port_cache, port_no);
5855 }
5856
5857 static int
5858 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
5859 const struct nlattr *attr,
5860 struct dp_packet_batch *batch)
5861 {
5862 struct tx_port *tun_port;
5863 const struct ovs_action_push_tnl *data;
5864 int err;
5865
5866 data = nl_attr_get(attr);
5867
5868 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
5869 if (!tun_port) {
5870 err = -EINVAL;
5871 goto error;
5872 }
5873 err = netdev_push_header(tun_port->port->netdev, batch, data);
5874 if (!err) {
5875 return 0;
5876 }
5877 error:
5878 dp_packet_delete_batch(batch, true);
5879 return err;
5880 }
5881
5882 static void
5883 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
5884 struct dp_packet *packet, bool should_steal,
5885 struct flow *flow, ovs_u128 *ufid,
5886 struct ofpbuf *actions,
5887 const struct nlattr *userdata)
5888 {
5889 struct dp_packet_batch b;
5890 int error;
5891
5892 ofpbuf_clear(actions);
5893
5894 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
5895 DPIF_UC_ACTION, userdata, actions,
5896 NULL);
5897 if (!error || error == ENOSPC) {
5898 dp_packet_batch_init_packet(&b, packet);
5899 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
5900 actions->data, actions->size);
5901 } else if (should_steal) {
5902 dp_packet_delete(packet);
5903 }
5904 }
5905
5906 static void
5907 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
5908 const struct nlattr *a, bool should_steal)
5909 OVS_NO_THREAD_SAFETY_ANALYSIS
5910 {
5911 struct dp_netdev_execute_aux *aux = aux_;
5912 uint32_t *depth = recirc_depth_get();
5913 struct dp_netdev_pmd_thread *pmd = aux->pmd;
5914 struct dp_netdev *dp = pmd->dp;
5915 int type = nl_attr_type(a);
5916 struct tx_port *p;
5917
5918 switch ((enum ovs_action_attr)type) {
5919 case OVS_ACTION_ATTR_OUTPUT:
5920 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
5921 if (OVS_LIKELY(p)) {
5922 struct dp_packet *packet;
5923 struct dp_packet_batch out;
5924
5925 if (!should_steal) {
5926 dp_packet_batch_clone(&out, packets_);
5927 dp_packet_batch_reset_cutlen(packets_);
5928 packets_ = &out;
5929 }
5930 dp_packet_batch_apply_cutlen(packets_);
5931
5932 #ifdef DPDK_NETDEV
5933 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
5934 && packets_->packets[0]->source
5935 != p->output_pkts.packets[0]->source)) {
5936 /* XXX: netdev-dpdk assumes that all packets in a single
5937              * output batch have the same source. Flush here to
5938 * avoid memory access issues. */
5939 dp_netdev_pmd_flush_output_on_port(pmd, p);
5940 }
5941 #endif
5942 if (dp_packet_batch_size(&p->output_pkts)
5943 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
5944 /* Flush here to avoid overflow. */
5945 dp_netdev_pmd_flush_output_on_port(pmd, p);
5946 }
5947
5948 if (dp_packet_batch_is_empty(&p->output_pkts)) {
5949 pmd->n_output_batches++;
5950 }
5951
5952 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5953 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
5954 pmd->ctx.last_rxq;
5955 dp_packet_batch_add(&p->output_pkts, packet);
5956 }
5957 return;
5958 }
5959 break;
5960
5961 case OVS_ACTION_ATTR_TUNNEL_PUSH:
5962 if (should_steal) {
5963             /* We're requested to push a tunnel header, but we also need to take
5964              * ownership of these packets. Thus, we can avoid performing
5965 * the action, because the caller will not use the result anyway.
5966 * Just break to free the batch. */
5967 break;
5968 }
5969 dp_packet_batch_apply_cutlen(packets_);
5970 push_tnl_action(pmd, a, packets_);
5971 return;
5972
5973 case OVS_ACTION_ATTR_TUNNEL_POP:
5974 if (*depth < MAX_RECIRC_DEPTH) {
5975 struct dp_packet_batch *orig_packets_ = packets_;
5976 odp_port_t portno = nl_attr_get_odp_port(a);
5977
5978 p = pmd_tnl_port_cache_lookup(pmd, portno);
5979 if (p) {
5980 struct dp_packet_batch tnl_pkt;
5981
5982 if (!should_steal) {
5983 dp_packet_batch_clone(&tnl_pkt, packets_);
5984 packets_ = &tnl_pkt;
5985 dp_packet_batch_reset_cutlen(orig_packets_);
5986 }
5987
5988 dp_packet_batch_apply_cutlen(packets_);
5989
5990 netdev_pop_header(p->port->netdev, packets_);
5991 if (dp_packet_batch_is_empty(packets_)) {
5992 return;
5993 }
5994
5995 struct dp_packet *packet;
5996 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5997 packet->md.in_port.odp_port = portno;
5998 }
5999
6000 (*depth)++;
6001 dp_netdev_recirculate(pmd, packets_);
6002 (*depth)--;
6003 return;
6004 }
6005 }
6006 break;
6007
6008 case OVS_ACTION_ATTR_USERSPACE:
6009 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6010 struct dp_packet_batch *orig_packets_ = packets_;
6011 const struct nlattr *userdata;
6012 struct dp_packet_batch usr_pkt;
6013 struct ofpbuf actions;
6014 struct flow flow;
6015 ovs_u128 ufid;
6016 bool clone = false;
6017
6018 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
6019 ofpbuf_init(&actions, 0);
6020
6021 if (packets_->trunc) {
6022 if (!should_steal) {
6023 dp_packet_batch_clone(&usr_pkt, packets_);
6024 packets_ = &usr_pkt;
6025 clone = true;
6026 dp_packet_batch_reset_cutlen(orig_packets_);
6027 }
6028
6029 dp_packet_batch_apply_cutlen(packets_);
6030 }
6031
6032 struct dp_packet *packet;
6033 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6034 flow_extract(packet, &flow);
6035 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
6036 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
6037 &ufid, &actions, userdata);
6038 }
6039
6040 if (clone) {
6041 dp_packet_delete_batch(packets_, true);
6042 }
6043
6044 ofpbuf_uninit(&actions);
6045 fat_rwlock_unlock(&dp->upcall_rwlock);
6046
6047 return;
6048 }
6049 break;
6050
6051 case OVS_ACTION_ATTR_RECIRC:
6052 if (*depth < MAX_RECIRC_DEPTH) {
6053 struct dp_packet_batch recirc_pkts;
6054
6055 if (!should_steal) {
6056 dp_packet_batch_clone(&recirc_pkts, packets_);
6057 packets_ = &recirc_pkts;
6058 }
6059
6060 struct dp_packet *packet;
6061 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6062 packet->md.recirc_id = nl_attr_get_u32(a);
6063 }
6064
6065 (*depth)++;
6066 dp_netdev_recirculate(pmd, packets_);
6067 (*depth)--;
6068
6069 return;
6070 }
6071
6072 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
6073 break;
6074
6075 case OVS_ACTION_ATTR_CT: {
6076 const struct nlattr *b;
6077 bool force = false;
6078 bool commit = false;
6079 unsigned int left;
6080 uint16_t zone = 0;
6081 const char *helper = NULL;
6082 const uint32_t *setmark = NULL;
6083 const struct ovs_key_ct_labels *setlabel = NULL;
6084 struct nat_action_info_t nat_action_info;
6085 struct nat_action_info_t *nat_action_info_ref = NULL;
6086 bool nat_config = false;
6087
6088 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
6089 nl_attr_get_size(a)) {
6090 enum ovs_ct_attr sub_type = nl_attr_type(b);
6091
6092 switch(sub_type) {
6093 case OVS_CT_ATTR_FORCE_COMMIT:
6094 force = true;
6095 /* fall through. */
6096 case OVS_CT_ATTR_COMMIT:
6097 commit = true;
6098 break;
6099 case OVS_CT_ATTR_ZONE:
6100 zone = nl_attr_get_u16(b);
6101 break;
6102 case OVS_CT_ATTR_HELPER:
6103 helper = nl_attr_get_string(b);
6104 break;
6105 case OVS_CT_ATTR_MARK:
6106 setmark = nl_attr_get(b);
6107 break;
6108 case OVS_CT_ATTR_LABELS:
6109 setlabel = nl_attr_get(b);
6110 break;
6111 case OVS_CT_ATTR_EVENTMASK:
6112                 /* Silently ignored, as the userspace datapath does not generate
6113 * netlink events. */
6114 break;
6115 case OVS_CT_ATTR_NAT: {
6116 const struct nlattr *b_nest;
6117 unsigned int left_nest;
6118 bool ip_min_specified = false;
6119 bool proto_num_min_specified = false;
6120 bool ip_max_specified = false;
6121 bool proto_num_max_specified = false;
6122 memset(&nat_action_info, 0, sizeof nat_action_info);
6123 nat_action_info_ref = &nat_action_info;
6124
6125 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
6126 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
6127
6128 switch (sub_type_nest) {
6129 case OVS_NAT_ATTR_SRC:
6130 case OVS_NAT_ATTR_DST:
6131 nat_config = true;
6132 nat_action_info.nat_action |=
6133 ((sub_type_nest == OVS_NAT_ATTR_SRC)
6134 ? NAT_ACTION_SRC : NAT_ACTION_DST);
6135 break;
6136 case OVS_NAT_ATTR_IP_MIN:
6137 memcpy(&nat_action_info.min_addr,
6138 nl_attr_get(b_nest),
6139 nl_attr_get_size(b_nest));
6140 ip_min_specified = true;
6141 break;
6142 case OVS_NAT_ATTR_IP_MAX:
6143 memcpy(&nat_action_info.max_addr,
6144 nl_attr_get(b_nest),
6145 nl_attr_get_size(b_nest));
6146 ip_max_specified = true;
6147 break;
6148 case OVS_NAT_ATTR_PROTO_MIN:
6149 nat_action_info.min_port =
6150 nl_attr_get_u16(b_nest);
6151 proto_num_min_specified = true;
6152 break;
6153 case OVS_NAT_ATTR_PROTO_MAX:
6154 nat_action_info.max_port =
6155 nl_attr_get_u16(b_nest);
6156 proto_num_max_specified = true;
6157 break;
6158 case OVS_NAT_ATTR_PERSISTENT:
6159 case OVS_NAT_ATTR_PROTO_HASH:
6160 case OVS_NAT_ATTR_PROTO_RANDOM:
6161 break;
6162 case OVS_NAT_ATTR_UNSPEC:
6163 case __OVS_NAT_ATTR_MAX:
6164 OVS_NOT_REACHED();
6165 }
6166 }
6167
6168 if (ip_min_specified && !ip_max_specified) {
6169 nat_action_info.max_addr = nat_action_info.min_addr;
6170 }
6171 if (proto_num_min_specified && !proto_num_max_specified) {
6172 nat_action_info.max_port = nat_action_info.min_port;
6173 }
6174 if (proto_num_min_specified || proto_num_max_specified) {
6175 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
6176 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
6177 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
6178 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
6179 }
6180 }
6181 break;
6182 }
6183 case OVS_CT_ATTR_UNSPEC:
6184 case __OVS_CT_ATTR_MAX:
6185 OVS_NOT_REACHED();
6186 }
6187 }
6188
6189 /* We won't be able to function properly in this case, hence
6190 * complain loudly. */
6191 if (nat_config && !commit) {
6192 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
6193 VLOG_WARN_RL(&rl, "NAT specified without commit.");
6194 }
6195
6196 conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
6197 commit, zone, setmark, setlabel, aux->flow->tp_src,
6198 aux->flow->tp_dst, helper, nat_action_info_ref,
6199 pmd->ctx.now / 1000);
6200 break;
6201 }
6202
6203 case OVS_ACTION_ATTR_METER:
6204 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
6205 pmd->ctx.now);
6206 break;
6207
6208 case OVS_ACTION_ATTR_PUSH_VLAN:
6209 case OVS_ACTION_ATTR_POP_VLAN:
6210 case OVS_ACTION_ATTR_PUSH_MPLS:
6211 case OVS_ACTION_ATTR_POP_MPLS:
6212 case OVS_ACTION_ATTR_SET:
6213 case OVS_ACTION_ATTR_SET_MASKED:
6214 case OVS_ACTION_ATTR_SAMPLE:
6215 case OVS_ACTION_ATTR_HASH:
6216 case OVS_ACTION_ATTR_UNSPEC:
6217 case OVS_ACTION_ATTR_TRUNC:
6218 case OVS_ACTION_ATTR_PUSH_ETH:
6219 case OVS_ACTION_ATTR_POP_ETH:
6220 case OVS_ACTION_ATTR_CLONE:
6221 case OVS_ACTION_ATTR_PUSH_NSH:
6222 case OVS_ACTION_ATTR_POP_NSH:
6223 case OVS_ACTION_ATTR_CT_CLEAR:
6224 case __OVS_ACTION_ATTR_MAX:
6225 OVS_NOT_REACHED();
6226 }
6227
6228 dp_packet_delete_batch(packets_, should_steal);
6229 }
6230
6231 static void
6232 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
6233 struct dp_packet_batch *packets,
6234 bool should_steal, const struct flow *flow,
6235 const struct nlattr *actions, size_t actions_len)
6236 {
6237 struct dp_netdev_execute_aux aux = { pmd, flow };
6238
6239 odp_execute_actions(&aux, packets, should_steal, actions,
6240 actions_len, dp_execute_cb);
6241 }
6242
6243 struct dp_netdev_ct_dump {
6244 struct ct_dpif_dump_state up;
6245 struct conntrack_dump dump;
6246 struct conntrack *ct;
6247 struct dp_netdev *dp;
6248 };
6249
6250 static int
6251 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
6252 const uint16_t *pzone, int *ptot_bkts)
6253 {
6254 struct dp_netdev *dp = get_dp_netdev(dpif);
6255 struct dp_netdev_ct_dump *dump;
6256
6257 dump = xzalloc(sizeof *dump);
6258 dump->dp = dp;
6259 dump->ct = &dp->conntrack;
6260
6261 conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
6262
6263 *dump_ = &dump->up;
6264
6265 return 0;
6266 }
6267
6268 static int
6269 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
6270 struct ct_dpif_dump_state *dump_,
6271 struct ct_dpif_entry *entry)
6272 {
6273 struct dp_netdev_ct_dump *dump;
6274
6275 INIT_CONTAINER(dump, dump_, up);
6276
6277 return conntrack_dump_next(&dump->dump, entry);
6278 }
6279
6280 static int
6281 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
6282 struct ct_dpif_dump_state *dump_)
6283 {
6284 struct dp_netdev_ct_dump *dump;
6285 int err;
6286
6287 INIT_CONTAINER(dump, dump_, up);
6288
6289 err = conntrack_dump_done(&dump->dump);
6290
6291 free(dump);
6292
6293 return err;
6294 }
6295
6296 static int
6297 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
6298 const struct ct_dpif_tuple *tuple)
6299 {
6300 struct dp_netdev *dp = get_dp_netdev(dpif);
6301
6302 if (tuple) {
6303 return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
6304 }
6305 return conntrack_flush(&dp->conntrack, zone);
6306 }
6307
6308 static int
6309 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
6310 {
6311 struct dp_netdev *dp = get_dp_netdev(dpif);
6312
6313 return conntrack_set_maxconns(&dp->conntrack, maxconns);
6314 }
6315
6316 static int
6317 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
6318 {
6319 struct dp_netdev *dp = get_dp_netdev(dpif);
6320
6321 return conntrack_get_maxconns(&dp->conntrack, maxconns);
6322 }
6323
6324 static int
6325 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
6326 {
6327 struct dp_netdev *dp = get_dp_netdev(dpif);
6328
6329 return conntrack_get_nconns(&dp->conntrack, nconns);
6330 }
6331
6332 const struct dpif_class dpif_netdev_class = {
6333 "netdev",
6334 dpif_netdev_init,
6335 dpif_netdev_enumerate,
6336 dpif_netdev_port_open_type,
6337 dpif_netdev_open,
6338 dpif_netdev_close,
6339 dpif_netdev_destroy,
6340 dpif_netdev_run,
6341 dpif_netdev_wait,
6342 dpif_netdev_get_stats,
6343 dpif_netdev_port_add,
6344 dpif_netdev_port_del,
6345 dpif_netdev_port_set_config,
6346 dpif_netdev_port_query_by_number,
6347 dpif_netdev_port_query_by_name,
6348 NULL, /* port_get_pid */
6349 dpif_netdev_port_dump_start,
6350 dpif_netdev_port_dump_next,
6351 dpif_netdev_port_dump_done,
6352 dpif_netdev_port_poll,
6353 dpif_netdev_port_poll_wait,
6354 dpif_netdev_flow_flush,
6355 dpif_netdev_flow_dump_create,
6356 dpif_netdev_flow_dump_destroy,
6357 dpif_netdev_flow_dump_thread_create,
6358 dpif_netdev_flow_dump_thread_destroy,
6359 dpif_netdev_flow_dump_next,
6360 dpif_netdev_operate,
6361 NULL, /* recv_set */
6362 NULL, /* handlers_set */
6363 dpif_netdev_set_config,
6364 dpif_netdev_queue_to_priority,
6365 NULL, /* recv */
6366 NULL, /* recv_wait */
6367 NULL, /* recv_purge */
6368 dpif_netdev_register_dp_purge_cb,
6369 dpif_netdev_register_upcall_cb,
6370 dpif_netdev_enable_upcall,
6371 dpif_netdev_disable_upcall,
6372 dpif_netdev_get_datapath_version,
6373 dpif_netdev_ct_dump_start,
6374 dpif_netdev_ct_dump_next,
6375 dpif_netdev_ct_dump_done,
6376 dpif_netdev_ct_flush,
6377 dpif_netdev_ct_set_maxconns,
6378 dpif_netdev_ct_get_maxconns,
6379 dpif_netdev_ct_get_nconns,
6380 dpif_netdev_meter_get_features,
6381 dpif_netdev_meter_set,
6382 dpif_netdev_meter_get,
6383 dpif_netdev_meter_del,
6384 };
6385
6386 static void
6387 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
6388 const char *argv[], void *aux OVS_UNUSED)
6389 {
6390 struct dp_netdev_port *port;
6391 struct dp_netdev *dp;
6392 odp_port_t port_no;
6393
6394 ovs_mutex_lock(&dp_netdev_mutex);
6395 dp = shash_find_data(&dp_netdevs, argv[1]);
6396 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
6397 ovs_mutex_unlock(&dp_netdev_mutex);
6398 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
6399 return;
6400 }
6401 ovs_refcount_ref(&dp->ref_cnt);
6402 ovs_mutex_unlock(&dp_netdev_mutex);
6403
6404 ovs_mutex_lock(&dp->port_mutex);
6405 if (get_port_by_name(dp, argv[2], &port)) {
6406 unixctl_command_reply_error(conn, "unknown port");
6407 goto exit;
6408 }
6409
6410 port_no = u32_to_odp(atoi(argv[3]));
6411 if (!port_no || port_no == ODPP_NONE) {
6412 unixctl_command_reply_error(conn, "bad port number");
6413 goto exit;
6414 }
6415 if (dp_netdev_lookup_port(dp, port_no)) {
6416 unixctl_command_reply_error(conn, "port number already in use");
6417 goto exit;
6418 }
6419
6420 /* Remove port. */
6421 hmap_remove(&dp->ports, &port->node);
6422 reconfigure_datapath(dp);
6423
6424 /* Reinsert with new port number. */
6425 port->port_no = port_no;
6426 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
6427 reconfigure_datapath(dp);
6428
6429 seq_change(dp->port_seq);
6430 unixctl_command_reply(conn, NULL);
6431
6432 exit:
6433 ovs_mutex_unlock(&dp->port_mutex);
6434 dp_netdev_unref(dp);
6435 }
6436
6437 static void
6438 dpif_dummy_register__(const char *type)
6439 {
6440 struct dpif_class *class;
6441
6442 class = xmalloc(sizeof *class);
6443 *class = dpif_netdev_class;
6444 class->type = xstrdup(type);
6445 dp_register_provider(class);
6446 }
6447
6448 static void
6449 dpif_dummy_override(const char *type)
6450 {
6451 int error;
6452
6453 /*
6454 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
6455      * a userland-only build. It's useful for the testsuite.
6456 */
6457 error = dp_unregister_provider(type);
6458 if (error == 0 || error == EAFNOSUPPORT) {
6459 dpif_dummy_register__(type);
6460 }
6461 }
6462
6463 void
6464 dpif_dummy_register(enum dummy_level level)
6465 {
6466 if (level == DUMMY_OVERRIDE_ALL) {
6467 struct sset types;
6468 const char *type;
6469
6470 sset_init(&types);
6471 dp_enumerate_types(&types);
6472 SSET_FOR_EACH (type, &types) {
6473 dpif_dummy_override(type);
6474 }
6475 sset_destroy(&types);
6476 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
6477 dpif_dummy_override("system");
6478 }
6479
6480 dpif_dummy_register__("dummy");
6481
6482 unixctl_command_register("dpif-dummy/change-port-number",
6483 "dp port new-number",
6484 3, 3, dpif_dummy_change_port_number, NULL);
6485 }
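/* The command registered above is invoked as, e.g.,
 * "ovs-appctl dpif-dummy/change-port-number <dp> <port> <new-number>" and
 * only works on dummy datapaths, which makes it suitable for the testsuite. */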
6486 \f
6487 /* Datapath Classifier. */
6488
6489 /* A set of rules that all have the same fields wildcarded. */
6490 struct dpcls_subtable {
6491 /* The fields are only used by writers. */
6492 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
6493
6494 /* These fields are accessed by readers. */
6495 struct cmap rules; /* Contains "struct dpcls_rule"s. */
6496 uint32_t hit_cnt; /* Number of match hits in subtable in current
6497 optimization interval. */
6498 struct netdev_flow_key mask; /* Wildcards for fields (const). */
6499 /* 'mask' must be the last field, additional space is allocated here. */
6500 };
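/* For instance, all rules that match only on the IPv4 source address share
 * one subtable, while rules that match on both the Ethernet type and the UDP
 * destination port share another: a rule lands in the subtable whose mask
 * covers exactly the fields it matches on, which is what allows a single
 * hash lookup per subtable in dpcls_lookup(). */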
6501
6502 /* Initializes 'cls' as a classifier that initially contains no classification
6503 * rules. */
6504 static void
6505 dpcls_init(struct dpcls *cls)
6506 {
6507 cmap_init(&cls->subtables_map);
6508 pvector_init(&cls->subtables);
6509 }
6510
6511 static void
6512 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
6513 {
6514 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
6515 pvector_remove(&cls->subtables, subtable);
6516 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
6517 subtable->mask.hash);
6518 cmap_destroy(&subtable->rules);
6519 ovsrcu_postpone(free, subtable);
6520 }
6521
6522 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
6523 * caller's responsibility.
6524 * May only be called after all the readers have been terminated. */
6525 static void
6526 dpcls_destroy(struct dpcls *cls)
6527 {
6528 if (cls) {
6529 struct dpcls_subtable *subtable;
6530
6531 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
6532 ovs_assert(cmap_count(&subtable->rules) == 0);
6533 dpcls_destroy_subtable(cls, subtable);
6534 }
6535 cmap_destroy(&cls->subtables_map);
6536 pvector_destroy(&cls->subtables);
6537 }
6538 }
6539
6540 static struct dpcls_subtable *
6541 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
6542 {
6543 struct dpcls_subtable *subtable;
6544
6545 /* Need to add one. */
6546 subtable = xmalloc(sizeof *subtable
6547 - sizeof subtable->mask.mf + mask->len);
6548 cmap_init(&subtable->rules);
6549 subtable->hit_cnt = 0;
6550 netdev_flow_key_clone(&subtable->mask, mask);
6551 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
6552 /* Add the new subtable at the end of the pvector (with no hits yet) */
6553 pvector_insert(&cls->subtables, subtable, 0);
6554 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
6555 cmap_count(&cls->subtables_map), subtable, cls->in_port);
6556 pvector_publish(&cls->subtables);
6557
6558 return subtable;
6559 }
6560
6561 static inline struct dpcls_subtable *
6562 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
6563 {
6564 struct dpcls_subtable *subtable;
6565
6566 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
6567 &cls->subtables_map) {
6568 if (netdev_flow_key_equal(&subtable->mask, mask)) {
6569 return subtable;
6570 }
6571 }
6572 return dpcls_create_subtable(cls, mask);
6573 }
6574
6575
6576 /* Periodically sort the dpcls subtable vectors according to hit counts */
6577 static void
6578 dpcls_sort_subtable_vector(struct dpcls *cls)
6579 {
6580 struct pvector *pvec = &cls->subtables;
6581 struct dpcls_subtable *subtable;
6582
6583 PVECTOR_FOR_EACH (subtable, pvec) {
6584 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
6585 subtable->hit_cnt = 0;
6586 }
6587 pvector_publish(pvec);
6588 }
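/* Keeping the pvector ordered by hit count means that dpcls_lookup(), which
 * walks the subtables in priority order, probes the most frequently matching
 * subtable first, so on typical traffic most keys are resolved after a
 * single subtable lookup. */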
6589
6590 static inline void
6591 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
6592 struct polled_queue *poll_list, int poll_cnt)
6593 {
6594 struct dpcls *cls;
6595
6596 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
6597 uint64_t curr_tsc;
6598 /* Get the cycles that were used to process each queue and store. */
6599 for (unsigned i = 0; i < poll_cnt; i++) {
6600 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
6601 RXQ_CYCLES_PROC_CURR);
6602 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
6603 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
6604 0);
6605 }
6606 curr_tsc = cycles_counter_update(&pmd->perf_stats);
6607 if (pmd->intrvl_tsc_prev) {
6608 /* There is a prev timestamp, store a new intrvl cycle count. */
6609 atomic_store_relaxed(&pmd->intrvl_cycles,
6610 curr_tsc - pmd->intrvl_tsc_prev);
6611 }
6612 pmd->intrvl_tsc_prev = curr_tsc;
6613 /* Start new measuring interval */
6614 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6615 }
6616
6617 if (pmd->ctx.now > pmd->next_optimization) {
6618 /* Try to obtain the flow lock to block out revalidator threads.
6619 * If not possible, just try next time. */
6620 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
6621 /* Optimize each classifier */
6622 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6623 dpcls_sort_subtable_vector(cls);
6624 }
6625 ovs_mutex_unlock(&pmd->flow_mutex);
6626 /* Start new measuring interval */
6627 pmd->next_optimization = pmd->ctx.now
6628 + DPCLS_OPTIMIZATION_INTERVAL;
6629 }
6630 }
6631 }
6632
6633 /* Insert 'rule' into 'cls'. */
6634 static void
6635 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
6636 const struct netdev_flow_key *mask)
6637 {
6638 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
6639
6640 /* Refer to subtable's mask, also for later removal. */
6641 rule->mask = &subtable->mask;
6642 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
6643 }
6644
6645 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
6646 static void
6647 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
6648 {
6649 struct dpcls_subtable *subtable;
6650
6651 ovs_assert(rule->mask);
6652
6653 /* Get subtable from reference in rule->mask. */
6654 INIT_CONTAINER(subtable, rule->mask, mask);
6655 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
6656 == 0) {
6657 /* Delete empty subtable. */
6658 dpcls_destroy_subtable(cls, subtable);
6659 pvector_publish(&cls->subtables);
6660 }
6661 }
6662
6663 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in
6664  * the rule's mask the values in the rule's flow and 'target' are the same. */
6665 static inline bool
6666 dpcls_rule_matches_key(const struct dpcls_rule *rule,
6667 const struct netdev_flow_key *target)
6668 {
6669 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
6670 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
6671 uint64_t value;
6672
6673 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
6674 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
6675 return false;
6676 }
6677 }
6678 return true;
6679 }
6680
6681 /* For each miniflow in 'keys' performs a classifier lookup writing the result
6682 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
6683 * NULL it is skipped.
6684 *
6685 * This function is optimized for use in the userspace datapath and therefore
6686 * does not implement a lot of features available in the standard
6687 * classifier_lookup() function. Specifically, it does not implement
6688 * priorities, instead returning any rule which matches the flow.
6689 *
6690 * Returns true if all miniflows found a corresponding rule. */
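/* A sketch of the bookkeeping: with a batch of 4 keys, 'keys_map' starts as
 * 0b1111.  If the first subtable matches keys 0 and 2, 'found_map' comes back
 * as 0b0101 (after discarding hash-collision false positives), rules[0] and
 * rules[2] are filled in, and 'keys_map' shrinks to 0b1010 so that only keys
 * 1 and 3 are hashed and looked up in the next subtable. */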
6691 static bool
6692 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
6693 struct dpcls_rule **rules, const size_t cnt,
6694 int *num_lookups_p)
6695 {
6696 /* The received 'cnt' miniflows are the search-keys that will be processed
6697      * to find a matching entry in the available subtables.
6698 * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
6699 typedef uint32_t map_type;
6700 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
6701 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
6702
6703 struct dpcls_subtable *subtable;
6704
6705 map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
6706 map_type found_map;
6707 uint32_t hashes[MAP_BITS];
6708 const struct cmap_node *nodes[MAP_BITS];
6709
6710 if (cnt != MAP_BITS) {
6711 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
6712 }
6713 memset(rules, 0, cnt * sizeof *rules);
6714
6715 int lookups_match = 0, subtable_pos = 1;
6716
6717 /* The Datapath classifier - aka dpcls - is composed of subtables.
6718 * Subtables are dynamically created as needed when new rules are inserted.
6719 * Each subtable collects rules with matches on a specific subset of packet
6720 * fields as defined by the subtable's mask. We proceed to process every
6721 * search-key against each subtable, but when a match is found for a
6722 * search-key, the search for that key can stop because the rules are
6723 * non-overlapping. */
6724 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
6725 int i;
6726
6727 /* Compute hashes for the remaining keys. Each search-key is
6728 * masked with the subtable's mask to avoid hashing the wildcarded
6729 * bits. */
6730 ULLONG_FOR_EACH_1(i, keys_map) {
6731 hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
6732 &subtable->mask);
6733 }
6734 /* Lookup. */
6735 found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
6736 /* Check results. When the i-th bit of found_map is set, it means
6737 * that a set of nodes with a matching hash value was found for the
6738 * i-th search-key. Due to possible hash collisions we need to check
6739 * which of the found rules, if any, really matches our masked
6740 * search-key. */
6741 ULLONG_FOR_EACH_1(i, found_map) {
6742 struct dpcls_rule *rule;
6743
6744 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
6745 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
6746 rules[i] = rule;
6747 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
6748                      * within a one-second optimization interval. */
6749 subtable->hit_cnt++;
6750 lookups_match += subtable_pos;
6751 goto next;
6752 }
6753 }
6754 /* None of the found rules was a match. Reset the i-th bit to
6755 * keep searching this key in the next subtable. */
6756 ULLONG_SET0(found_map, i); /* Did not match. */
6757 next:
6758 ; /* Keep Sparse happy. */
6759 }
6760 keys_map &= ~found_map; /* Clear the found rules. */
6761 if (!keys_map) {
6762 if (num_lookups_p) {
6763 *num_lookups_p = lookups_match;
6764 }
6765 return true; /* All found. */
6766 }
6767 subtable_pos++;
6768 }
6769 if (num_lookups_p) {
6770 *num_lookups_p = lookups_match;
6771 }
6772 return false; /* Some misses. */
6773 }