1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19 #include "dpif-netdev-private.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <net/if.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/ioctl.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35
36 #include "bitmap.h"
37 #include "cmap.h"
38 #include "conntrack.h"
39 #include "coverage.h"
40 #include "ct-dpif.h"
41 #include "csum.h"
42 #include "dp-packet.h"
43 #include "dpif.h"
44 #include "dpif-netdev-perf.h"
45 #include "dpif-provider.h"
46 #include "dummy.h"
47 #include "fat-rwlock.h"
48 #include "flow.h"
49 #include "hmapx.h"
50 #include "id-pool.h"
51 #include "ipf.h"
52 #include "netdev.h"
53 #include "netdev-offload.h"
54 #include "netdev-provider.h"
55 #include "netdev-vport.h"
56 #include "netlink.h"
57 #include "odp-execute.h"
58 #include "odp-util.h"
59 #include "openvswitch/dynamic-string.h"
60 #include "openvswitch/list.h"
61 #include "openvswitch/match.h"
62 #include "openvswitch/ofp-parse.h"
63 #include "openvswitch/ofp-print.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openvswitch/shash.h"
66 #include "openvswitch/vlog.h"
67 #include "ovs-numa.h"
68 #include "ovs-rcu.h"
69 #include "packets.h"
70 #include "openvswitch/poll-loop.h"
71 #include "pvector.h"
72 #include "random.h"
73 #include "seq.h"
74 #include "smap.h"
75 #include "sset.h"
76 #include "timeval.h"
77 #include "tnl-neigh-cache.h"
78 #include "tnl-ports.h"
79 #include "unixctl.h"
80 #include "util.h"
81 #include "uuid.h"
82
83 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
84
85 /* Auto Load Balancing Defaults */
86 #define ALB_ACCEPTABLE_IMPROVEMENT 25
87 #define ALB_PMD_LOAD_THRESHOLD 95
88 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
89 #define MIN_TO_MSEC 60000
90
91 #define FLOW_DUMP_MAX_BATCH 50
92 /* Use per thread recirc_depth to prevent recirculation loop. */
93 #define MAX_RECIRC_DEPTH 6
94 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
95
96 /* Use instant packet send by default. */
97 #define DEFAULT_TX_FLUSH_INTERVAL 0
98
99 /* Configuration parameters. */
100 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
101 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
102 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
103 enum { N_METER_LOCKS = 64 }; /* Number of meter locks. */
104
105 /* Protects against changes to 'dp_netdevs'. */
106 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
107
108 /* Contains all 'struct dp_netdev's. */
109 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
110 = SHASH_INITIALIZER(&dp_netdevs);
111
112 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
113
114 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
115 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
116 | CS_SRC_NAT | CS_DST_NAT)
117 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
118
119 static struct odp_support dp_netdev_support = {
120 .max_vlan_headers = SIZE_MAX,
121 .max_mpls_depth = SIZE_MAX,
122 .recirc = true,
123 .ct_state = true,
124 .ct_zone = true,
125 .ct_mark = true,
126 .ct_label = true,
127 .ct_state_nat = true,
128 .ct_orig_tuple = true,
129 .ct_orig_tuple6 = true,
130 };
131
132 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
133 *
134 * Exact match cache for frequently used flows
135 *
136 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
137 * search its entries for a miniflow that matches exactly the miniflow of the
138 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
139 *
140 * A cache entry holds a reference to its 'dp_netdev_flow'.
141 *
142 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
143 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
144 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
145 * value is the index of a cache entry where the miniflow could be.
146 *
147 *
148 * Signature match cache (SMC)
149 *
150 * This cache stores a 16-bit signature for each flow without storing keys,
151 * plus the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
152 * Each flow thus occupies 32 bits, which is much more memory efficient than EMC.
153 * SMC uses a set-associative design in which each bucket contains
154 * SMC_ENTRY_PER_BUCKET entries.
155 * Since a 16-bit flow_table index is used, if there are more than 2^16
156 * dp_netdev_flows, SMC will miss the flows that cannot be indexed by a 16-bit value.
157 *
158 *
159 * Thread-safety
160 * =============
161 *
162 * Each pmd_thread has its own private exact match cache.
163 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
164 */
165
166 #define EM_FLOW_HASH_SHIFT 13
167 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
168 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
169 #define EM_FLOW_HASH_SEGS 2
170
171 /* SMC uses a set-associative design. A bucket contains a set of entries that
172 * a flow item can occupy. For now, it uses one hash function rather than the
173 * two used in the EMC design. */
174 #define SMC_ENTRY_PER_BUCKET 4
175 #define SMC_ENTRIES (1u << 20)
176 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
177 #define SMC_MASK (SMC_BUCKET_CNT - 1)
178
179 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
180 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
181 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
182 DEFAULT_EM_FLOW_INSERT_INV_PROB)
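
/* Illustrative sketch only: how a threshold derived from the macros above can
 * gate EMC insertion with probability 1/DEFAULT_EM_FLOW_INSERT_INV_PROB.  The
 * helper name is hypothetical; the datapath keeps the per-iteration threshold
 * in 'pmd->ctx.emc_insert_min' and compares it against a random 32-bit value
 * in the same way. */
static inline bool
emc_insert_allowed_sketch(uint32_t emc_insert_min)
{
    /* A threshold of 0 disables EMC insertion entirely; otherwise a uniform
     * random 32-bit value falls at or below UINT32_MAX / 100 roughly once
     * per hundred draws. */
    return emc_insert_min && random_uint32() <= emc_insert_min;
}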
183
184 struct emc_entry {
185 struct dp_netdev_flow *flow;
186 struct netdev_flow_key key; /* key.hash used for emc hash value. */
187 };
188
189 struct emc_cache {
190 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
191 int sweep_idx; /* For emc_cache_slow_sweep(). */
192 };
193
194 struct smc_bucket {
195 uint16_t sig[SMC_ENTRY_PER_BUCKET];
196 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
197 };
198
199 /* Signature match cache, as distinct from the EMC cache. */
200 struct smc_cache {
201 struct smc_bucket buckets[SMC_BUCKET_CNT];
202 };
203
204 struct dfc_cache {
205 struct emc_cache emc_cache;
206 struct smc_cache smc_cache;
207 };
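
/* Illustrative sketch only (simplified from the SMC lookup later in this
 * file): mapping a packet hash to a bucket and scanning its entries for a
 * matching 16-bit signature.  The return value is the 16-bit flow_table
 * index, or UINT16_MAX on a miss; the caller is assumed to resolve the index
 * to a 'dp_netdev_flow' through the pmd's flow table. */
static inline uint16_t
smc_bucket_find_sketch(const struct smc_cache *smc, uint32_t hash)
{
    const struct smc_bucket *bucket = &smc->buckets[hash & SMC_MASK];
    uint16_t sig = hash >> 16;   /* Upper 16 bits serve as the signature. */

    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
        if (bucket->sig[i] == sig) {
            return bucket->flow_idx[i];
        }
    }
    return UINT16_MAX;           /* No matching signature in this bucket. */
}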
208
209 /* Iterate in the exact match cache through every entry that might contain a
210 * miniflow with hash 'HASH'. */
211 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
212 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
213 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
214 i__ < EM_FLOW_HASH_SEGS; \
215 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
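
/* Illustrative sketch only (mirroring the EMC lookup later in this file):
 * probing both candidate entries selected by the macro above.
 * 'netdev_flow_key_equal_mf' is assumed here to compare a stored key against
 * the packet's miniflow; 'emc_entry_alive' is declared further below. */
static inline struct dp_netdev_flow *
emc_lookup_sketch(struct emc_cache *cache, const struct netdev_flow_key *key)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, key->hash) {
        if (current_entry->key.hash == key->hash
            && emc_entry_alive(current_entry)
            && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
            return current_entry->flow;    /* Exact match hit. */
        }
    }
    return NULL;                           /* Miss: fall back to SMC/dpcls. */
}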
216 \f
217 /* Simple non-wildcarding single-priority classifier. */
218
219 /* Time in microseconds between successive optimizations of the dpcls
220 * subtable vector */
221 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
222
223 /* Time in microseconds of the interval in which the rxq processing cycles
224 * used for rxq to pmd assignment are measured and stored. */
225 #define PMD_RXQ_INTERVAL_LEN 10000000LL
226
227 /* Number of intervals for which cycles are stored
228 * and used during rxq to pmd assignment. */
229 #define PMD_RXQ_INTERVAL_MAX 6
230
231 struct dpcls {
232 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
233 odp_port_t in_port;
234 struct cmap subtables_map;
235 struct pvector subtables;
236 };
237
238 /* Data structure to keep packet order until fast-path processing. */
239 struct dp_packet_flow_map {
240 struct dp_packet *packet;
241 struct dp_netdev_flow *flow;
242 uint16_t tcp_flags;
243 };
244
245 static void dpcls_init(struct dpcls *);
246 static void dpcls_destroy(struct dpcls *);
247 static void dpcls_sort_subtable_vector(struct dpcls *);
248 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
249 const struct netdev_flow_key *mask);
250 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
251 static bool dpcls_lookup(struct dpcls *cls,
252 const struct netdev_flow_key *keys[],
253 struct dpcls_rule **rules, size_t cnt,
254 int *num_lookups_p);
255 bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
256 const struct netdev_flow_key *target);
257 /* Set of supported meter flags */
258 #define DP_SUPPORTED_METER_FLAGS_MASK \
259 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
260
261 /* Set of supported meter band types */
262 #define DP_SUPPORTED_METER_BAND_TYPES \
263 ( 1 << OFPMBT13_DROP )
264
265 struct dp_meter_band {
266 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
267 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
268 uint64_t packet_count;
269 uint64_t byte_count;
270 };
271
272 struct dp_meter {
273 uint16_t flags;
274 uint16_t n_bands;
275 uint32_t max_delta_t;
276 uint64_t used;
277 uint64_t packet_count;
278 uint64_t byte_count;
279 struct dp_meter_band bands[];
280 };
281
282 struct pmd_auto_lb {
283 bool auto_lb_requested; /* Auto load balancing requested by user. */
284 bool is_enabled; /* Current status of Auto load balancing. */
285 uint64_t rebalance_intvl;
286 uint64_t rebalance_poll_timer;
287 };
288
289 /* Datapath based on the network device interface from netdev.h.
290 *
291 *
292 * Thread-safety
293 * =============
294 *
295 * Some members, marked 'const', are immutable. Accessing other members
296 * requires synchronization, as noted in more detail below.
297 *
298 * Acquisition order is, from outermost to innermost:
299 *
300 * dp_netdev_mutex (global)
301 * port_mutex
302 * non_pmd_mutex
303 */
304 struct dp_netdev {
305 const struct dpif_class *const class;
306 const char *const name;
307 struct dpif *dpif;
308 struct ovs_refcount ref_cnt;
309 atomic_flag destroyed;
310
311 /* Ports.
312 *
313 * Any lookup into 'ports' or any access to the dp_netdev_ports found
314 * through 'ports' requires taking 'port_mutex'. */
315 struct ovs_mutex port_mutex;
316 struct hmap ports;
317 struct seq *port_seq; /* Incremented whenever a port changes. */
318
319 /* The time that a packet can wait in output batch for sending. */
320 atomic_uint32_t tx_flush_interval;
321
322 /* Meters. */
323 struct ovs_mutex meter_locks[N_METER_LOCKS];
324 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
325
326 /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
327 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
328 /* Enable collection of PMD performance metrics. */
329 atomic_bool pmd_perf_metrics;
330 /* Enable the SMC cache from ovsdb config */
331 atomic_bool smc_enable_db;
332
333 /* Protects access to ofproto-dpif-upcall interface during revalidator
334 * thread synchronization. */
335 struct fat_rwlock upcall_rwlock;
336 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
337 void *upcall_aux;
338
339 /* Callback function for notifying about the purging of dp flows (during
340 * pmd reset or deletion). */
341 dp_purge_callback *dp_purge_cb;
342 void *dp_purge_aux;
343
344 /* Stores all 'struct dp_netdev_pmd_thread's. */
345 struct cmap poll_threads;
346 /* id pool for per thread static_tx_qid. */
347 struct id_pool *tx_qid_pool;
348 struct ovs_mutex tx_qid_pool_mutex;
349 /* Use measured cycles for rxq to pmd assignment. */
350 bool pmd_rxq_assign_cyc;
351
352 /* Protects the access of the 'struct dp_netdev_pmd_thread'
353 * instance for non-pmd thread. */
354 struct ovs_mutex non_pmd_mutex;
355
356 /* Each pmd thread will store its pointer to
357 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
358 ovsthread_key_t per_pmd_key;
359
360 struct seq *reconfigure_seq;
361 uint64_t last_reconfigure_seq;
362
363 /* Cpu mask for pin of pmd threads. */
364 char *pmd_cmask;
365
366 uint64_t last_tnl_conf_seq;
367
368 struct conntrack *conntrack;
369 struct pmd_auto_lb pmd_alb;
370 };
371
372 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
373 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
374 {
375 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
376 }
377
378 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
379 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
380 {
381 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
382 }
383
384
385 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
386 odp_port_t)
387 OVS_REQUIRES(dp->port_mutex);
388
389 enum rxq_cycles_counter_type {
390 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
391 processing packets during the current
392 interval. */
393 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
394 during rxq to pmd assignment. */
395 RXQ_N_CYCLES
396 };
397
398 enum {
399 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
400 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
401 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
402 };
403
404 struct dp_flow_offload_item {
405 struct dp_netdev_pmd_thread *pmd;
406 struct dp_netdev_flow *flow;
407 int op;
408 struct match match;
409 struct nlattr *actions;
410 size_t actions_len;
411
412 struct ovs_list node;
413 };
414
415 struct dp_flow_offload {
416 struct ovs_mutex mutex;
417 struct ovs_list list;
418 pthread_cond_t cond;
419 };
420
421 static struct dp_flow_offload dp_flow_offload = {
422 .mutex = OVS_MUTEX_INITIALIZER,
423 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
424 };
425
426 static struct ovsthread_once offload_thread_once
427 = OVSTHREAD_ONCE_INITIALIZER;
428
429 #define XPS_TIMEOUT 500000LL /* In microseconds. */
430
431 /* Contained by struct dp_netdev_port's 'rxqs' member. */
432 struct dp_netdev_rxq {
433 struct dp_netdev_port *port;
434 struct netdev_rxq *rx;
435 unsigned core_id; /* Core to which this queue should be
436 pinned. OVS_CORE_UNSPEC if the
437 queue doesn't need to be pinned to a
438 particular core. */
439 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
440 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
441 bool is_vhost; /* Is rxq of a vhost port. */
442
443 /* Counters of cycles spent successfully polling and processing pkts. */
444 atomic_ullong cycles[RXQ_N_CYCLES];
445 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
446 sum them to yield the cycles used for an rxq. */
447 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
448 };
449
450 /* A port in a netdev-based datapath. */
451 struct dp_netdev_port {
452 odp_port_t port_no;
453 bool dynamic_txqs; /* If true XPS will be used. */
454 bool need_reconfigure; /* True if we should reconfigure netdev. */
455 struct netdev *netdev;
456 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
457 struct netdev_saved_flags *sf;
458 struct dp_netdev_rxq *rxqs;
459 unsigned n_rxq; /* Number of elements in 'rxqs' */
460 unsigned *txq_used; /* Number of threads that use each tx queue. */
461 struct ovs_mutex txq_used_mutex;
462 bool emc_enabled; /* If true EMC will be used. */
463 char *type; /* Port type as requested by user. */
464 char *rxq_affinity_list; /* Requested affinity of rx queues. */
465 };
466
467 /* Contained by struct dp_netdev_flow's 'stats' member. */
468 struct dp_netdev_flow_stats {
469 atomic_llong used; /* Last used time, in monotonic msecs. */
470 atomic_ullong packet_count; /* Number of packets matched. */
471 atomic_ullong byte_count; /* Number of bytes matched. */
472 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
473 };
474
475 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
476 *
477 *
478 * Thread-safety
479 * =============
480 *
481 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
482 * its pmd thread's classifier. The text below calls this classifier 'cls'.
483 *
484 * Motivation
485 * ----------
486 *
487 * The thread safety rules described here for "struct dp_netdev_flow" are
488 * motivated by two goals:
489 *
490 * - Prevent threads that read members of "struct dp_netdev_flow" from
491 * reading bad data due to changes by some thread concurrently modifying
492 * those members.
493 *
494 * - Prevent two threads making changes to members of a given "struct
495 * dp_netdev_flow" from interfering with each other.
496 *
497 *
498 * Rules
499 * -----
500 *
501 * A flow 'flow' may be accessed without a risk of being freed during an RCU
502 * grace period. Code that needs to hold onto a flow for a while
503 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
504 *
505 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
506 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
507 * from modification.
508 *
509 * Some members, marked 'const', are immutable. Accessing other members
510 * requires synchronization, as noted in more detail below.
511 */
512 struct dp_netdev_flow {
513 const struct flow flow; /* Unmasked flow that created this entry. */
514 /* Hash table index by unmasked flow. */
515 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
516 /* 'flow_table'. */
517 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
518 const ovs_u128 ufid; /* Unique flow identifier. */
519 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
520 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
521 /* flow. */
522
523 /* Number of references.
524 * The classifier owns one reference.
525 * Any thread trying to keep a rule from being freed should hold its own
526 * reference. */
527 struct ovs_refcount ref_cnt;
528
529 bool dead;
530 uint32_t mark; /* Unique flow mark assigned to a flow */
531
532 /* Statistics. */
533 struct dp_netdev_flow_stats stats;
534
535 /* Actions. */
536 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
537
538 /* While processing a group of input packets, the datapath uses the next
539 * member to store a pointer to the output batch for the flow. It is
540 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
541 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
542 struct packet_batch_per_flow *batch;
543
544 /* Packet classification. */
545 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
546 /* 'cr' must be the last member. */
547 };
548
549 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
550 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
551 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
552 struct flow *, bool);
553
554 /* A set of datapath actions within a "struct dp_netdev_flow".
555 *
556 *
557 * Thread-safety
558 * =============
559 *
560 * A struct dp_netdev_actions 'actions' is protected with RCU. */
561 struct dp_netdev_actions {
562 /* These members are immutable: they do not change during the struct's
563 * lifetime. */
564 unsigned int size; /* Size of 'actions', in bytes. */
565 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
566 };
567
568 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
569 size_t);
570 struct dp_netdev_actions *dp_netdev_flow_get_actions(
571 const struct dp_netdev_flow *);
572 static void dp_netdev_actions_free(struct dp_netdev_actions *);
573
574 struct polled_queue {
575 struct dp_netdev_rxq *rxq;
576 odp_port_t port_no;
577 bool emc_enabled;
578 bool rxq_enabled;
579 uint64_t change_seq;
580 };
581
582 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
583 struct rxq_poll {
584 struct dp_netdev_rxq *rxq;
585 struct hmap_node node;
586 };
587
588 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
589 * 'tnl_port_cache' or 'tx_ports'. */
590 struct tx_port {
591 struct dp_netdev_port *port;
592 int qid;
593 long long last_used;
594 struct hmap_node node;
595 long long flush_time;
596 struct dp_packet_batch output_pkts;
597 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
598 };
599
600 /* A set of properties for the current processing loop that is not directly
601 * associated with the pmd thread itself, but with the packets being
602 * processed or the short-term system configuration (for example, time).
603 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
604 struct dp_netdev_pmd_thread_ctx {
605 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
606 long long now;
607 /* RX queue from which last packet was received. */
608 struct dp_netdev_rxq *last_rxq;
609 /* EMC insertion probability context for the current processing cycle. */
610 uint32_t emc_insert_min;
611 };
612
613 /* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
614 * the performance overhead of interrupt processing. Therefore netdev cannot
615 * implement rx-wait for these devices. dpif-netdev needs to poll these
616 * devices to check their receive buffers. A pmd thread does the polling for
617 * devices assigned to it.
618 *
619 * DPDK uses PMDs for accessing NICs.
620 *
621 * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for the
622 * I/O of all non-pmd threads. There will be no actual thread created
623 * for the instance.
624 *
625 * Each struct has its own flow cache and classifier per managed ingress port.
626 * For packets received on an ingress port, a lookup is done in the
627 * corresponding PMD thread's flow cache and, in case of a miss, in the
628 * port's classifier. Packets are executed with the found
629 * actions in either case.
630 */
631 struct dp_netdev_pmd_thread {
632 struct dp_netdev *dp;
633 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
634 struct cmap_node node; /* In 'dp->poll_threads'. */
635
636 /* Per thread exact-match cache. Note, the instance for cpu core
637 * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
638 * needs to be protected by 'non_pmd_mutex'. Every other instance
639 * will only be accessed by its own pmd thread. */
640 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
641
642 /* Flow-Table and classifiers
643 *
644 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
645 * changes to 'classifiers' must be made while still holding the
646 * 'flow_mutex'.
647 */
648 struct ovs_mutex flow_mutex;
649 struct cmap flow_table OVS_GUARDED; /* Flow table. */
650
651 /* One classifier per in_port polled by the pmd */
652 struct cmap classifiers;
653 /* Periodically sort subtable vectors according to hit frequencies */
654 long long int next_optimization;
655 /* End of the next time interval for which processing cycles
656 are stored for each polled rxq. */
657 long long int rxq_next_cycle_store;
658
659 /* Last interval timestamp. */
660 uint64_t intrvl_tsc_prev;
661 /* Last interval cycles. */
662 atomic_ullong intrvl_cycles;
663
664 /* Current context of the PMD thread. */
665 struct dp_netdev_pmd_thread_ctx ctx;
666
667 struct seq *reload_seq;
668 uint64_t last_reload_seq;
669
670 /* These are atomic variables used as a synchronization and configuration
671 * points for thread reload/exit.
672 *
673 * 'reload' atomic is the main one and it's used as a memory
674 * synchronization point for all other knobs and data.
675 *
676 * For a thread that requests PMD reload:
677 *
678 * * All changes that should be visible to the PMD thread must be made
679 * before setting the 'reload'. These changes could use any memory
680 * ordering model including 'relaxed'.
681 * * Setting the 'reload' atomic should occur in the same thread where
682 * all other PMD configuration options are updated.
683 * * Setting the 'reload' atomic should be done with 'release' memory
684 * ordering model or stricter. This will guarantee that all previous
685 * changes (including non-atomic and 'relaxed') will be visible to
686 * the PMD thread.
687 * * To check that the reload is done, the thread should poll the 'reload'
688 * atomic until it becomes 'false'. Polling should be done with 'acquire'
689 * memory ordering model or stricter. This ensures that the PMD thread has
690 * completed the reload process.
691 *
692 * For the PMD thread:
693 *
694 * * PMD thread should read 'reload' atomic with 'acquire' memory
695 * ordering model or stricter. This will guarantee that all changes
696 * made before setting the 'reload' in the requesting thread will be
697 * visible to the PMD thread.
698 * * All other configuration data could be read with any memory
699 * ordering model (including non-atomic and 'relaxed') but *only after*
700 * reading the 'reload' atomic set to 'true'.
701 * * When the PMD reload is done, the PMD should (optionally) set all the
702 * below knobs except the 'reload' to their default ('false') values and
703 * (mandatory), as the last step, set the 'reload' to 'false' using
704 * 'release' memory ordering model or stricter. This will inform the
705 * requesting thread that the PMD has completed a reload cycle. (See the
706 * illustrative sketch after this struct definition.) */
707 atomic_bool reload; /* Do we need to reload ports? */
708 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
709 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
710 atomic_bool exit; /* For terminating the pmd thread. */
711
712 pthread_t thread;
713 unsigned core_id; /* CPU core id of this pmd thread. */
714 int numa_id; /* numa node id of this pmd thread. */
715 bool isolated;
716
717 /* Queue id used by this pmd thread to send packets on all netdevs if
718 * XPS is disabled for this netdev. All static_tx_qid's are unique and less
719 * than 'cmap_count(dp->poll_threads)'. */
720 uint32_t static_tx_qid;
721
722 /* Number of filled output batches. */
723 int n_output_batches;
724
725 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
726 /* List of rx queues to poll. */
727 struct hmap poll_list OVS_GUARDED;
728 /* Map of 'tx_port's used for transmission. Written by the main thread,
729 * read by the pmd thread. */
730 struct hmap tx_ports OVS_GUARDED;
731
732 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
733 * ports (that support push_tunnel/pop_tunnel), the other contains ports
734 * with at least one txq (that support send). A port can be in both.
735 *
736 * There are two separate maps to make sure that we don't try to execute
737 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
738 *
739 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
740 * threads, and thus need to be protected by 'non_pmd_mutex'. Every
741 * other instance will only be accessed by its own pmd thread. */
742 struct hmap tnl_port_cache;
743 struct hmap send_port_cache;
744
745 /* Keep track of detailed PMD performance statistics. */
746 struct pmd_perf_stats perf_stats;
747
748 /* Stats from previous iteration used by automatic pmd
749 * load balance logic. */
750 uint64_t prev_stats[PMD_N_STATS];
751 atomic_count pmd_overloaded;
752
753 /* Set to true if the pmd thread needs to be reloaded. */
754 bool need_reload;
755 };
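
/* Illustrative sketch only of the reload protocol documented inside the
 * struct above.  The real requester additionally bumps 'reload_seq' (see
 * dp_netdev_reload_pmd__() below) and may busy-wait depending on
 * 'wait_for_reload'; the helper names here are hypothetical. */
static inline void
pmd_request_reload_sketch(struct dp_netdev_pmd_thread *pmd)
{
    /* All configuration updates made before this release store become
     * visible to the PMD thread once it observes 'reload' == true. */
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
}

static inline void
pmd_await_reload_sketch(struct dp_netdev_pmd_thread *pmd)
{
    bool reload;

    /* The acquire loads pair with the PMD thread's final release store of
     * 'false', so everything the PMD did during the reload is visible here. */
    do {
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
    } while (reload);
}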
756
757 /* Interface to netdev-based datapath. */
758 struct dpif_netdev {
759 struct dpif dpif;
760 struct dp_netdev *dp;
761 uint64_t last_port_seq;
762 };
763
764 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
765 struct dp_netdev_port **portp)
766 OVS_REQUIRES(dp->port_mutex);
767 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
768 struct dp_netdev_port **portp)
769 OVS_REQUIRES(dp->port_mutex);
770 static void dp_netdev_free(struct dp_netdev *)
771 OVS_REQUIRES(dp_netdev_mutex);
772 static int do_add_port(struct dp_netdev *dp, const char *devname,
773 const char *type, odp_port_t port_no)
774 OVS_REQUIRES(dp->port_mutex);
775 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
776 OVS_REQUIRES(dp->port_mutex);
777 static int dpif_netdev_open(const struct dpif_class *, const char *name,
778 bool create, struct dpif **);
779 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
780 struct dp_packet_batch *,
781 bool should_steal,
782 const struct flow *flow,
783 const struct nlattr *actions,
784 size_t actions_len);
785 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
786 struct dp_packet_batch *, odp_port_t port_no);
787 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
788 struct dp_packet_batch *);
789
790 static void dp_netdev_disable_upcall(struct dp_netdev *);
791 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
792 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
793 struct dp_netdev *dp, unsigned core_id,
794 int numa_id);
795 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
796 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
797 OVS_REQUIRES(dp->port_mutex);
798
799 static void *pmd_thread_main(void *);
800 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
801 unsigned core_id);
802 static struct dp_netdev_pmd_thread *
803 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
804 static void dp_netdev_del_pmd(struct dp_netdev *dp,
805 struct dp_netdev_pmd_thread *pmd);
806 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
807 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
808 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
809 struct dp_netdev_port *port)
810 OVS_REQUIRES(pmd->port_mutex);
811 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
812 struct tx_port *tx)
813 OVS_REQUIRES(pmd->port_mutex);
814 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
815 struct dp_netdev_rxq *rxq)
816 OVS_REQUIRES(pmd->port_mutex);
817 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
818 struct rxq_poll *poll)
819 OVS_REQUIRES(pmd->port_mutex);
820 static int
821 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
822 bool force);
823
824 static void reconfigure_datapath(struct dp_netdev *dp)
825 OVS_REQUIRES(dp->port_mutex);
826 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
827 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
828 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
829 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
830 OVS_REQUIRES(pmd->port_mutex);
831 static inline void
832 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
833 struct polled_queue *poll_list, int poll_cnt);
834 static void
835 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
836 enum rxq_cycles_counter_type type,
837 unsigned long long cycles);
838 static uint64_t
839 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
840 enum rxq_cycles_counter_type type);
841 static void
842 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
843 unsigned long long cycles);
844 static uint64_t
845 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
846 static void
847 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
848 bool purge);
849 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
850 struct tx_port *tx);
851
852 static inline bool emc_entry_alive(struct emc_entry *ce);
853 static void emc_clear_entry(struct emc_entry *ce);
854 static void smc_clear_entry(struct smc_bucket *b, int idx);
855
856 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
857 static inline bool
858 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
859 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
860 struct dp_netdev_flow *flow);
861
862 static void
863 emc_cache_init(struct emc_cache *flow_cache)
864 {
865 int i;
866
867 flow_cache->sweep_idx = 0;
868 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
869 flow_cache->entries[i].flow = NULL;
870 flow_cache->entries[i].key.hash = 0;
871 flow_cache->entries[i].key.len = sizeof(struct miniflow);
872 flowmap_init(&flow_cache->entries[i].key.mf.map);
873 }
874 }
875
876 static void
877 smc_cache_init(struct smc_cache *smc_cache)
878 {
879 int i, j;
880 for (i = 0; i < SMC_BUCKET_CNT; i++) {
881 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
882 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
883 }
884 }
885 }
886
887 static void
888 dfc_cache_init(struct dfc_cache *flow_cache)
889 {
890 emc_cache_init(&flow_cache->emc_cache);
891 smc_cache_init(&flow_cache->smc_cache);
892 }
893
894 static void
895 emc_cache_uninit(struct emc_cache *flow_cache)
896 {
897 int i;
898
899 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
900 emc_clear_entry(&flow_cache->entries[i]);
901 }
902 }
903
904 static void
905 smc_cache_uninit(struct smc_cache *smc)
906 {
907 int i, j;
908
909 for (i = 0; i < SMC_BUCKET_CNT; i++) {
910 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
911 smc_clear_entry(&(smc->buckets[i]), j);
912 }
913 }
914 }
915
916 static void
917 dfc_cache_uninit(struct dfc_cache *flow_cache)
918 {
919 smc_cache_uninit(&flow_cache->smc_cache);
920 emc_cache_uninit(&flow_cache->emc_cache);
921 }
922
923 /* Check and clear dead flow references slowly (one entry at each
924 * invocation). */
925 static void
926 emc_cache_slow_sweep(struct emc_cache *flow_cache)
927 {
928 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
929
930 if (!emc_entry_alive(entry)) {
931 emc_clear_entry(entry);
932 }
933 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
934 }
935
936 /* Updates the time in the PMD thread's context and should be called in three cases:
937 *
938 * 1. PMD structure initialization:
939 * - dp_netdev_configure_pmd()
940 *
941 * 2. Before processing of the new packet batch:
942 * - dpif_netdev_execute()
943 * - dp_netdev_process_rxq_port()
944 *
945 * 3. At least once per polling iteration in main polling threads if no
946 * packets were received in the current iteration:
947 * - dpif_netdev_run()
948 * - pmd_thread_main()
949 *
950 * 'pmd->ctx.now' should be used without update in all other cases if possible.
951 */
952 static inline void
953 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
954 {
955 pmd->ctx.now = time_usec();
956 }
957
958 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
959 bool
960 dpif_is_netdev(const struct dpif *dpif)
961 {
962 return dpif->dpif_class->open == dpif_netdev_open;
963 }
964
965 static struct dpif_netdev *
966 dpif_netdev_cast(const struct dpif *dpif)
967 {
968 ovs_assert(dpif_is_netdev(dpif));
969 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
970 }
971
972 static struct dp_netdev *
973 get_dp_netdev(const struct dpif *dpif)
974 {
975 return dpif_netdev_cast(dpif)->dp;
976 }
977 \f
978 enum pmd_info_type {
979 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
980 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
981 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
982 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
983 };
984
985 static void
986 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
987 {
988 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
989 ? "main thread" : "pmd thread");
990 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
991 ds_put_format(reply, " numa_id %d", pmd->numa_id);
992 }
993 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
994 ds_put_format(reply, " core_id %u", pmd->core_id);
995 }
996 ds_put_cstr(reply, ":\n");
997 }
998
999 static void
1000 pmd_info_show_stats(struct ds *reply,
1001 struct dp_netdev_pmd_thread *pmd)
1002 {
1003 uint64_t stats[PMD_N_STATS];
1004 uint64_t total_cycles, total_packets;
1005 double passes_per_pkt = 0;
1006 double lookups_per_hit = 0;
1007 double packets_per_batch = 0;
1008
1009 pmd_perf_read_counters(&pmd->perf_stats, stats);
1010 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1011 + stats[PMD_CYCLES_ITER_BUSY];
1012 total_packets = stats[PMD_STAT_RECV];
1013
1014 format_pmd_thread(reply, pmd);
1015
1016 if (total_packets > 0) {
1017 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1018 / (double) total_packets;
1019 }
1020 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1021 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1022 / (double) stats[PMD_STAT_MASKED_HIT];
1023 }
1024 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1025 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1026 / (double) stats[PMD_STAT_SENT_BATCHES];
1027 }
1028
1029 ds_put_format(reply,
1030 " packets received: %"PRIu64"\n"
1031 " packet recirculations: %"PRIu64"\n"
1032 " avg. datapath passes per packet: %.02f\n"
1033 " emc hits: %"PRIu64"\n"
1034 " smc hits: %"PRIu64"\n"
1035 " megaflow hits: %"PRIu64"\n"
1036 " avg. subtable lookups per megaflow hit: %.02f\n"
1037 " miss with success upcall: %"PRIu64"\n"
1038 " miss with failed upcall: %"PRIu64"\n"
1039 " avg. packets per output batch: %.02f\n",
1040 total_packets, stats[PMD_STAT_RECIRC],
1041 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1042 stats[PMD_STAT_SMC_HIT],
1043 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1044 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1045 packets_per_batch);
1046
1047 if (total_cycles == 0) {
1048 return;
1049 }
1050
1051 ds_put_format(reply,
1052 " idle cycles: %"PRIu64" (%.02f%%)\n"
1053 " processing cycles: %"PRIu64" (%.02f%%)\n",
1054 stats[PMD_CYCLES_ITER_IDLE],
1055 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1056 stats[PMD_CYCLES_ITER_BUSY],
1057 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1058
1059 if (total_packets == 0) {
1060 return;
1061 }
1062
1063 ds_put_format(reply,
1064 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1065 total_cycles / (double) total_packets,
1066 total_cycles, total_packets);
1067
1068 ds_put_format(reply,
1069 " avg processing cycles per packet: "
1070 "%.02f (%"PRIu64"/%"PRIu64")\n",
1071 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1072 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1073 }
1074
1075 static void
1076 pmd_info_show_perf(struct ds *reply,
1077 struct dp_netdev_pmd_thread *pmd,
1078 struct pmd_perf_params *par)
1079 {
1080 if (pmd->core_id != NON_PMD_CORE_ID) {
1081 char *time_str =
1082 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1083 long long now = time_msec();
1084 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1085
1086 ds_put_cstr(reply, "\n");
1087 ds_put_format(reply, "Time: %s\n", time_str);
1088 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1089 ds_put_cstr(reply, "\n");
1090 format_pmd_thread(reply, pmd);
1091 ds_put_cstr(reply, "\n");
1092 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1093 if (pmd_perf_metrics_enabled(pmd)) {
1094 /* Prevent parallel clearing of perf metrics. */
1095 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1096 if (par->histograms) {
1097 ds_put_cstr(reply, "\n");
1098 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1099 }
1100 if (par->iter_hist_len > 0) {
1101 ds_put_cstr(reply, "\n");
1102 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1103 par->iter_hist_len);
1104 }
1105 if (par->ms_hist_len > 0) {
1106 ds_put_cstr(reply, "\n");
1107 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1108 par->ms_hist_len);
1109 }
1110 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1111 }
1112 free(time_str);
1113 }
1114 }
1115
1116 static int
1117 compare_poll_list(const void *a_, const void *b_)
1118 {
1119 const struct rxq_poll *a = a_;
1120 const struct rxq_poll *b = b_;
1121
1122 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1123 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1124
1125 int cmp = strcmp(namea, nameb);
1126 if (!cmp) {
1127 return netdev_rxq_get_queue_id(a->rxq->rx)
1128 - netdev_rxq_get_queue_id(b->rxq->rx);
1129 } else {
1130 return cmp;
1131 }
1132 }
1133
1134 static void
1135 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1136 size_t *n)
1137 OVS_REQUIRES(pmd->port_mutex)
1138 {
1139 struct rxq_poll *ret, *poll;
1140 size_t i;
1141
1142 *n = hmap_count(&pmd->poll_list);
1143 if (!*n) {
1144 ret = NULL;
1145 } else {
1146 ret = xcalloc(*n, sizeof *ret);
1147 i = 0;
1148 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1149 ret[i] = *poll;
1150 i++;
1151 }
1152 ovs_assert(i == *n);
1153 qsort(ret, *n, sizeof *ret, compare_poll_list);
1154 }
1155
1156 *list = ret;
1157 }
1158
1159 static void
1160 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1161 {
1162 if (pmd->core_id != NON_PMD_CORE_ID) {
1163 struct rxq_poll *list;
1164 size_t n_rxq;
1165 uint64_t total_cycles = 0;
1166
1167 ds_put_format(reply,
1168 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1169 pmd->numa_id, pmd->core_id, (pmd->isolated)
1170 ? "true" : "false");
1171
1172 ovs_mutex_lock(&pmd->port_mutex);
1173 sorted_poll_list(pmd, &list, &n_rxq);
1174
1175 /* Get the total pmd cycles for an interval. */
1176 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1177 /* Estimate the cycles to cover all intervals. */
1178 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1179
1180 for (int i = 0; i < n_rxq; i++) {
1181 struct dp_netdev_rxq *rxq = list[i].rxq;
1182 const char *name = netdev_rxq_get_name(rxq->rx);
1183 uint64_t proc_cycles = 0;
1184
1185 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1186 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1187 }
1188 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1189 netdev_rxq_get_queue_id(list[i].rxq->rx));
1190 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1191 ? "(enabled) " : "(disabled)");
1192 ds_put_format(reply, " pmd usage: ");
1193 if (total_cycles) {
1194 ds_put_format(reply, "%2"PRIu64"",
1195 proc_cycles * 100 / total_cycles);
1196 ds_put_cstr(reply, " %");
1197 } else {
1198 ds_put_format(reply, "%s", "NOT AVAIL");
1199 }
1200 ds_put_cstr(reply, "\n");
1201 }
1202 ovs_mutex_unlock(&pmd->port_mutex);
1203 free(list);
1204 }
1205 }
1206
1207 static int
1208 compare_poll_thread_list(const void *a_, const void *b_)
1209 {
1210 const struct dp_netdev_pmd_thread *a, *b;
1211
1212 a = *(struct dp_netdev_pmd_thread **)a_;
1213 b = *(struct dp_netdev_pmd_thread **)b_;
1214
1215 if (a->core_id < b->core_id) {
1216 return -1;
1217 }
1218 if (a->core_id > b->core_id) {
1219 return 1;
1220 }
1221 return 0;
1222 }
1223
1224 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1225 * this list, as long as we do not go to quiescent state. */
1226 static void
1227 sorted_poll_thread_list(struct dp_netdev *dp,
1228 struct dp_netdev_pmd_thread ***list,
1229 size_t *n)
1230 {
1231 struct dp_netdev_pmd_thread *pmd;
1232 struct dp_netdev_pmd_thread **pmd_list;
1233 size_t k = 0, n_pmds;
1234
1235 n_pmds = cmap_count(&dp->poll_threads);
1236 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1237
1238 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1239 if (k >= n_pmds) {
1240 break;
1241 }
1242 pmd_list[k++] = pmd;
1243 }
1244
1245 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1246
1247 *list = pmd_list;
1248 *n = k;
1249 }
1250
1251 static void
1252 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1253 const char *argv[], void *aux OVS_UNUSED)
1254 {
1255 struct ds reply = DS_EMPTY_INITIALIZER;
1256 struct dp_netdev *dp = NULL;
1257
1258 ovs_mutex_lock(&dp_netdev_mutex);
1259
1260 if (argc == 2) {
1261 dp = shash_find_data(&dp_netdevs, argv[1]);
1262 } else if (shash_count(&dp_netdevs) == 1) {
1263 /* There's only one datapath */
1264 dp = shash_first(&dp_netdevs)->data;
1265 }
1266
1267 if (!dp) {
1268 ovs_mutex_unlock(&dp_netdev_mutex);
1269 unixctl_command_reply_error(conn,
1270 "please specify an existing datapath");
1271 return;
1272 }
1273
1274 dp_netdev_request_reconfigure(dp);
1275 ovs_mutex_unlock(&dp_netdev_mutex);
1276 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1277 unixctl_command_reply(conn, ds_cstr(&reply));
1278 ds_destroy(&reply);
1279 }
1280
1281 static void
1282 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1283 void *aux)
1284 {
1285 struct ds reply = DS_EMPTY_INITIALIZER;
1286 struct dp_netdev_pmd_thread **pmd_list;
1287 struct dp_netdev *dp = NULL;
1288 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1289 unsigned int core_id;
1290 bool filter_on_pmd = false;
1291 size_t n;
1292
1293 ovs_mutex_lock(&dp_netdev_mutex);
1294
1295 while (argc > 1) {
1296 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1297 if (str_to_uint(argv[2], 10, &core_id)) {
1298 filter_on_pmd = true;
1299 }
1300 argc -= 2;
1301 argv += 2;
1302 } else {
1303 dp = shash_find_data(&dp_netdevs, argv[1]);
1304 argc -= 1;
1305 argv += 1;
1306 }
1307 }
1308
1309 if (!dp) {
1310 if (shash_count(&dp_netdevs) == 1) {
1311 /* There's only one datapath */
1312 dp = shash_first(&dp_netdevs)->data;
1313 } else {
1314 ovs_mutex_unlock(&dp_netdev_mutex);
1315 unixctl_command_reply_error(conn,
1316 "please specify an existing datapath");
1317 return;
1318 }
1319 }
1320
1321 sorted_poll_thread_list(dp, &pmd_list, &n);
1322 for (size_t i = 0; i < n; i++) {
1323 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1324 if (!pmd) {
1325 break;
1326 }
1327 if (filter_on_pmd && pmd->core_id != core_id) {
1328 continue;
1329 }
1330 if (type == PMD_INFO_SHOW_RXQ) {
1331 pmd_info_show_rxq(&reply, pmd);
1332 } else if (type == PMD_INFO_CLEAR_STATS) {
1333 pmd_perf_stats_clear(&pmd->perf_stats);
1334 } else if (type == PMD_INFO_SHOW_STATS) {
1335 pmd_info_show_stats(&reply, pmd);
1336 } else if (type == PMD_INFO_PERF_SHOW) {
1337 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1338 }
1339 }
1340 free(pmd_list);
1341
1342 ovs_mutex_unlock(&dp_netdev_mutex);
1343
1344 unixctl_command_reply(conn, ds_cstr(&reply));
1345 ds_destroy(&reply);
1346 }
1347
1348 static void
1349 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1350 const char *argv[],
1351 void *aux OVS_UNUSED)
1352 {
1353 struct pmd_perf_params par;
1354 long int it_hist = 0, ms_hist = 0;
1355 par.histograms = true;
1356
1357 while (argc > 1) {
1358 if (!strcmp(argv[1], "-nh")) {
1359 par.histograms = false;
1360 argc -= 1;
1361 argv += 1;
1362 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1363 it_hist = strtol(argv[2], NULL, 10);
1364 if (it_hist < 0) {
1365 it_hist = 0;
1366 } else if (it_hist > HISTORY_LEN) {
1367 it_hist = HISTORY_LEN;
1368 }
1369 argc -= 2;
1370 argv += 2;
1371 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1372 ms_hist = strtol(argv[2], NULL, 10);
1373 if (ms_hist < 0) {
1374 ms_hist = 0;
1375 } else if (ms_hist > HISTORY_LEN) {
1376 ms_hist = HISTORY_LEN;
1377 }
1378 argc -= 2;
1379 argv += 2;
1380 } else {
1381 break;
1382 }
1383 }
1384 par.iter_hist_len = it_hist;
1385 par.ms_hist_len = ms_hist;
1386 par.command_type = PMD_INFO_PERF_SHOW;
1387 dpif_netdev_pmd_info(conn, argc, argv, &par);
1388 }
1389 \f
1390 static int
1391 dpif_netdev_init(void)
1392 {
1393 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1394 clear_aux = PMD_INFO_CLEAR_STATS,
1395 poll_aux = PMD_INFO_SHOW_RXQ;
1396
1397 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1398 0, 3, dpif_netdev_pmd_info,
1399 (void *)&show_aux);
1400 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1401 0, 3, dpif_netdev_pmd_info,
1402 (void *)&clear_aux);
1403 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1404 0, 3, dpif_netdev_pmd_info,
1405 (void *)&poll_aux);
1406 unixctl_command_register("dpif-netdev/pmd-perf-show",
1407 "[-nh] [-it iter-history-len]"
1408 " [-ms ms-history-len]"
1409 " [-pmd core] [dp]",
1410 0, 8, pmd_perf_show_cmd,
1411 NULL);
1412 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1413 0, 1, dpif_netdev_pmd_rebalance,
1414 NULL);
1415 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1416 "on|off [-b before] [-a after] [-e|-ne] "
1417 "[-us usec] [-q qlen]",
1418 0, 10, pmd_perf_log_set_cmd,
1419 NULL);
1420 return 0;
1421 }
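
/* The commands registered above are exercised through ovs-appctl, e.g.:
 *
 *   ovs-appctl dpif-netdev/pmd-stats-show
 *   ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *   ovs-appctl dpif-netdev/pmd-perf-show -nh -it 20
 *   ovs-appctl dpif-netdev/pmd-rxq-rebalance
 *
 * (Example invocations only; the argument values are arbitrary.) */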
1422
1423 static int
1424 dpif_netdev_enumerate(struct sset *all_dps,
1425 const struct dpif_class *dpif_class)
1426 {
1427 struct shash_node *node;
1428
1429 ovs_mutex_lock(&dp_netdev_mutex);
1430 SHASH_FOR_EACH(node, &dp_netdevs) {
1431 struct dp_netdev *dp = node->data;
1432 if (dpif_class != dp->class) {
1433 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1434 * If the class doesn't match, skip this dpif. */
1435 continue;
1436 }
1437 sset_add(all_dps, node->name);
1438 }
1439 ovs_mutex_unlock(&dp_netdev_mutex);
1440
1441 return 0;
1442 }
1443
1444 static bool
1445 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1446 {
1447 return class != &dpif_netdev_class;
1448 }
1449
1450 static const char *
1451 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1452 {
1453 return strcmp(type, "internal") ? type
1454 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1455 : "tap";
1456 }
1457
1458 static struct dpif *
1459 create_dpif_netdev(struct dp_netdev *dp)
1460 {
1461 uint16_t netflow_id = hash_string(dp->name, 0);
1462 struct dpif_netdev *dpif;
1463
1464 ovs_refcount_ref(&dp->ref_cnt);
1465
1466 dpif = xmalloc(sizeof *dpif);
1467 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1468 dpif->dp = dp;
1469 dpif->last_port_seq = seq_read(dp->port_seq);
1470
1471 return &dpif->dpif;
1472 }
1473
1474 /* Choose an unused, non-zero port number and return it on success.
1475 * Return ODPP_NONE on failure. */
1476 static odp_port_t
1477 choose_port(struct dp_netdev *dp, const char *name)
1478 OVS_REQUIRES(dp->port_mutex)
1479 {
1480 uint32_t port_no;
1481
1482 if (dp->class != &dpif_netdev_class) {
1483 const char *p;
1484 int start_no = 0;
1485
1486 /* If the port name begins with "br", start the number search at
1487 * 100 to make writing tests easier. */
1488 if (!strncmp(name, "br", 2)) {
1489 start_no = 100;
1490 }
1491
1492 /* If the port name contains a number, try to assign that port number.
1493 * This can make writing unit tests easier because port numbers are
1494 * predictable. */
1495 for (p = name; *p != '\0'; p++) {
1496 if (isdigit((unsigned char) *p)) {
1497 port_no = start_no + strtol(p, NULL, 10);
1498 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1499 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1500 return u32_to_odp(port_no);
1501 }
1502 break;
1503 }
1504 }
1505 }
1506
1507 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1508 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1509 return u32_to_odp(port_no);
1510 }
1511 }
1512
1513 return ODPP_NONE;
1514 }
1515
1516 static int
1517 create_dp_netdev(const char *name, const struct dpif_class *class,
1518 struct dp_netdev **dpp)
1519 OVS_REQUIRES(dp_netdev_mutex)
1520 {
1521 struct dp_netdev *dp;
1522 int error;
1523
1524 dp = xzalloc(sizeof *dp);
1525 shash_add(&dp_netdevs, name, dp);
1526
1527 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1528 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1529 ovs_refcount_init(&dp->ref_cnt);
1530 atomic_flag_clear(&dp->destroyed);
1531
1532 ovs_mutex_init(&dp->port_mutex);
1533 hmap_init(&dp->ports);
1534 dp->port_seq = seq_create();
1535 fat_rwlock_init(&dp->upcall_rwlock);
1536
1537 dp->reconfigure_seq = seq_create();
1538 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1539
1540 for (int i = 0; i < N_METER_LOCKS; ++i) {
1541 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1542 }
1543
1544 /* Disable upcalls by default. */
1545 dp_netdev_disable_upcall(dp);
1546 dp->upcall_aux = NULL;
1547 dp->upcall_cb = NULL;
1548
1549 dp->conntrack = conntrack_init();
1550
1551 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1552 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1553
1554 cmap_init(&dp->poll_threads);
1555 dp->pmd_rxq_assign_cyc = true;
1556
1557 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1558 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1559 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1560
1561 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1562 ovsthread_key_create(&dp->per_pmd_key, NULL);
1563
1564 ovs_mutex_lock(&dp->port_mutex);
1565 /* non-PMD will be created before all other threads and will
1566 * allocate static_tx_qid = 0. */
1567 dp_netdev_set_nonpmd(dp);
1568
1569 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1570 "internal"),
1571 ODPP_LOCAL);
1572 ovs_mutex_unlock(&dp->port_mutex);
1573 if (error) {
1574 dp_netdev_free(dp);
1575 return error;
1576 }
1577
1578 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1579 *dpp = dp;
1580 return 0;
1581 }
1582
1583 static void
1584 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1585 {
1586 seq_change(dp->reconfigure_seq);
1587 }
1588
1589 static bool
1590 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1591 {
1592 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1593 }
1594
1595 static int
1596 dpif_netdev_open(const struct dpif_class *class, const char *name,
1597 bool create, struct dpif **dpifp)
1598 {
1599 struct dp_netdev *dp;
1600 int error;
1601
1602 ovs_mutex_lock(&dp_netdev_mutex);
1603 dp = shash_find_data(&dp_netdevs, name);
1604 if (!dp) {
1605 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1606 } else {
1607 error = (dp->class != class ? EINVAL
1608 : create ? EEXIST
1609 : 0);
1610 }
1611 if (!error) {
1612 *dpifp = create_dpif_netdev(dp);
1613 dp->dpif = *dpifp;
1614 }
1615 ovs_mutex_unlock(&dp_netdev_mutex);
1616
1617 return error;
1618 }
1619
1620 static void
1621 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1622 OVS_NO_THREAD_SAFETY_ANALYSIS
1623 {
1624 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1625 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1626
1627 /* Before freeing a lock we should release it */
1628 fat_rwlock_unlock(&dp->upcall_rwlock);
1629 fat_rwlock_destroy(&dp->upcall_rwlock);
1630 }
1631
1632 static void
1633 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1634 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1635 {
1636 if (dp->meters[meter_id]) {
1637 free(dp->meters[meter_id]);
1638 dp->meters[meter_id] = NULL;
1639 }
1640 }
1641
1642 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1643 * through the 'dp_netdevs' shash while freeing 'dp'. */
1644 static void
1645 dp_netdev_free(struct dp_netdev *dp)
1646 OVS_REQUIRES(dp_netdev_mutex)
1647 {
1648 struct dp_netdev_port *port, *next;
1649
1650 shash_find_and_delete(&dp_netdevs, dp->name);
1651
1652 ovs_mutex_lock(&dp->port_mutex);
1653 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1654 do_del_port(dp, port);
1655 }
1656 ovs_mutex_unlock(&dp->port_mutex);
1657
1658 dp_netdev_destroy_all_pmds(dp, true);
1659 cmap_destroy(&dp->poll_threads);
1660
1661 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1662 id_pool_destroy(dp->tx_qid_pool);
1663
1664 ovs_mutex_destroy(&dp->non_pmd_mutex);
1665 ovsthread_key_delete(dp->per_pmd_key);
1666
1667 conntrack_destroy(dp->conntrack);
1668
1669
1670 seq_destroy(dp->reconfigure_seq);
1671
1672 seq_destroy(dp->port_seq);
1673 hmap_destroy(&dp->ports);
1674 ovs_mutex_destroy(&dp->port_mutex);
1675
1676 /* Upcalls must be disabled at this point */
1677 dp_netdev_destroy_upcall_lock(dp);
1678
1679 int i;
1680
1681 for (i = 0; i < MAX_METERS; ++i) {
1682 meter_lock(dp, i);
1683 dp_delete_meter(dp, i);
1684 meter_unlock(dp, i);
1685 }
1686 for (i = 0; i < N_METER_LOCKS; ++i) {
1687 ovs_mutex_destroy(&dp->meter_locks[i]);
1688 }
1689
1690 free(dp->pmd_cmask);
1691 free(CONST_CAST(char *, dp->name));
1692 free(dp);
1693 }
1694
1695 static void
1696 dp_netdev_unref(struct dp_netdev *dp)
1697 {
1698 if (dp) {
1699 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1700 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1701 ovs_mutex_lock(&dp_netdev_mutex);
1702 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1703 dp_netdev_free(dp);
1704 }
1705 ovs_mutex_unlock(&dp_netdev_mutex);
1706 }
1707 }
1708
1709 static void
1710 dpif_netdev_close(struct dpif *dpif)
1711 {
1712 struct dp_netdev *dp = get_dp_netdev(dpif);
1713
1714 dp_netdev_unref(dp);
1715 free(dpif);
1716 }
1717
1718 static int
1719 dpif_netdev_destroy(struct dpif *dpif)
1720 {
1721 struct dp_netdev *dp = get_dp_netdev(dpif);
1722
1723 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1724 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1725 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1726 OVS_NOT_REACHED();
1727 }
1728 }
1729
1730 return 0;
1731 }
1732
1733 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1734 * load/store semantics. While the increment is not atomic, the load and
1735 * store operations are, making it impossible to read inconsistent values.
1736 *
1737 * This is used to update thread local stats counters. */
1738 static void
1739 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1740 {
1741 unsigned long long tmp;
1742
1743 atomic_read_relaxed(var, &tmp);
1744 tmp += n;
1745 atomic_store_relaxed(var, tmp);
1746 }
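
/* Editor's sketch (not part of the original file): intended use of the helper
 * above for a counter owned by a single PMD thread.  The counter and function
 * names are hypothetical. */
#if 0
static atomic_ullong example_rx_packets = ATOMIC_VAR_INIT(0);

static void
example_account_rx_batch(unsigned long long n_packets)
{
    /* Only the owning thread calls this, so the read-modify-write does not
     * need to be atomic; readers in other threads still see a consistent
     * (possibly slightly stale) value thanks to the relaxed atomic
     * load/store. */
    non_atomic_ullong_add(&example_rx_packets, n_packets);
}
#endif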
1747
1748 static int
1749 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1750 {
1751 struct dp_netdev *dp = get_dp_netdev(dpif);
1752 struct dp_netdev_pmd_thread *pmd;
1753 uint64_t pmd_stats[PMD_N_STATS];
1754
1755 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1756 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1757 stats->n_flows += cmap_count(&pmd->flow_table);
1758 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1759 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1760 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1761 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1762 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1763 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1764 }
1765 stats->n_masks = UINT32_MAX;
1766 stats->n_mask_hit = UINT64_MAX;
1767
1768 return 0;
1769 }
1770
1771 static void
1772 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1773 {
1774 if (pmd->core_id == NON_PMD_CORE_ID) {
1775 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1776 ovs_mutex_lock(&pmd->port_mutex);
1777 pmd_load_cached_ports(pmd);
1778 ovs_mutex_unlock(&pmd->port_mutex);
1779 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1780 return;
1781 }
1782
1783 seq_change(pmd->reload_seq);
1784 atomic_store_explicit(&pmd->reload, true, memory_order_release);
1785 }
1786
1787 static uint32_t
1788 hash_port_no(odp_port_t port_no)
1789 {
1790 return hash_int(odp_to_u32(port_no), 0);
1791 }
1792
1793 static int
1794 port_create(const char *devname, const char *type,
1795 odp_port_t port_no, struct dp_netdev_port **portp)
1796 {
1797 struct netdev_saved_flags *sf;
1798 struct dp_netdev_port *port;
1799 enum netdev_flags flags;
1800 struct netdev *netdev;
1801 int error;
1802
1803 *portp = NULL;
1804
1805 /* Open and validate network device. */
1806 error = netdev_open(devname, type, &netdev);
1807 if (error) {
1808 return error;
1809 }
1810 /* XXX reject non-Ethernet devices */
1811
1812 netdev_get_flags(netdev, &flags);
1813 if (flags & NETDEV_LOOPBACK) {
1814 VLOG_ERR("%s: cannot add a loopback device", devname);
1815 error = EINVAL;
1816 goto out;
1817 }
1818
1819 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1820 if (error) {
1821 VLOG_ERR("%s: cannot set promisc flag", devname);
1822 goto out;
1823 }
1824
1825 port = xzalloc(sizeof *port);
1826 port->port_no = port_no;
1827 port->netdev = netdev;
1828 port->type = xstrdup(type);
1829 port->sf = sf;
1830 port->emc_enabled = true;
1831 port->need_reconfigure = true;
1832 ovs_mutex_init(&port->txq_used_mutex);
1833
1834 *portp = port;
1835
1836 return 0;
1837
1838 out:
1839 netdev_close(netdev);
1840 return error;
1841 }
1842
1843 static int
1844 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1845 odp_port_t port_no)
1846 OVS_REQUIRES(dp->port_mutex)
1847 {
1848 struct dp_netdev_port *port;
1849 int error;
1850
1851 /* Reject devices already in 'dp'. */
1852 if (!get_port_by_name(dp, devname, &port)) {
1853 return EEXIST;
1854 }
1855
1856 error = port_create(devname, type, port_no, &port);
1857 if (error) {
1858 return error;
1859 }
1860
1861 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1862 seq_change(dp->port_seq);
1863
1864 reconfigure_datapath(dp);
1865
1866 return 0;
1867 }
1868
1869 static int
1870 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1871 odp_port_t *port_nop)
1872 {
1873 struct dp_netdev *dp = get_dp_netdev(dpif);
1874 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1875 const char *dpif_port;
1876 odp_port_t port_no;
1877 int error;
1878
1879 ovs_mutex_lock(&dp->port_mutex);
1880 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1881 if (*port_nop != ODPP_NONE) {
1882 port_no = *port_nop;
1883 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1884 } else {
1885 port_no = choose_port(dp, dpif_port);
1886 error = port_no == ODPP_NONE ? EFBIG : 0;
1887 }
1888 if (!error) {
1889 *port_nop = port_no;
1890 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1891 }
1892 ovs_mutex_unlock(&dp->port_mutex);
1893
1894 return error;
1895 }
1896
1897 static int
1898 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1899 {
1900 struct dp_netdev *dp = get_dp_netdev(dpif);
1901 int error;
1902
1903 ovs_mutex_lock(&dp->port_mutex);
1904 if (port_no == ODPP_LOCAL) {
1905 error = EINVAL;
1906 } else {
1907 struct dp_netdev_port *port;
1908
1909 error = get_port_by_number(dp, port_no, &port);
1910 if (!error) {
1911 do_del_port(dp, port);
1912 }
1913 }
1914 ovs_mutex_unlock(&dp->port_mutex);
1915
1916 return error;
1917 }
1918
1919 static bool
1920 is_valid_port_number(odp_port_t port_no)
1921 {
1922 return port_no != ODPP_NONE;
1923 }
1924
1925 static struct dp_netdev_port *
1926 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1927 OVS_REQUIRES(dp->port_mutex)
1928 {
1929 struct dp_netdev_port *port;
1930
1931 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1932 if (port->port_no == port_no) {
1933 return port;
1934 }
1935 }
1936 return NULL;
1937 }
1938
1939 static int
1940 get_port_by_number(struct dp_netdev *dp,
1941 odp_port_t port_no, struct dp_netdev_port **portp)
1942 OVS_REQUIRES(dp->port_mutex)
1943 {
1944 if (!is_valid_port_number(port_no)) {
1945 *portp = NULL;
1946 return EINVAL;
1947 } else {
1948 *portp = dp_netdev_lookup_port(dp, port_no);
1949 return *portp ? 0 : ENODEV;
1950 }
1951 }
1952
1953 static void
1954 port_destroy(struct dp_netdev_port *port)
1955 {
1956 if (!port) {
1957 return;
1958 }
1959
1960 netdev_close(port->netdev);
1961 netdev_restore_flags(port->sf);
1962
1963 for (unsigned i = 0; i < port->n_rxq; i++) {
1964 netdev_rxq_close(port->rxqs[i].rx);
1965 }
1966 ovs_mutex_destroy(&port->txq_used_mutex);
1967 free(port->rxq_affinity_list);
1968 free(port->txq_used);
1969 free(port->rxqs);
1970 free(port->type);
1971 free(port);
1972 }
1973
1974 static int
1975 get_port_by_name(struct dp_netdev *dp,
1976 const char *devname, struct dp_netdev_port **portp)
1977 OVS_REQUIRES(dp->port_mutex)
1978 {
1979 struct dp_netdev_port *port;
1980
1981 HMAP_FOR_EACH (port, node, &dp->ports) {
1982 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1983 *portp = port;
1984 return 0;
1985 }
1986 }
1987
1988 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
1989 * non-existent port. */
1990 return ENODEV;
1991 }
1992
1993 /* Returns 'true' if there is a port with pmd netdev. */
1994 static bool
1995 has_pmd_port(struct dp_netdev *dp)
1996 OVS_REQUIRES(dp->port_mutex)
1997 {
1998 struct dp_netdev_port *port;
1999
2000 HMAP_FOR_EACH (port, node, &dp->ports) {
2001 if (netdev_is_pmd(port->netdev)) {
2002 return true;
2003 }
2004 }
2005
2006 return false;
2007 }
2008
2009 static void
2010 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2011 OVS_REQUIRES(dp->port_mutex)
2012 {
2013 hmap_remove(&dp->ports, &port->node);
2014 seq_change(dp->port_seq);
2015
2016 reconfigure_datapath(dp);
2017
2018 port_destroy(port);
2019 }
2020
2021 static void
2022 answer_port_query(const struct dp_netdev_port *port,
2023 struct dpif_port *dpif_port)
2024 {
2025 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2026 dpif_port->type = xstrdup(port->type);
2027 dpif_port->port_no = port->port_no;
2028 }
2029
2030 static int
2031 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2032 struct dpif_port *dpif_port)
2033 {
2034 struct dp_netdev *dp = get_dp_netdev(dpif);
2035 struct dp_netdev_port *port;
2036 int error;
2037
2038 ovs_mutex_lock(&dp->port_mutex);
2039 error = get_port_by_number(dp, port_no, &port);
2040 if (!error && dpif_port) {
2041 answer_port_query(port, dpif_port);
2042 }
2043 ovs_mutex_unlock(&dp->port_mutex);
2044
2045 return error;
2046 }
2047
2048 static int
2049 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2050 struct dpif_port *dpif_port)
2051 {
2052 struct dp_netdev *dp = get_dp_netdev(dpif);
2053 struct dp_netdev_port *port;
2054 int error;
2055
2056 ovs_mutex_lock(&dp->port_mutex);
2057 error = get_port_by_name(dp, devname, &port);
2058 if (!error && dpif_port) {
2059 answer_port_query(port, dpif_port);
2060 }
2061 ovs_mutex_unlock(&dp->port_mutex);
2062
2063 return error;
2064 }
2065
2066 static void
2067 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2068 {
2069 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2070 free(flow);
2071 }
2072
2073 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2074 {
2075 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2076 ovsrcu_postpone(dp_netdev_flow_free, flow);
2077 }
2078 }
2079
2080 static uint32_t
2081 dp_netdev_flow_hash(const ovs_u128 *ufid)
2082 {
2083 return ufid->u32[0];
2084 }
2085
2086 static inline struct dpcls *
2087 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2088 odp_port_t in_port)
2089 {
2090 struct dpcls *cls;
2091 uint32_t hash = hash_port_no(in_port);
2092 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2093 if (cls->in_port == in_port) {
2094 /* Port classifier exists already */
2095 return cls;
2096 }
2097 }
2098 return NULL;
2099 }
2100
2101 static inline struct dpcls *
2102 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2103 odp_port_t in_port)
2104 OVS_REQUIRES(pmd->flow_mutex)
2105 {
2106 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2107 uint32_t hash = hash_port_no(in_port);
2108
2109 if (!cls) {
2110 /* Create new classifier for in_port */
2111 cls = xmalloc(sizeof(*cls));
2112 dpcls_init(cls);
2113 cls->in_port = in_port;
2114 cmap_insert(&pmd->classifiers, &cls->node, hash);
2115 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2116 }
2117 return cls;
2118 }
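
/* Editor's note: dp_netdev_pmd_lookup_dpcls() above is the read-side helper
 * and may return NULL, while dp_netdev_pmd_find_dpcls() is the
 * lookup-or-create variant used on the flow installation path, which is why
 * only the latter requires pmd->flow_mutex. */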
2119
2120 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2121 #define INVALID_FLOW_MARK (UINT32_MAX)
2122
2123 struct megaflow_to_mark_data {
2124 const struct cmap_node node;
2125 ovs_u128 mega_ufid;
2126 uint32_t mark;
2127 };
2128
2129 struct flow_mark {
2130 struct cmap megaflow_to_mark;
2131 struct cmap mark_to_flow;
2132 struct id_pool *pool;
2133 };
2134
2135 static struct flow_mark flow_mark = {
2136 .megaflow_to_mark = CMAP_INITIALIZER,
2137 .mark_to_flow = CMAP_INITIALIZER,
2138 };
2139
2140 static uint32_t
2141 flow_mark_alloc(void)
2142 {
2143 uint32_t mark;
2144
2145 if (!flow_mark.pool) {
2146 /* The pool hasn't been initialized yet; do it here. */
2147 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2148 }
2149
2150 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2151 return mark;
2152 }
2153
2154 return INVALID_FLOW_MARK;
2155 }
2156
2157 static void
2158 flow_mark_free(uint32_t mark)
2159 {
2160 id_pool_free_id(flow_mark.pool, mark);
2161 }
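
/* Editor's sketch (not part of the original file): the expected pairing of
 * the mark-pool helpers above.  Marks are process-wide identifiers handed to
 * the NIC, so an allocated mark must eventually be returned with
 * flow_mark_free() once no flow references it.  The function name is
 * hypothetical. */
#if 0
static void
example_mark_roundtrip(void)
{
    uint32_t mark = flow_mark_alloc();

    if (mark != INVALID_FLOW_MARK) {
        /* ... program 'mark' into hardware and associate it with flows ... */
        flow_mark_free(mark);
    }
}
#endif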
2162
2163 /* Associate a megaflow with a mark; this is a 1:1 mapping. */
2164 static void
2165 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2166 {
2167 size_t hash = dp_netdev_flow_hash(mega_ufid);
2168 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2169
2170 data->mega_ufid = *mega_ufid;
2171 data->mark = mark;
2172
2173 cmap_insert(&flow_mark.megaflow_to_mark,
2174 CONST_CAST(struct cmap_node *, &data->node), hash);
2175 }
2176
2177 /* Disassociate a megaflow from its mark. */
2178 static void
2179 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2180 {
2181 size_t hash = dp_netdev_flow_hash(mega_ufid);
2182 struct megaflow_to_mark_data *data;
2183
2184 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2185 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2186 cmap_remove(&flow_mark.megaflow_to_mark,
2187 CONST_CAST(struct cmap_node *, &data->node), hash);
2188 ovsrcu_postpone(free, data);
2189 return;
2190 }
2191 }
2192
2193 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2194 UUID_ARGS((struct uuid *)mega_ufid));
2195 }
2196
2197 static inline uint32_t
2198 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2199 {
2200 size_t hash = dp_netdev_flow_hash(mega_ufid);
2201 struct megaflow_to_mark_data *data;
2202
2203 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2204 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2205 return data->mark;
2206 }
2207 }
2208
2209 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2210 UUID_ARGS((struct uuid *)mega_ufid));
2211 return INVALID_FLOW_MARK;
2212 }
2213
2214 /* Associate a mark with a flow; this is a 1:N mapping. */
2215 static void
2216 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2217 {
2218 dp_netdev_flow_ref(flow);
2219
2220 cmap_insert(&flow_mark.mark_to_flow,
2221 CONST_CAST(struct cmap_node *, &flow->mark_node),
2222 hash_int(mark, 0));
2223 flow->mark = mark;
2224
2225 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2226 }
2227
2228 static bool
2229 flow_mark_has_no_ref(uint32_t mark)
2230 {
2231 struct dp_netdev_flow *flow;
2232
2233 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2234 &flow_mark.mark_to_flow) {
2235 if (flow->mark == mark) {
2236 return false;
2237 }
2238 }
2239
2240 return true;
2241 }
2242
2243 static int
2244 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2245 struct dp_netdev_flow *flow)
2246 {
2247 int ret = 0;
2248 uint32_t mark = flow->mark;
2249 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2250 &flow->mark_node);
2251
2252 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2253 flow->mark = INVALID_FLOW_MARK;
2254
2255 /*
2256 * If no flow is referencing the mark any more, let's
2257 * remove the flow from hardware and free the mark.
2258 */
2259 if (flow_mark_has_no_ref(mark)) {
2260 struct dp_netdev_port *port;
2261 odp_port_t in_port = flow->flow.in_port.odp_port;
2262
2263 ovs_mutex_lock(&pmd->dp->port_mutex);
2264 port = dp_netdev_lookup_port(pmd->dp, in_port);
2265 if (port) {
2266 ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2267 }
2268 ovs_mutex_unlock(&pmd->dp->port_mutex);
2269
2270 flow_mark_free(mark);
2271 VLOG_DBG("Freed flow mark %u\n", mark);
2272
2273 megaflow_to_mark_disassociate(&flow->mega_ufid);
2274 }
2275 dp_netdev_flow_unref(flow);
2276
2277 return ret;
2278 }
2279
2280 static void
2281 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2282 {
2283 struct dp_netdev_flow *flow;
2284
2285 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2286 if (flow->pmd_id == pmd->core_id) {
2287 queue_netdev_flow_del(pmd, flow);
2288 }
2289 }
2290 }
2291
2292 static struct dp_netdev_flow *
2293 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2294 const uint32_t mark)
2295 {
2296 struct dp_netdev_flow *flow;
2297
2298 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2299 &flow_mark.mark_to_flow) {
2300 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2301 flow->dead == false) {
2302 return flow;
2303 }
2304 }
2305
2306 return NULL;
2307 }
2308
2309 static struct dp_flow_offload_item *
2310 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2311 struct dp_netdev_flow *flow,
2312 int op)
2313 {
2314 struct dp_flow_offload_item *offload;
2315
2316 offload = xzalloc(sizeof(*offload));
2317 offload->pmd = pmd;
2318 offload->flow = flow;
2319 offload->op = op;
2320
2321 dp_netdev_flow_ref(flow);
2322 dp_netdev_pmd_try_ref(pmd);
2323
2324 return offload;
2325 }
2326
2327 static void
2328 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2329 {
2330 dp_netdev_pmd_unref(offload->pmd);
2331 dp_netdev_flow_unref(offload->flow);
2332
2333 free(offload->actions);
2334 free(offload);
2335 }
2336
2337 static void
2338 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2339 {
2340 ovs_mutex_lock(&dp_flow_offload.mutex);
2341 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2342 xpthread_cond_signal(&dp_flow_offload.cond);
2343 ovs_mutex_unlock(&dp_flow_offload.mutex);
2344 }
2345
2346 static int
2347 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2348 {
2349 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2350 }
2351
2352 /*
2353 * There are two flow offload operations here: addition and modification.
2354 *
2355 * For flow addition, this function does:
2356 * - allocate a new flow mark id
2357 * - perform hardware flow offload
2358 * - associate the flow mark with flow and mega flow
2359 *
2360 * For flow modification, both flow mark and the associations are still
2361 * valid, so only the hardware offload step is needed.
2362 */
2363 static int
2364 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2365 {
2366 struct dp_netdev_port *port;
2367 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2368 struct dp_netdev_flow *flow = offload->flow;
2369 odp_port_t in_port = flow->flow.in_port.odp_port;
2370 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2371 struct offload_info info;
2372 uint32_t mark;
2373 int ret;
2374
2375 if (flow->dead) {
2376 return -1;
2377 }
2378
2379 if (modification) {
2380 mark = flow->mark;
2381 ovs_assert(mark != INVALID_FLOW_MARK);
2382 } else {
2383 /*
2384 * If a mega flow has already been offloaded (from other PMD
2385 * instances), do not offload it again.
2386 */
2387 mark = megaflow_to_mark_find(&flow->mega_ufid);
2388 if (mark != INVALID_FLOW_MARK) {
2389 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2390 if (flow->mark != INVALID_FLOW_MARK) {
2391 ovs_assert(flow->mark == mark);
2392 } else {
2393 mark_to_flow_associate(mark, flow);
2394 }
2395 return 0;
2396 }
2397
2398 mark = flow_mark_alloc();
2399 if (mark == INVALID_FLOW_MARK) {
2400 VLOG_ERR("Failed to allocate flow mark!\n");
return -1;    /* Cannot offload without a valid mark. */
2401 }
2402 }
2403 info.flow_mark = mark;
2404
2405 ovs_mutex_lock(&pmd->dp->port_mutex);
2406 port = dp_netdev_lookup_port(pmd->dp, in_port);
2407 if (!port || netdev_vport_is_vport_class(port->netdev->netdev_class)) {
2408 ovs_mutex_unlock(&pmd->dp->port_mutex);
2409 goto err_free;
2410 }
2411 ret = netdev_flow_put(port->netdev, &offload->match,
2412 CONST_CAST(struct nlattr *, offload->actions),
2413 offload->actions_len, &flow->mega_ufid, &info,
2414 NULL);
2415 ovs_mutex_unlock(&pmd->dp->port_mutex);
2416
2417 if (ret) {
2418 goto err_free;
2419 }
2420
2421 if (!modification) {
2422 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2423 mark_to_flow_associate(mark, flow);
2424 }
2425 return 0;
2426
2427 err_free:
2428 if (!modification) {
2429 flow_mark_free(mark);
2430 } else {
2431 mark_to_flow_disassociate(pmd, flow);
2432 }
2433 return -1;
2434 }
2435
2436 static void *
2437 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2438 {
2439 struct dp_flow_offload_item *offload;
2440 struct ovs_list *list;
2441 const char *op;
2442 int ret;
2443
2444 for (;;) {
2445 ovs_mutex_lock(&dp_flow_offload.mutex);
2446 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2447 ovsrcu_quiesce_start();
2448 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2449 &dp_flow_offload.mutex);
2450 ovsrcu_quiesce_end();
2451 }
2452 list = ovs_list_pop_front(&dp_flow_offload.list);
2453 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2454 ovs_mutex_unlock(&dp_flow_offload.mutex);
2455
2456 switch (offload->op) {
2457 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2458 op = "add";
2459 ret = dp_netdev_flow_offload_put(offload);
2460 break;
2461 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2462 op = "modify";
2463 ret = dp_netdev_flow_offload_put(offload);
2464 break;
2465 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2466 op = "delete";
2467 ret = dp_netdev_flow_offload_del(offload);
2468 break;
2469 default:
2470 OVS_NOT_REACHED();
2471 }
2472
2473 VLOG_DBG("%s to %s netdev flow\n",
2474 ret == 0 ? "succeed" : "failed", op);
2475 dp_netdev_free_flow_offload(offload);
2476 }
2477
2478 return NULL;
2479 }
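
/* Editor's sketch (not part of the original file): the offload thread above
 * is a single-consumer work queue built from an ovs_mutex, a condition
 * variable and an ovs_list.  A minimal, generic version of the same pattern,
 * with hypothetical names, looks like this: */
#if 0
struct example_queue {
    struct ovs_mutex mutex;
    pthread_cond_t cond;
    struct ovs_list items;          /* List of queued work items. */
};

static void
example_enqueue(struct example_queue *q, struct ovs_list *node)
{
    ovs_mutex_lock(&q->mutex);
    ovs_list_push_back(&q->items, node);
    xpthread_cond_signal(&q->cond);
    ovs_mutex_unlock(&q->mutex);
}

static struct ovs_list *
example_dequeue(struct example_queue *q)
{
    struct ovs_list *node;

    ovs_mutex_lock(&q->mutex);
    while (ovs_list_is_empty(&q->items)) {
        ovs_mutex_cond_wait(&q->cond, &q->mutex);
    }
    node = ovs_list_pop_front(&q->items);
    ovs_mutex_unlock(&q->mutex);
    return node;
}
#endif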
2480
2481 static void
2482 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2483 struct dp_netdev_flow *flow)
2484 {
2485 struct dp_flow_offload_item *offload;
2486
2487 if (ovsthread_once_start(&offload_thread_once)) {
2488 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2489 ovs_thread_create("dp_netdev_flow_offload",
2490 dp_netdev_flow_offload_main, NULL);
2491 ovsthread_once_done(&offload_thread_once);
2492 }
2493
2494 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2495 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2496 dp_netdev_append_flow_offload(offload);
2497 }
2498
2499 static void
2500 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2501 struct dp_netdev_flow *flow, struct match *match,
2502 const struct nlattr *actions, size_t actions_len)
2503 {
2504 struct dp_flow_offload_item *offload;
2505 int op;
2506
2507 if (!netdev_is_flow_api_enabled()) {
2508 return;
2509 }
2510
2511 if (ovsthread_once_start(&offload_thread_once)) {
2512 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2513 ovs_thread_create("dp_netdev_flow_offload",
2514 dp_netdev_flow_offload_main, NULL);
2515 ovsthread_once_done(&offload_thread_once);
2516 }
2517
2518 if (flow->mark != INVALID_FLOW_MARK) {
2519 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2520 } else {
2521 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2522 }
2523 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2524 offload->match = *match;
2525 offload->actions = xmalloc(actions_len);
2526 memcpy(offload->actions, actions, actions_len);
2527 offload->actions_len = actions_len;
2528
2529 dp_netdev_append_flow_offload(offload);
2530 }
2531
2532 static void
2533 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2534 struct dp_netdev_flow *flow)
2535 OVS_REQUIRES(pmd->flow_mutex)
2536 {
2537 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2538 struct dpcls *cls;
2539 odp_port_t in_port = flow->flow.in_port.odp_port;
2540
2541 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2542 ovs_assert(cls != NULL);
2543 dpcls_remove(cls, &flow->cr);
2544 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2545 if (flow->mark != INVALID_FLOW_MARK) {
2546 queue_netdev_flow_del(pmd, flow);
2547 }
2548 flow->dead = true;
2549
2550 dp_netdev_flow_unref(flow);
2551 }
2552
2553 static void
2554 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2555 {
2556 struct dp_netdev_flow *netdev_flow;
2557
2558 ovs_mutex_lock(&pmd->flow_mutex);
2559 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2560 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2561 }
2562 ovs_mutex_unlock(&pmd->flow_mutex);
2563 }
2564
2565 static int
2566 dpif_netdev_flow_flush(struct dpif *dpif)
2567 {
2568 struct dp_netdev *dp = get_dp_netdev(dpif);
2569 struct dp_netdev_pmd_thread *pmd;
2570
2571 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2572 dp_netdev_pmd_flow_flush(pmd);
2573 }
2574
2575 return 0;
2576 }
2577
2578 struct dp_netdev_port_state {
2579 struct hmap_position position;
2580 char *name;
2581 };
2582
2583 static int
2584 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2585 {
2586 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2587 return 0;
2588 }
2589
2590 static int
2591 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2592 struct dpif_port *dpif_port)
2593 {
2594 struct dp_netdev_port_state *state = state_;
2595 struct dp_netdev *dp = get_dp_netdev(dpif);
2596 struct hmap_node *node;
2597 int retval;
2598
2599 ovs_mutex_lock(&dp->port_mutex);
2600 node = hmap_at_position(&dp->ports, &state->position);
2601 if (node) {
2602 struct dp_netdev_port *port;
2603
2604 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2605
2606 free(state->name);
2607 state->name = xstrdup(netdev_get_name(port->netdev));
2608 dpif_port->name = state->name;
2609 dpif_port->type = port->type;
2610 dpif_port->port_no = port->port_no;
2611
2612 retval = 0;
2613 } else {
2614 retval = EOF;
2615 }
2616 ovs_mutex_unlock(&dp->port_mutex);
2617
2618 return retval;
2619 }
2620
2621 static int
2622 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2623 {
2624 struct dp_netdev_port_state *state = state_;
2625 free(state->name);
2626 free(state);
2627 return 0;
2628 }
2629
2630 static int
2631 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2632 {
2633 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2634 uint64_t new_port_seq;
2635 int error;
2636
2637 new_port_seq = seq_read(dpif->dp->port_seq);
2638 if (dpif->last_port_seq != new_port_seq) {
2639 dpif->last_port_seq = new_port_seq;
2640 error = ENOBUFS;
2641 } else {
2642 error = EAGAIN;
2643 }
2644
2645 return error;
2646 }
2647
2648 static void
2649 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2650 {
2651 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2652
2653 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2654 }
2655
2656 static struct dp_netdev_flow *
2657 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2658 {
2659 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2660 }
2661
2662 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2663 {
2664 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2665 }
2666
2667 /* netdev_flow_key utilities.
2668 *
2669 * netdev_flow_key is basically a miniflow. We use these functions
2670 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2671 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2672 *
2673 * - Since we are dealing exclusively with miniflows created by
2674 * miniflow_extract(), if the map is different the miniflow is different.
2675 * Therefore we can be faster by comparing the map and the miniflow in a
2676 * single memcmp().
2677 * - These functions can be inlined by the compiler. */
2678
2679 /* Given the number of bits set in miniflow's maps, returns the size of the
2680 * 'netdev_flow_key.mf' */
2681 static inline size_t
2682 netdev_flow_key_size(size_t flow_u64s)
2683 {
2684 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2685 }
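
/* Editor's note, a worked example for the size calculation above: a miniflow
 * whose maps have 5 bits set stores 5 inline uint64_t values, so
 * netdev_flow_key_size(5) is sizeof(struct miniflow) plus 5 * 8 = 40 bytes
 * of packed values. */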
2686
2687 static inline bool
2688 netdev_flow_key_equal(const struct netdev_flow_key *a,
2689 const struct netdev_flow_key *b)
2690 {
2691 /* 'b->len' may not be set yet. */
2692 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2693 }
2694
2695 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2696 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2697 * generated by miniflow_extract. */
2698 static inline bool
2699 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2700 const struct miniflow *mf)
2701 {
2702 return !memcmp(&key->mf, mf, key->len);
2703 }
2704
2705 static inline void
2706 netdev_flow_key_clone(struct netdev_flow_key *dst,
2707 const struct netdev_flow_key *src)
2708 {
2709 memcpy(dst, src,
2710 offsetof(struct netdev_flow_key, mf) + src->len);
2711 }
2712
2713 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2714 static inline void
2715 netdev_flow_mask_init(struct netdev_flow_key *mask,
2716 const struct match *match)
2717 {
2718 uint64_t *dst = miniflow_values(&mask->mf);
2719 struct flowmap fmap;
2720 uint32_t hash = 0;
2721 size_t idx;
2722
2723 /* Only check masks that make sense for the flow. */
2724 flow_wc_map(&match->flow, &fmap);
2725 flowmap_init(&mask->mf.map);
2726
2727 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2728 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2729
2730 if (mask_u64) {
2731 flowmap_set(&mask->mf.map, idx, 1);
2732 *dst++ = mask_u64;
2733 hash = hash_add64(hash, mask_u64);
2734 }
2735 }
2736
2737 map_t map;
2738
2739 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2740 hash = hash_add64(hash, map);
2741 }
2742
2743 size_t n = dst - miniflow_get_values(&mask->mf);
2744
2745 mask->hash = hash_finish(hash, n * 8);
2746 mask->len = netdev_flow_key_size(n);
2747 }
2748
2749 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2750 static inline void
2751 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2752 const struct flow *flow,
2753 const struct netdev_flow_key *mask)
2754 {
2755 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2756 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2757 uint32_t hash = 0;
2758 uint64_t value;
2759
2760 dst->len = mask->len;
2761 dst->mf = mask->mf; /* Copy maps. */
2762
2763 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2764 *dst_u64 = value & *mask_u64++;
2765 hash = hash_add64(hash, *dst_u64++);
2766 }
2767 dst->hash = hash_finish(hash,
2768 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2769 }
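
/* Editor's sketch (not part of the original file): the usual pairing of the
 * two helpers above when turning a 'match' into a lookup key, as done later
 * in dpif_netdev_flow_put().  The function name is hypothetical. */
#if 0
static void
example_build_lookup_key(const struct match *match,
                         struct netdev_flow_key *key,
                         struct netdev_flow_key *mask)
{
    netdev_flow_mask_init(mask, match);
    netdev_flow_key_init_masked(key, &match->flow, mask);
}
#endif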
2770
2771 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2772 * 'mask'. */
2773 static inline uint32_t
2774 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2775 const struct netdev_flow_key *mask)
2776 {
2777 const uint64_t *p = miniflow_get_values(&mask->mf);
2778 uint32_t hash = 0;
2779 uint64_t value;
2780
2781 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2782 hash = hash_add64(hash, value & *p++);
2783 }
2784
2785 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2786 }
2787
2788 static inline bool
2789 emc_entry_alive(struct emc_entry *ce)
2790 {
2791 return ce->flow && !ce->flow->dead;
2792 }
2793
2794 static void
2795 emc_clear_entry(struct emc_entry *ce)
2796 {
2797 if (ce->flow) {
2798 dp_netdev_flow_unref(ce->flow);
2799 ce->flow = NULL;
2800 }
2801 }
2802
2803 static inline void
2804 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2805 const struct netdev_flow_key *key)
2806 {
2807 if (ce->flow != flow) {
2808 if (ce->flow) {
2809 dp_netdev_flow_unref(ce->flow);
2810 }
2811
2812 if (dp_netdev_flow_ref(flow)) {
2813 ce->flow = flow;
2814 } else {
2815 ce->flow = NULL;
2816 }
2817 }
2818 if (key) {
2819 netdev_flow_key_clone(&ce->key, key);
2820 }
2821 }
2822
2823 static inline void
2824 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2825 struct dp_netdev_flow *flow)
2826 {
2827 struct emc_entry *to_be_replaced = NULL;
2828 struct emc_entry *current_entry;
2829
2830 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2831 if (netdev_flow_key_equal(&current_entry->key, key)) {
2832 /* We found the entry with the 'mf' miniflow */
2833 emc_change_entry(current_entry, flow, NULL);
2834 return;
2835 }
2836
2837 /* Replacement policy: put the flow in an empty (not alive) entry, or
2838 * in the first entry where it can be placed. */
2839 if (!to_be_replaced
2840 || (emc_entry_alive(to_be_replaced)
2841 && !emc_entry_alive(current_entry))
2842 || current_entry->key.hash < to_be_replaced->key.hash) {
2843 to_be_replaced = current_entry;
2844 }
2845 }
2846 /* We didn't find the miniflow in the cache.
2847 * The 'to_be_replaced' entry is where the new flow will be stored */
2848
2849 emc_change_entry(to_be_replaced, flow, key);
2850 }
2851
2852 static inline void
2853 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2854 const struct netdev_flow_key *key,
2855 struct dp_netdev_flow *flow)
2856 {
2857 /* Insert an entry into the EMC based on probability value 'min'. By
2858 * default the value is UINT32_MAX / 100 which yields an insertion
2859 * probability of 1/100, i.e. 1%. */
2860
2861 uint32_t min = pmd->ctx.emc_insert_min;
2862
2863 if (min && random_uint32() <= min) {
2864 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2865 }
2866 }
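
/* Editor's note, a worked example of the check above: with the default
 * pmd->ctx.emc_insert_min of UINT32_MAX / 100, random_uint32() is uniform
 * over [0, UINT32_MAX], so 'random_uint32() <= min' holds for roughly 1 in
 * 100 packets, i.e. about 1% of eligible flows get an EMC entry.  Setting
 * the minimum to 0 disables EMC insertion entirely because the 'min &&'
 * test short-circuits. */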
2867
2868 static inline struct dp_netdev_flow *
2869 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2870 {
2871 struct emc_entry *current_entry;
2872
2873 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2874 if (current_entry->key.hash == key->hash
2875 && emc_entry_alive(current_entry)
2876 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2877
2878 /* We found the entry with the 'key->mf' miniflow */
2879 return current_entry->flow;
2880 }
2881 }
2882
2883 return NULL;
2884 }
2885
2886 static inline const struct cmap_node *
2887 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2888 {
2889 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2890 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2891 uint16_t sig = hash >> 16;
2892 uint16_t index = UINT16_MAX;
2893
2894 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2895 if (bucket->sig[i] == sig) {
2896 index = bucket->flow_idx[i];
2897 break;
2898 }
2899 }
2900 if (index != UINT16_MAX) {
2901 return cmap_find_by_index(&pmd->flow_table, index);
2902 }
2903 return NULL;
2904 }
2905
2906 static void
2907 smc_clear_entry(struct smc_bucket *b, int idx)
2908 {
2909 b->flow_idx[idx] = UINT16_MAX;
2910 }
2911
2912 /* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2913 * turned off, 2) the flow_table index is larger than uint16_t can handle.
2914 * If there is already an SMC entry having same signature, the index will be
2915 * updated. If there is no existing entry, but an empty entry is available,
2916 * the empty entry will be taken. If no empty entry or existing same signature,
2917 * a random entry from the hashed bucket will be picked. */
2918 static inline void
2919 smc_insert(struct dp_netdev_pmd_thread *pmd,
2920 const struct netdev_flow_key *key,
2921 uint32_t hash)
2922 {
2923 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2924 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2925 uint16_t index;
2926 uint32_t cmap_index;
2927 bool smc_enable_db;
2928 int i;
2929
2930 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2931 if (!smc_enable_db) {
2932 return;
2933 }
2934
2935 cmap_index = cmap_find_index(&pmd->flow_table, hash);
2936 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2937
2938 /* If the index is larger than SMC can handle (uint16_t), we don't
2939 * insert */
2940 if (index == UINT16_MAX) {
2941 return;
2942 }
2943
2944 /* If an entry with same signature already exists, update the index */
2945 uint16_t sig = key->hash >> 16;
2946 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2947 if (bucket->sig[i] == sig) {
2948 bucket->flow_idx[i] = index;
2949 return;
2950 }
2951 }
2952 /* If there is an empty entry, occupy it. */
2953 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2954 if (bucket->flow_idx[i] == UINT16_MAX) {
2955 bucket->sig[i] = sig;
2956 bucket->flow_idx[i] = index;
2957 return;
2958 }
2959 }
2960 /* Otherwise, pick a random entry. */
2961 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2962 bucket->sig[i] = sig;
2963 bucket->flow_idx[i] = index;
2964 }
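
/* Editor's note, a worked example of the hash split used by the SMC above:
 * for a 32-bit flow hash of 0xA1B2C3D4, the bucket is selected by
 * 0xA1B2C3D4 & SMC_MASK (the low-order bits) and the 16-bit signature stored
 * in that bucket is 0xA1B2C3D4 >> 16 == 0xA1B2.  A lookup therefore touches
 * one bucket and compares at most SMC_ENTRY_PER_BUCKET signatures before
 * falling back to the dpcls. */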
2965
2966 static struct dp_netdev_flow *
2967 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2968 const struct netdev_flow_key *key,
2969 int *lookup_num_p)
2970 {
2971 struct dpcls *cls;
2972 struct dpcls_rule *rule;
2973 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2974 in_port.odp_port));
2975 struct dp_netdev_flow *netdev_flow = NULL;
2976
2977 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2978 if (OVS_LIKELY(cls)) {
2979 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
2980 netdev_flow = dp_netdev_flow_cast(rule);
2981 }
2982 return netdev_flow;
2983 }
2984
2985 static struct dp_netdev_flow *
2986 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2987 const ovs_u128 *ufidp, const struct nlattr *key,
2988 size_t key_len)
2989 {
2990 struct dp_netdev_flow *netdev_flow;
2991 struct flow flow;
2992 ovs_u128 ufid;
2993
2994 /* If a UFID is not provided, determine one based on the key. */
2995 if (!ufidp && key && key_len
2996 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2997 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2998 ufidp = &ufid;
2999 }
3000
3001 if (ufidp) {
3002 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3003 &pmd->flow_table) {
3004 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3005 return netdev_flow;
3006 }
3007 }
3008 }
3009
3010 return NULL;
3011 }
3012
3013 static void
3014 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
3015 struct dpif_flow_stats *stats)
3016 {
3017 struct dp_netdev_flow *netdev_flow;
3018 unsigned long long n;
3019 long long used;
3020 uint16_t flags;
3021
3022 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3023
3024 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3025 stats->n_packets = n;
3026 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3027 stats->n_bytes = n;
3028 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3029 stats->used = used;
3030 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3031 stats->tcp_flags = flags;
3032 }
3033
3034 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3035 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3036 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3037 * protect them. */
3038 static void
3039 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
3040 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3041 struct dpif_flow *flow, bool terse)
3042 {
3043 if (terse) {
3044 memset(flow, 0, sizeof *flow);
3045 } else {
3046 struct flow_wildcards wc;
3047 struct dp_netdev_actions *actions;
3048 size_t offset;
3049 struct odp_flow_key_parms odp_parms = {
3050 .flow = &netdev_flow->flow,
3051 .mask = &wc.masks,
3052 .support = dp_netdev_support,
3053 };
3054
3055 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3056 /* in_port is exact-matched, but we have left it out of the mask for
3057 * optimization reasons. Add in_port back to the mask. */
3058 wc.masks.in_port.odp_port = ODPP_NONE;
3059
3060 /* Key */
3061 offset = key_buf->size;
3062 flow->key = ofpbuf_tail(key_buf);
3063 odp_flow_key_from_flow(&odp_parms, key_buf);
3064 flow->key_len = key_buf->size - offset;
3065
3066 /* Mask */
3067 offset = mask_buf->size;
3068 flow->mask = ofpbuf_tail(mask_buf);
3069 odp_parms.key_buf = key_buf;
3070 odp_flow_key_from_mask(&odp_parms, mask_buf);
3071 flow->mask_len = mask_buf->size - offset;
3072
3073 /* Actions */
3074 actions = dp_netdev_flow_get_actions(netdev_flow);
3075 flow->actions = actions->actions;
3076 flow->actions_len = actions->size;
3077 }
3078
3079 flow->ufid = netdev_flow->ufid;
3080 flow->ufid_present = true;
3081 flow->pmd_id = netdev_flow->pmd_id;
3082 get_dpif_flow_stats(netdev_flow, &flow->stats);
3083
3084 flow->attrs.offloaded = false;
3085 flow->attrs.dp_layer = "ovs";
3086 }
3087
3088 static int
3089 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3090 const struct nlattr *mask_key,
3091 uint32_t mask_key_len, const struct flow *flow,
3092 struct flow_wildcards *wc, bool probe)
3093 {
3094 enum odp_key_fitness fitness;
3095
3096 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3097 if (fitness) {
3098 if (!probe) {
3099 /* This should not happen: it indicates that
3100 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3101 * disagree on the acceptable form of a mask. Log the problem
3102 * as an error, with enough details to enable debugging. */
3103 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3104
3105 if (!VLOG_DROP_ERR(&rl)) {
3106 struct ds s;
3107
3108 ds_init(&s);
3109 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3110 true);
3111 VLOG_ERR("internal error parsing flow mask %s (%s)",
3112 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3113 ds_destroy(&s);
3114 }
3115 }
3116
3117 return EINVAL;
3118 }
3119
3120 return 0;
3121 }
3122
3123 static int
3124 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3125 struct flow *flow, bool probe)
3126 {
3127 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3128 if (!probe) {
3129 /* This should not happen: it indicates that
3130 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3131 * the acceptable form of a flow. Log the problem as an error,
3132 * with enough details to enable debugging. */
3133 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3134
3135 if (!VLOG_DROP_ERR(&rl)) {
3136 struct ds s;
3137
3138 ds_init(&s);
3139 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3140 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3141 ds_destroy(&s);
3142 }
3143 }
3144
3145 return EINVAL;
3146 }
3147
3148 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3149 return EINVAL;
3150 }
3151
3152 return 0;
3153 }
3154
3155 static int
3156 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3157 {
3158 struct dp_netdev *dp = get_dp_netdev(dpif);
3159 struct dp_netdev_flow *netdev_flow;
3160 struct dp_netdev_pmd_thread *pmd;
3161 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3162 struct hmapx_node *node;
3163 int error = EINVAL;
3164
3165 if (get->pmd_id == PMD_ID_NULL) {
3166 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3167 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3168 dp_netdev_pmd_unref(pmd);
3169 }
3170 }
3171 } else {
3172 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3173 if (!pmd) {
3174 goto out;
3175 }
3176 hmapx_add(&to_find, pmd);
3177 }
3178
3179 if (!hmapx_count(&to_find)) {
3180 goto out;
3181 }
3182
3183 HMAPX_FOR_EACH (node, &to_find) {
3184 pmd = (struct dp_netdev_pmd_thread *) node->data;
3185 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3186 get->key_len);
3187 if (netdev_flow) {
3188 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
3189 get->flow, false);
3190 error = 0;
3191 break;
3192 } else {
3193 error = ENOENT;
3194 }
3195 }
3196
3197 HMAPX_FOR_EACH (node, &to_find) {
3198 pmd = (struct dp_netdev_pmd_thread *) node->data;
3199 dp_netdev_pmd_unref(pmd);
3200 }
3201 out:
3202 hmapx_destroy(&to_find);
3203 return error;
3204 }
3205
3206 static void
3207 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3208 {
3209 struct flow masked_flow;
3210 size_t i;
3211
3212 for (i = 0; i < sizeof(struct flow); i++) {
3213 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3214 ((uint8_t *)&match->wc)[i];
3215 }
3216 dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
3217 }
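
/* Editor's note, a worked example of the byte-wise masking above: a flow
 * byte of 0x3c under a wildcard-mask byte of 0x0f contributes 0x0c to
 * 'masked_flow', so any two flows that differ only in wildcarded bits end
 * up with the same mega ufid. */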
3218
3219 static struct dp_netdev_flow *
3220 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3221 struct match *match, const ovs_u128 *ufid,
3222 const struct nlattr *actions, size_t actions_len)
3223 OVS_REQUIRES(pmd->flow_mutex)
3224 {
3225 struct dp_netdev_flow *flow;
3226 struct netdev_flow_key mask;
3227 struct dpcls *cls;
3228
3229 /* Make sure in_port is exact matched before we read it. */
3230 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3231 odp_port_t in_port = match->flow.in_port.odp_port;
3232
3233 /* As we select the dpcls based on the port number, each netdev flow
3234 * belonging to the same dpcls will have the same odp_port value.
3235 * For performance reasons we wildcard odp_port here in the mask. In the
3236 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3237 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3238 * will not be part of the subtable mask.
3239 * This will speed up the hash computation during dpcls_lookup() because
3240 * there is one less call to hash_add64() in this case. */
3241 match->wc.masks.in_port.odp_port = 0;
3242 netdev_flow_mask_init(&mask, match);
3243 match->wc.masks.in_port.odp_port = ODPP_NONE;
3244
3245 /* Make sure wc does not have metadata. */
3246 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3247 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3248
3249 /* Do not allocate extra space. */
3250 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3251 memset(&flow->stats, 0, sizeof flow->stats);
3252 flow->dead = false;
3253 flow->batch = NULL;
3254 flow->mark = INVALID_FLOW_MARK;
3255 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3256 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3257 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3258 ovs_refcount_init(&flow->ref_cnt);
3259 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3260
3261 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3262 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3263
3264 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3265 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3266 dpcls_insert(cls, &flow->cr, &mask);
3267
3268 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3269 dp_netdev_flow_hash(&flow->ufid));
3270
3271 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3272
3273 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3274 struct ds ds = DS_EMPTY_INITIALIZER;
3275 struct ofpbuf key_buf, mask_buf;
3276 struct odp_flow_key_parms odp_parms = {
3277 .flow = &match->flow,
3278 .mask = &match->wc.masks,
3279 .support = dp_netdev_support,
3280 };
3281
3282 ofpbuf_init(&key_buf, 0);
3283 ofpbuf_init(&mask_buf, 0);
3284
3285 odp_flow_key_from_flow(&odp_parms, &key_buf);
3286 odp_parms.key_buf = &key_buf;
3287 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3288
3289 ds_put_cstr(&ds, "flow_add: ");
3290 odp_format_ufid(ufid, &ds);
3291 ds_put_cstr(&ds, " ");
3292 odp_flow_format(key_buf.data, key_buf.size,
3293 mask_buf.data, mask_buf.size,
3294 NULL, &ds, false);
3295 ds_put_cstr(&ds, ", actions:");
3296 format_odp_actions(&ds, actions, actions_len, NULL);
3297
3298 VLOG_DBG("%s", ds_cstr(&ds));
3299
3300 ofpbuf_uninit(&key_buf);
3301 ofpbuf_uninit(&mask_buf);
3302
3303 /* Add a printout of the actual match installed. */
3304 struct match m;
3305 ds_clear(&ds);
3306 ds_put_cstr(&ds, "flow match: ");
3307 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3308 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3309 memset(&m.tun_md, 0, sizeof m.tun_md);
3310 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3311
3312 VLOG_DBG("%s", ds_cstr(&ds));
3313
3314 ds_destroy(&ds);
3315 }
3316
3317 return flow;
3318 }
3319
3320 static int
3321 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3322 struct netdev_flow_key *key,
3323 struct match *match,
3324 ovs_u128 *ufid,
3325 const struct dpif_flow_put *put,
3326 struct dpif_flow_stats *stats)
3327 {
3328 struct dp_netdev_flow *netdev_flow;
3329 int error = 0;
3330
3331 if (stats) {
3332 memset(stats, 0, sizeof *stats);
3333 }
3334
3335 ovs_mutex_lock(&pmd->flow_mutex);
3336 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3337 if (!netdev_flow) {
3338 if (put->flags & DPIF_FP_CREATE) {
3339 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
3340 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3341 put->actions_len);
3342 error = 0;
3343 } else {
3344 error = EFBIG;
3345 }
3346 } else {
3347 error = ENOENT;
3348 }
3349 } else {
3350 if (put->flags & DPIF_FP_MODIFY) {
3351 struct dp_netdev_actions *new_actions;
3352 struct dp_netdev_actions *old_actions;
3353
3354 new_actions = dp_netdev_actions_create(put->actions,
3355 put->actions_len);
3356
3357 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3358 ovsrcu_set(&netdev_flow->actions, new_actions);
3359
3360 queue_netdev_flow_put(pmd, netdev_flow, match,
3361 put->actions, put->actions_len);
3362
3363 if (stats) {
3364 get_dpif_flow_stats(netdev_flow, stats);
3365 }
3366 if (put->flags & DPIF_FP_ZERO_STATS) {
3367 /* XXX: The userspace datapath uses thread local statistics
3368 * (for flows), which should be updated only by the owning
3369 * thread. Since we cannot write on stats memory here,
3370 * we choose not to support this flag. Please note:
3371 * - This feature is currently used only by dpctl commands with
3372 * option --clear.
3373 * - Should the need arise, this operation can be implemented
3374 * by keeping a base value (to be updated here) for each
3375 * counter, and subtracting it before outputting the stats. */
3376 error = EOPNOTSUPP;
3377 }
3378
3379 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3380 } else if (put->flags & DPIF_FP_CREATE) {
3381 error = EEXIST;
3382 } else {
3383 /* Overlapping flow. */
3384 error = EINVAL;
3385 }
3386 }
3387 ovs_mutex_unlock(&pmd->flow_mutex);
3388 return error;
3389 }
3390
3391 static int
3392 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3393 {
3394 struct dp_netdev *dp = get_dp_netdev(dpif);
3395 struct netdev_flow_key key, mask;
3396 struct dp_netdev_pmd_thread *pmd;
3397 struct match match;
3398 ovs_u128 ufid;
3399 int error;
3400 bool probe = put->flags & DPIF_FP_PROBE;
3401
3402 if (put->stats) {
3403 memset(put->stats, 0, sizeof *put->stats);
3404 }
3405 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3406 probe);
3407 if (error) {
3408 return error;
3409 }
3410 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3411 put->mask, put->mask_len,
3412 &match.flow, &match.wc, probe);
3413 if (error) {
3414 return error;
3415 }
3416
3417 if (put->ufid) {
3418 ufid = *put->ufid;
3419 } else {
3420 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3421 }
3422
3423 /* The Netlink encoding of datapath flow keys cannot express
3424 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3425 * tag is interpreted as exact match on the fact that there is no
3426 * VLAN. Unless we refactor a lot of code that translates between
3427 * Netlink and struct flow representations, we have to do the same
3428 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3429 if (!match.wc.masks.vlans[0].tci) {
3430 match.wc.masks.vlans[0].tci = htons(0xffff);
3431 }
3432
3433 /* Must produce a netdev_flow_key for lookup.
3434 * Use the same method as employed to create the key when adding
3435 * the flow to the dpcls to make sure they match. */
3436 netdev_flow_mask_init(&mask, &match);
3437 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3438
3439 if (put->pmd_id == PMD_ID_NULL) {
3440 if (cmap_count(&dp->poll_threads) == 0) {
3441 return EINVAL;
3442 }
3443 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3444 struct dpif_flow_stats pmd_stats;
3445 int pmd_error;
3446
3447 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3448 &pmd_stats);
3449 if (pmd_error) {
3450 error = pmd_error;
3451 } else if (put->stats) {
3452 put->stats->n_packets += pmd_stats.n_packets;
3453 put->stats->n_bytes += pmd_stats.n_bytes;
3454 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3455 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3456 }
3457 }
3458 } else {
3459 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3460 if (!pmd) {
3461 return EINVAL;
3462 }
3463 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3464 dp_netdev_pmd_unref(pmd);
3465 }
3466
3467 return error;
3468 }
3469
3470 static int
3471 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3472 struct dpif_flow_stats *stats,
3473 const struct dpif_flow_del *del)
3474 {
3475 struct dp_netdev_flow *netdev_flow;
3476 int error = 0;
3477
3478 ovs_mutex_lock(&pmd->flow_mutex);
3479 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3480 del->key_len);
3481 if (netdev_flow) {
3482 if (stats) {
3483 get_dpif_flow_stats(netdev_flow, stats);
3484 }
3485 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3486 } else {
3487 error = ENOENT;
3488 }
3489 ovs_mutex_unlock(&pmd->flow_mutex);
3490
3491 return error;
3492 }
3493
3494 static int
3495 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3496 {
3497 struct dp_netdev *dp = get_dp_netdev(dpif);
3498 struct dp_netdev_pmd_thread *pmd;
3499 int error = 0;
3500
3501 if (del->stats) {
3502 memset(del->stats, 0, sizeof *del->stats);
3503 }
3504
3505 if (del->pmd_id == PMD_ID_NULL) {
3506 if (cmap_count(&dp->poll_threads) == 0) {
3507 return EINVAL;
3508 }
3509 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3510 struct dpif_flow_stats pmd_stats;
3511 int pmd_error;
3512
3513 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3514 if (pmd_error) {
3515 error = pmd_error;
3516 } else if (del->stats) {
3517 del->stats->n_packets += pmd_stats.n_packets;
3518 del->stats->n_bytes += pmd_stats.n_bytes;
3519 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3520 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3521 }
3522 }
3523 } else {
3524 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3525 if (!pmd) {
3526 return EINVAL;
3527 }
3528 error = flow_del_on_pmd(pmd, del->stats, del);
3529 dp_netdev_pmd_unref(pmd);
3530 }
3531
3532
3533 return error;
3534 }
3535
3536 struct dpif_netdev_flow_dump {
3537 struct dpif_flow_dump up;
3538 struct cmap_position poll_thread_pos;
3539 struct cmap_position flow_pos;
3540 struct dp_netdev_pmd_thread *cur_pmd;
3541 int status;
3542 struct ovs_mutex mutex;
3543 };
3544
3545 static struct dpif_netdev_flow_dump *
3546 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3547 {
3548 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3549 }
3550
3551 static struct dpif_flow_dump *
3552 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3553 struct dpif_flow_dump_types *types OVS_UNUSED)
3554 {
3555 struct dpif_netdev_flow_dump *dump;
3556
3557 dump = xzalloc(sizeof *dump);
3558 dpif_flow_dump_init(&dump->up, dpif_);
3559 dump->up.terse = terse;
3560 ovs_mutex_init(&dump->mutex);
3561
3562 return &dump->up;
3563 }
3564
3565 static int
3566 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3567 {
3568 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3569
3570 ovs_mutex_destroy(&dump->mutex);
3571 free(dump);
3572 return 0;
3573 }
3574
3575 struct dpif_netdev_flow_dump_thread {
3576 struct dpif_flow_dump_thread up;
3577 struct dpif_netdev_flow_dump *dump;
3578 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3579 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3580 };
3581
3582 static struct dpif_netdev_flow_dump_thread *
3583 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3584 {
3585 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3586 }
3587
3588 static struct dpif_flow_dump_thread *
3589 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3590 {
3591 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3592 struct dpif_netdev_flow_dump_thread *thread;
3593
3594 thread = xmalloc(sizeof *thread);
3595 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3596 thread->dump = dump;
3597 return &thread->up;
3598 }
3599
3600 static void
3601 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3602 {
3603 struct dpif_netdev_flow_dump_thread *thread
3604 = dpif_netdev_flow_dump_thread_cast(thread_);
3605
3606 free(thread);
3607 }
3608
3609 static int
3610 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3611 struct dpif_flow *flows, int max_flows)
3612 {
3613 struct dpif_netdev_flow_dump_thread *thread
3614 = dpif_netdev_flow_dump_thread_cast(thread_);
3615 struct dpif_netdev_flow_dump *dump = thread->dump;
3616 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3617 int n_flows = 0;
3618 int i;
3619
3620 ovs_mutex_lock(&dump->mutex);
3621 if (!dump->status) {
3622 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3623 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3624 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3625 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3626
3627 /* On the first call to dump_next(), extract the first pmd thread.
3628 * If there is no pmd thread, return immediately. */
3629 if (!pmd) {
3630 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3631 if (!pmd) {
3632 ovs_mutex_unlock(&dump->mutex);
3633 return n_flows;
3634
3635 }
3636 }
3637
3638 do {
3639 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3640 struct cmap_node *node;
3641
3642 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3643 if (!node) {
3644 break;
3645 }
3646 netdev_flows[n_flows] = CONTAINER_OF(node,
3647 struct dp_netdev_flow,
3648 node);
3649 }
3650 /* When we finish dumping the current pmd thread, move on to
3651 * the next one. */
3652 if (n_flows < flow_limit) {
3653 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3654 dp_netdev_pmd_unref(pmd);
3655 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3656 if (!pmd) {
3657 dump->status = EOF;
3658 break;
3659 }
3660 }
3661             /* Keep the reference for the next caller. */
3662 dump->cur_pmd = pmd;
3663
3664             /* If the current dump is empty, do not exit the loop, since the
3665              * remaining pmds could have flows to be dumped.  Just dump again
3666              * on the new 'pmd'. */
3667 } while (!n_flows);
3668 }
3669 ovs_mutex_unlock(&dump->mutex);
3670
3671 for (i = 0; i < n_flows; i++) {
3672 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3673 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3674 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3675 struct dpif_flow *f = &flows[i];
3676 struct ofpbuf key, mask;
3677
3678 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3679 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3680 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
3681 dump->up.terse);
3682 }
3683
3684 return n_flows;
3685 }
3686
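/* A minimal standalone sketch of the resumable, batched iteration pattern
 * used by dpif_netdev_flow_dump_next() above: walk a set of per-thread
 * tables, copy out up to a batch limit, and remember both the current table
 * and the position inside it so the next call can resume where this one
 * stopped.  The types and helpers below are illustrative placeholders, not
 * OVS APIs. */
#include <stddef.h>

struct toy_table { const int *items; size_t n; };           /* one pmd's flows */
struct toy_dump_pos { size_t table_idx; size_t item_idx; };  /* saved position */

/* Fills 'out' with up to 'max' items, resuming from '*pos'.  Returns the
 * number of items copied; 0 means the whole dump is finished. */
static size_t
toy_dump_next(const struct toy_table *tables, size_t n_tables,
              struct toy_dump_pos *pos, int *out, size_t max)
{
    size_t n = 0;

    while (n < max && pos->table_idx < n_tables) {
        const struct toy_table *t = &tables[pos->table_idx];

        if (pos->item_idx < t->n) {
            out[n++] = t->items[pos->item_idx++];
        } else {
            /* Current table exhausted: reset the inner position and move on
             * to the next table, mirroring the pmd switch above. */
            pos->table_idx++;
            pos->item_idx = 0;
        }
    }
    return n;
}
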
3687 static int
3688 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3689 OVS_NO_THREAD_SAFETY_ANALYSIS
3690 {
3691 struct dp_netdev *dp = get_dp_netdev(dpif);
3692 struct dp_netdev_pmd_thread *pmd;
3693 struct dp_packet_batch pp;
3694
3695 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3696 dp_packet_size(execute->packet) > UINT16_MAX) {
3697 return EINVAL;
3698 }
3699
3700 /* Tries finding the 'pmd'. If NULL is returned, that means
3701 * the current thread is a non-pmd thread and should use
3702 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3703 pmd = ovsthread_getspecific(dp->per_pmd_key);
3704 if (!pmd) {
3705 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3706 if (!pmd) {
3707 return EBUSY;
3708 }
3709 }
3710
3711 if (execute->probe) {
3712         /* If this is part of a probe, drop the packet, since executing
3713          * the action may actually cause spurious packets to be sent into
3714          * the network. */
3715 if (pmd->core_id == NON_PMD_CORE_ID) {
3716 dp_netdev_pmd_unref(pmd);
3717 }
3718 return 0;
3719 }
3720
3721     /* If the current thread is a non-pmd thread, acquire
3722      * the 'non_pmd_mutex'. */
3723 if (pmd->core_id == NON_PMD_CORE_ID) {
3724 ovs_mutex_lock(&dp->non_pmd_mutex);
3725 }
3726
3727 /* Update current time in PMD context. We don't care about EMC insertion
3728 * probability, because we are on a slow path. */
3729 pmd_thread_ctx_time_update(pmd);
3730
3731 /* The action processing expects the RSS hash to be valid, because
3732 * it's always initialized at the beginning of datapath processing.
3733 * In this case, though, 'execute->packet' may not have gone through
3734 * the datapath at all, it may have been generated by the upper layer
3735 * (OpenFlow packet-out, BFD frame, ...). */
3736 if (!dp_packet_rss_valid(execute->packet)) {
3737 dp_packet_set_rss_hash(execute->packet,
3738 flow_hash_5tuple(execute->flow, 0));
3739 }
3740
3741 dp_packet_batch_init_packet(&pp, execute->packet);
3742 pp.do_not_steal = true;
3743 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3744 execute->actions, execute->actions_len);
3745 dp_netdev_pmd_flush_output_packets(pmd, true);
3746
3747 if (pmd->core_id == NON_PMD_CORE_ID) {
3748 ovs_mutex_unlock(&dp->non_pmd_mutex);
3749 dp_netdev_pmd_unref(pmd);
3750 }
3751
3752 return 0;
3753 }
3754
3755 static void
3756 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3757 enum dpif_offload_type offload_type OVS_UNUSED)
3758 {
3759 size_t i;
3760
3761 for (i = 0; i < n_ops; i++) {
3762 struct dpif_op *op = ops[i];
3763
3764 switch (op->type) {
3765 case DPIF_OP_FLOW_PUT:
3766 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3767 break;
3768
3769 case DPIF_OP_FLOW_DEL:
3770 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3771 break;
3772
3773 case DPIF_OP_EXECUTE:
3774 op->error = dpif_netdev_execute(dpif, &op->execute);
3775 break;
3776
3777 case DPIF_OP_FLOW_GET:
3778 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3779 break;
3780 }
3781 }
3782 }
3783
3784 /* Enable or disable PMD auto load balancing. */
3785 static void
3786 set_pmd_auto_lb(struct dp_netdev *dp)
3787 {
3788 unsigned int cnt = 0;
3789 struct dp_netdev_pmd_thread *pmd;
3790 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3791
3792 bool enable_alb = false;
3793 bool multi_rxq = false;
3794 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3795
3796     /* Ensure that there are at least 2 non-isolated PMDs and
3797      * that one of them is polling more than one rxq. */
3798 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3799 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3800 continue;
3801 }
3802
3803 if (hmap_count(&pmd->poll_list) > 1) {
3804 multi_rxq = true;
3805 }
3806 if (cnt && multi_rxq) {
3807 enable_alb = true;
3808 break;
3809 }
3810 cnt++;
3811 }
3812
3813     /* Enable auto LB if it is requested and cycle-based assignment is used. */
3814 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
3815 pmd_alb->auto_lb_requested;
3816
3817 if (pmd_alb->is_enabled != enable_alb) {
3818 pmd_alb->is_enabled = enable_alb;
3819 if (pmd_alb->is_enabled) {
3820 VLOG_INFO("PMD auto load balance is enabled "
3821 "(with rebalance interval:%"PRIu64" msec)",
3822 pmd_alb->rebalance_intvl);
3823 } else {
3824 pmd_alb->rebalance_poll_timer = 0;
3825 VLOG_INFO("PMD auto load balance is disabled");
3826 }
3827 }
3828
3829 }
3830
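/* A condensed standalone restatement of the enable condition computed by
 * set_pmd_auto_lb() above: auto load balancing is turned on only when it was
 * requested, rx queues are assigned by measured cycles, and there are at
 * least two non-isolated PMDs with at least one of them polling more than
 * one rxq.  The struct below is a placeholder, not the OVS pmd thread type. */
#include <stdbool.h>
#include <stddef.h>

struct toy_pmd { bool isolated; unsigned n_rxqs; };

static bool
toy_should_enable_auto_lb(const struct toy_pmd *pmds, size_t n_pmds,
                          bool requested, bool cycle_based_assignment)
{
    size_t non_isolated = 0;
    bool multi_rxq = false;

    for (size_t i = 0; i < n_pmds; i++) {
        if (pmds[i].isolated) {
            continue;
        }
        non_isolated++;
        if (pmds[i].n_rxqs > 1) {
            multi_rxq = true;
        }
    }
    return requested && cycle_based_assignment
           && non_isolated >= 2 && multi_rxq;
}
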
3831 /* Applies datapath configuration from the database. Some of the changes are
3832 * actually applied in dpif_netdev_run(). */
3833 static int
3834 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3835 {
3836 struct dp_netdev *dp = get_dp_netdev(dpif);
3837 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3838 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3839 "cycles");
3840 unsigned long long insert_prob =
3841 smap_get_ullong(other_config, "emc-insert-inv-prob",
3842 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3843 uint32_t insert_min, cur_min;
3844 uint32_t tx_flush_interval, cur_tx_flush_interval;
3845 uint64_t rebalance_intvl;
3846
3847 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3848 DEFAULT_TX_FLUSH_INTERVAL);
3849 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3850 if (tx_flush_interval != cur_tx_flush_interval) {
3851 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3852 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3853 tx_flush_interval);
3854 }
3855
3856 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3857 free(dp->pmd_cmask);
3858 dp->pmd_cmask = nullable_xstrdup(cmask);
3859 dp_netdev_request_reconfigure(dp);
3860 }
3861
3862 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3863 if (insert_prob <= UINT32_MAX) {
3864 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3865 } else {
3866 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3867 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3868 }
3869
3870 if (insert_min != cur_min) {
3871 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3872 if (insert_min == 0) {
3873 VLOG_INFO("EMC insertion probability changed to zero");
3874 } else {
3875 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3876 insert_prob, (100 / (float)insert_prob));
3877 }
3878 }
3879
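/* A standalone sketch of the probability-to-threshold conversion done above
 * for 'emc-insert-inv-prob': an inverse probability P becomes the threshold
 * UINT32_MAX / P, and a flow is inserted when a uniformly distributed 32-bit
 * random value falls at or below that threshold, i.e. roughly once every P
 * packets.  The helpers take the random value as a parameter instead of
 * calling the OVS PRNG. */
#include <stdbool.h>
#include <stdint.h>

/* Returns the insertion threshold for inverse probability 'inv_prob'
 * (0 disables insertion entirely, 1 inserts on every packet). */
static uint32_t
toy_emc_insert_min(uint32_t inv_prob)
{
    return inv_prob ? UINT32_MAX / inv_prob : 0;
}

/* 'rnd' should be uniformly distributed over [0, UINT32_MAX]. */
static bool
toy_emc_should_insert(uint32_t rnd, uint32_t insert_min)
{
    return insert_min && rnd <= insert_min;
}
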
3880 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3881 bool cur_perf_enabled;
3882 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3883 if (perf_enabled != cur_perf_enabled) {
3884 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3885 if (perf_enabled) {
3886 VLOG_INFO("PMD performance metrics collection enabled");
3887 } else {
3888 VLOG_INFO("PMD performance metrics collection disabled");
3889 }
3890 }
3891
3892 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3893 bool cur_smc;
3894 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3895 if (smc_enable != cur_smc) {
3896 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3897 if (smc_enable) {
3898 VLOG_INFO("SMC cache is enabled");
3899 } else {
3900 VLOG_INFO("SMC cache is disabled");
3901 }
3902 }
3903
3904 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
3905 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
3906 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
3907 "Defaulting to 'cycles'.");
3908 pmd_rxq_assign_cyc = true;
3909 pmd_rxq_assign = "cycles";
3910 }
3911 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
3912 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
3913 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
3914 pmd_rxq_assign);
3915 dp_netdev_request_reconfigure(dp);
3916 }
3917
3918 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3919 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
3920 false);
3921
3922 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
3923 ALB_PMD_REBALANCE_POLL_INTERVAL);
3924
3925     /* Input is in minutes; convert it to msec. */
3926 rebalance_intvl =
3927 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
3928
3929 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
3930 pmd_alb->rebalance_intvl = rebalance_intvl;
3931 }
3932
3933 set_pmd_auto_lb(dp);
3934 return 0;
3935 }
3936
3937 /* Parses affinity list and returns result in 'core_ids'. */
3938 static int
3939 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3940 {
3941 unsigned i;
3942 char *list, *copy, *key, *value;
3943 int error = 0;
3944
3945 for (i = 0; i < n_rxq; i++) {
3946 core_ids[i] = OVS_CORE_UNSPEC;
3947 }
3948
3949 if (!affinity_list) {
3950 return 0;
3951 }
3952
3953 list = copy = xstrdup(affinity_list);
3954
3955 while (ofputil_parse_key_value(&list, &key, &value)) {
3956 int rxq_id, core_id;
3957
3958 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3959 || !str_to_int(value, 0, &core_id) || core_id < 0) {
3960 error = EINVAL;
3961 break;
3962 }
3963
3964 if (rxq_id < n_rxq) {
3965 core_ids[rxq_id] = core_id;
3966 }
3967 }
3968
3969 free(copy);
3970 return error;
3971 }
3972
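/* A standalone sketch of the 'pmd-rxq-affinity' format handled by
 * parse_affinity_list() above: a comma-separated list of <rxq-id>:<core-id>
 * pairs, e.g. "0:3,1:7".  This version uses only the C library instead of
 * ofputil_parse_key_value(), and TOY_CORE_UNSPEC stands in for
 * OVS_CORE_UNSPEC. */
#include <errno.h>
#include <limits.h>
#include <stdio.h>

#define TOY_CORE_UNSPEC UINT_MAX

/* Parses "rxq:core,rxq:core,..." into 'core_ids' (size 'n_rxq').  Returns 0
 * on success, EINVAL on malformed input.  Out-of-range rxq ids are silently
 * ignored, as in parse_affinity_list(). */
static int
toy_parse_affinity_list(const char *list, unsigned core_ids[], int n_rxq)
{
    for (int i = 0; i < n_rxq; i++) {
        core_ids[i] = TOY_CORE_UNSPEC;
    }
    if (!list || !*list) {
        return 0;
    }

    while (*list) {
        int rxq, core, consumed;

        if (sscanf(list, "%d:%d%n", &rxq, &core, &consumed) != 2
            || rxq < 0 || core < 0) {
            return EINVAL;
        }
        if (rxq < n_rxq) {
            core_ids[rxq] = (unsigned) core;
        }
        list += consumed;
        if (*list == ',') {
            list++;
        } else if (*list != '\0') {
            return EINVAL;
        }
    }
    return 0;
}
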
3973 /* Parses 'affinity_list' and applies configuration if it is valid. */
3974 static int
3975 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3976 const char *affinity_list)
3977 {
3978 unsigned *core_ids, i;
3979 int error = 0;
3980
3981 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
3982 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
3983 error = EINVAL;
3984 goto exit;
3985 }
3986
3987 for (i = 0; i < port->n_rxq; i++) {
3988 port->rxqs[i].core_id = core_ids[i];
3989 }
3990
3991 exit:
3992 free(core_ids);
3993 return error;
3994 }
3995
3996 /* Returns 'true' if one of the 'port's RX queues exists in the 'poll_list'
3997  * of the given PMD thread. */
3998 static bool
3999 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4000 struct dp_netdev_port *port)
4001 OVS_EXCLUDED(pmd->port_mutex)
4002 {
4003 struct rxq_poll *poll;
4004 bool found = false;
4005
4006 ovs_mutex_lock(&pmd->port_mutex);
4007 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4008 if (port == poll->rxq->port) {
4009 found = true;
4010 break;
4011 }
4012 }
4013 ovs_mutex_unlock(&pmd->port_mutex);
4014 return found;
4015 }
4016
4017 /* Updates port configuration from the database. The changes are actually
4018 * applied in dpif_netdev_run(). */
4019 static int
4020 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4021 const struct smap *cfg)
4022 {
4023 struct dp_netdev *dp = get_dp_netdev(dpif);
4024 struct dp_netdev_port *port;
4025 int error = 0;
4026 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4027 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4028
4029 ovs_mutex_lock(&dp->port_mutex);
4030 error = get_port_by_number(dp, port_no, &port);
4031 if (error) {
4032 goto unlock;
4033 }
4034
4035 if (emc_enabled != port->emc_enabled) {
4036 struct dp_netdev_pmd_thread *pmd;
4037 struct ds ds = DS_EMPTY_INITIALIZER;
4038 uint32_t cur_min, insert_prob;
4039
4040 port->emc_enabled = emc_enabled;
4041         /* Mark for reload all the threads that poll this port and request
4042          * a reconfiguration so that the threads are actually reloaded. */
4043 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4044 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4045 pmd->need_reload = true;
4046 }
4047 }
4048 dp_netdev_request_reconfigure(dp);
4049
4050 ds_put_format(&ds, "%s: EMC has been %s.",
4051 netdev_get_name(port->netdev),
4052 (emc_enabled) ? "enabled" : "disabled");
4053 if (emc_enabled) {
4054 ds_put_cstr(&ds, " Current insertion probability is ");
4055 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4056 if (!cur_min) {
4057 ds_put_cstr(&ds, "zero.");
4058 } else {
4059 insert_prob = UINT32_MAX / cur_min;
4060 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4061 insert_prob, 100 / (float) insert_prob);
4062 }
4063 }
4064 VLOG_INFO("%s", ds_cstr(&ds));
4065 ds_destroy(&ds);
4066 }
4067
4068     /* Check for Rxq affinity changes. */
4069 if (!netdev_is_pmd(port->netdev)
4070 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4071 goto unlock;
4072 }
4073
4074 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4075 if (error) {
4076 goto unlock;
4077 }
4078 free(port->rxq_affinity_list);
4079 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4080
4081 dp_netdev_request_reconfigure(dp);
4082 unlock:
4083 ovs_mutex_unlock(&dp->port_mutex);
4084 return error;
4085 }
4086
4087 static int
4088 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4089 uint32_t queue_id, uint32_t *priority)
4090 {
4091 *priority = queue_id;
4092 return 0;
4093 }
4094
4095 \f
4096 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4097 * a copy of the 'size' bytes of 'actions' input parameters. */
4098 struct dp_netdev_actions *
4099 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4100 {
4101 struct dp_netdev_actions *netdev_actions;
4102
4103 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4104 memcpy(netdev_actions->actions, actions, size);
4105 netdev_actions->size = size;
4106
4107 return netdev_actions;
4108 }
4109
4110 struct dp_netdev_actions *
4111 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4112 {
4113 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4114 }
4115
4116 static void
4117 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4118 {
4119 free(actions);
4120 }
4121 \f
4122 static void
4123 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4124 enum rxq_cycles_counter_type type,
4125 unsigned long long cycles)
4126 {
4127 atomic_store_relaxed(&rx->cycles[type], cycles);
4128 }
4129
4130 static void
4131 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4132 enum rxq_cycles_counter_type type,
4133 unsigned long long cycles)
4134 {
4135 non_atomic_ullong_add(&rx->cycles[type], cycles);
4136 }
4137
4138 static uint64_t
4139 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4140 enum rxq_cycles_counter_type type)
4141 {
4142 unsigned long long processing_cycles;
4143 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4144 return processing_cycles;
4145 }
4146
4147 static void
4148 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4149 unsigned long long cycles)
4150 {
4151 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4152 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4153 }
4154
4155 static uint64_t
4156 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4157 {
4158 unsigned long long processing_cycles;
4159 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4160 return processing_cycles;
4161 }
4162
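/* A standalone sketch of the fixed-size ring of per-interval cycle samples
 * maintained by dp_netdev_rxq_set_intrvl_cycles() and
 * dp_netdev_rxq_get_intrvl_cycles() above: each new sample overwrites the
 * oldest slot, and the scheduler later sums all slots to obtain a queue's
 * recent processing-cycle history.  Atomics are omitted for brevity and the
 * slot count is a placeholder for PMD_RXQ_INTERVAL_MAX. */
#include <stdint.h>

#define TOY_INTERVAL_SLOTS 6

struct toy_rxq_hist {
    uint64_t samples[TOY_INTERVAL_SLOTS];
    unsigned next_idx;
};

static void
toy_hist_store(struct toy_rxq_hist *h, uint64_t cycles)
{
    h->samples[h->next_idx++ % TOY_INTERVAL_SLOTS] = cycles;
}

static uint64_t
toy_hist_sum(const struct toy_rxq_hist *h)
{
    uint64_t sum = 0;

    for (unsigned i = 0; i < TOY_INTERVAL_SLOTS; i++) {
        sum += h->samples[i];
    }
    return sum;
}
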
4163 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4164 static inline bool
4165 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4166 {
4167 bool pmd_perf_enabled;
4168 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4169 return pmd_perf_enabled;
4170 }
4171 #else
4172 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4173  * performance metrics are not available as locked access to 64-bit
4174  * integers would be prohibitively expensive. */
4175 static inline bool
4176 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4177 {
4178 return false;
4179 }
4180 #endif
4181
4182 static int
4183 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4184 struct tx_port *p)
4185 {
4186 int i;
4187 int tx_qid;
4188 int output_cnt;
4189 bool dynamic_txqs;
4190 struct cycle_timer timer;
4191 uint64_t cycles;
4192 uint32_t tx_flush_interval;
4193
4194 cycle_timer_start(&pmd->perf_stats, &timer);
4195
4196 dynamic_txqs = p->port->dynamic_txqs;
4197 if (dynamic_txqs) {
4198 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4199 } else {
4200 tx_qid = pmd->static_tx_qid;
4201 }
4202
4203 output_cnt = dp_packet_batch_size(&p->output_pkts);
4204 ovs_assert(output_cnt > 0);
4205
4206 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4207 dp_packet_batch_init(&p->output_pkts);
4208
4209 /* Update time of the next flush. */
4210 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4211 p->flush_time = pmd->ctx.now + tx_flush_interval;
4212
4213 ovs_assert(pmd->n_output_batches > 0);
4214 pmd->n_output_batches--;
4215
4216 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4217 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4218
4219 /* Distribute send cycles evenly among transmitted packets and assign to
4220 * their respective rx queues. */
4221 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4222 for (i = 0; i < output_cnt; i++) {
4223 if (p->output_pkts_rxqs[i]) {
4224 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4225 RXQ_CYCLES_PROC_CURR, cycles);
4226 }
4227 }
4228
4229 return output_cnt;
4230 }
4231
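/* A standalone sketch of the cost-attribution step at the end of
 * dp_netdev_pmd_flush_output_on_port() above: the cycles spent in a single
 * netdev_send() are divided evenly among the packets of the batch and charged
 * back to the rx queues the packets came from, so per-rxq load figures
 * include their share of transmit work.  The counter array is illustrative. */
#include <stddef.h>
#include <stdint.h>

/* Charges 'total_cycles' evenly across 'n_pkts' packets; 'src_rxq_cycles[i]'
 * points at the counter of packet i's source rx queue, or is NULL if the
 * source is unknown. */
static void
toy_attribute_send_cycles(uint64_t *src_rxq_cycles[], size_t n_pkts,
                          uint64_t total_cycles)
{
    if (!n_pkts) {
        return;
    }

    uint64_t per_pkt = total_cycles / n_pkts;

    for (size_t i = 0; i < n_pkts; i++) {
        if (src_rxq_cycles[i]) {
            *src_rxq_cycles[i] += per_pkt;
        }
    }
}
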
4232 static int
4233 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4234 bool force)
4235 {
4236 struct tx_port *p;
4237 int output_cnt = 0;
4238
4239 if (!pmd->n_output_batches) {
4240 return 0;
4241 }
4242
4243 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4244 if (!dp_packet_batch_is_empty(&p->output_pkts)
4245 && (force || pmd->ctx.now >= p->flush_time)) {
4246 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4247 }
4248 }
4249 return output_cnt;
4250 }
4251
4252 static int
4253 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4254 struct dp_netdev_rxq *rxq,
4255 odp_port_t port_no)
4256 {
4257 struct pmd_perf_stats *s = &pmd->perf_stats;
4258 struct dp_packet_batch batch;
4259 struct cycle_timer timer;
4260 int error;
4261 int batch_cnt = 0;
4262 int rem_qlen = 0, *qlen_p = NULL;
4263 uint64_t cycles;
4264
4265 /* Measure duration for polling and processing rx burst. */
4266 cycle_timer_start(&pmd->perf_stats, &timer);
4267
4268 pmd->ctx.last_rxq = rxq;
4269 dp_packet_batch_init(&batch);
4270
4271 /* Fetch the rx queue length only for vhostuser ports. */
4272 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4273 qlen_p = &rem_qlen;
4274 }
4275
4276 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4277 if (!error) {
4278 /* At least one packet received. */
4279 *recirc_depth_get() = 0;
4280 pmd_thread_ctx_time_update(pmd);
4281 batch_cnt = batch.count;
4282 if (pmd_perf_metrics_enabled(pmd)) {
4283 /* Update batch histogram. */
4284 s->current.batches++;
4285 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4286 /* Update the maximum vhost rx queue fill level. */
4287 if (rxq->is_vhost && rem_qlen >= 0) {
4288 uint32_t qfill = batch_cnt + rem_qlen;
4289 if (qfill > s->current.max_vhost_qfill) {
4290 s->current.max_vhost_qfill = qfill;
4291 }
4292 }
4293 }
4294 /* Process packet batch. */
4295 dp_netdev_input(pmd, &batch, port_no);
4296
4297 /* Assign processing cycles to rx queue. */
4298 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4299 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4300
4301 dp_netdev_pmd_flush_output_packets(pmd, false);
4302 } else {
4303 /* Discard cycles. */
4304 cycle_timer_stop(&pmd->perf_stats, &timer);
4305 if (error != EAGAIN && error != EOPNOTSUPP) {
4306 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4307
4308 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4309 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4310 }
4311 }
4312
4313 pmd->ctx.last_rxq = NULL;
4314
4315 return batch_cnt;
4316 }
4317
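/* A standalone sketch of the vhost rx-queue fill-level metric computed in
 * dp_netdev_process_rxq_port() above: the fill level after a receive is the
 * number of packets just polled plus the packets still waiting in the queue,
 * and only the maximum seen over the measurement period is kept.  The stats
 * struct is a placeholder. */
#include <stdint.h>

struct toy_qfill_stats { uint32_t max_vhost_qfill; };

static void
toy_update_qfill(struct toy_qfill_stats *s, int batch_cnt, int remaining_qlen)
{
    if (remaining_qlen >= 0) {
        uint32_t qfill = (uint32_t) batch_cnt + (uint32_t) remaining_qlen;

        if (qfill > s->max_vhost_qfill) {
            s->max_vhost_qfill = qfill;
        }
    }
}
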
4318 static struct tx_port *
4319 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4320 {
4321 struct tx_port *tx;
4322
4323 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4324 if (tx->port->port_no == port_no) {
4325 return tx;
4326 }
4327 }
4328
4329 return NULL;
4330 }
4331
4332 static int
4333 port_reconfigure(struct dp_netdev_port *port)
4334 {
4335 struct netdev *netdev = port->netdev;
4336 int i, err;
4337
4338 /* Closes the existing 'rxq's. */
4339 for (i = 0; i < port->n_rxq; i++) {
4340 netdev_rxq_close(port->rxqs[i].rx);
4341 port->rxqs[i].rx = NULL;
4342 }
4343 unsigned last_nrxq = port->n_rxq;
4344 port->n_rxq = 0;
4345
4346 /* Allows 'netdev' to apply the pending configuration changes. */
4347 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4348 err = netdev_reconfigure(netdev);
4349 if (err && (err != EOPNOTSUPP)) {
4350 VLOG_ERR("Failed to set interface %s new configuration",
4351 netdev_get_name(netdev));
4352 return err;
4353 }
4354 }
4355 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4356 port->rxqs = xrealloc(port->rxqs,
4357 sizeof *port->rxqs * netdev_n_rxq(netdev));
4358 /* Realloc 'used' counters for tx queues. */
4359 free(port->txq_used);
4360 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4361
4362 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4363 bool new_queue = i >= last_nrxq;
4364 if (new_queue) {
4365 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4366 }
4367
4368 port->rxqs[i].port = port;
4369 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4370
4371 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4372 if (err) {
4373 return err;
4374 }
4375 port->n_rxq++;
4376 }
4377
4378 /* Parse affinity list to apply configuration for new queues. */
4379 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4380
4381     /* If reconfiguration was successful, mark it as such, so we can use it. */
4382 port->need_reconfigure = false;
4383
4384 return 0;
4385 }
4386
4387 struct rr_numa_list {
4388 struct hmap numas; /* Contains 'struct rr_numa' */
4389 };
4390
4391 struct rr_numa {
4392 struct hmap_node node;
4393
4394 int numa_id;
4395
4396 /* Non isolated pmds on numa node 'numa_id' */
4397 struct dp_netdev_pmd_thread **pmds;
4398 int n_pmds;
4399
4400 int cur_index;
4401 bool idx_inc;
4402 };
4403
4404 static struct rr_numa *
4405 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4406 {
4407 struct rr_numa *numa;
4408
4409 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4410 if (numa->numa_id == numa_id) {
4411 return numa;
4412 }
4413 }
4414
4415 return NULL;
4416 }
4417
4418 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4419 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4420 * Returns NULL if 'rr' numa list is empty. */
4421 static struct rr_numa *
4422 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4423 {
4424 struct hmap_node *node = NULL;
4425
4426 if (numa) {
4427 node = hmap_next(&rr->numas, &numa->node);
4428 }
4429 if (!node) {
4430 node = hmap_first(&rr->numas);
4431 }
4432
4433 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4434 }
4435
4436 static void
4437 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4438 {
4439 struct dp_netdev_pmd_thread *pmd;
4440 struct rr_numa *numa;
4441
4442 hmap_init(&rr->numas);
4443
4444 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4445 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4446 continue;
4447 }
4448
4449 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4450 if (!numa) {
4451 numa = xzalloc(sizeof *numa);
4452 numa->numa_id = pmd->numa_id;
4453 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4454 }
4455 numa->n_pmds++;
4456 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4457 numa->pmds[numa->n_pmds - 1] = pmd;
4458         /* There is at least one pmd, so initialize cur_index and idx_inc. */
4459 numa->cur_index = 0;
4460 numa->idx_inc = true;
4461 }
4462 }
4463
4464 /*
4465 * Returns the next pmd from the numa node.
4466 *
4467 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4468 * either an up or down walk, switching between up/down when the first or last
4469 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4470 *
4471 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4472 * core reached. e.g. 1,2,3,1,2,3,1,2...
4473 */
4474 static struct dp_netdev_pmd_thread *
4475 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4476 {
4477 int numa_idx = numa->cur_index;
4478
4479 if (numa->idx_inc == true) {
4480 /* Incrementing through list of pmds. */
4481 if (numa->cur_index == numa->n_pmds-1) {
4482 /* Reached the last pmd. */
4483 if (updown) {
4484 numa->idx_inc = false;
4485 } else {
4486 numa->cur_index = 0;
4487 }
4488 } else {
4489 numa->cur_index++;
4490 }
4491 } else {
4492 /* Decrementing through list of pmds. */
4493 if (numa->cur_index == 0) {
4494 /* Reached the first pmd. */
4495 numa->idx_inc = true;
4496 } else {
4497 numa->cur_index--;
4498 }
4499 }
4500 return numa->pmds[numa_idx];
4501 }
4502
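/* A standalone sketch of the two index walks implemented by rr_numa_get_pmd()
 * above.  With 'updown' the selection bounces between the ends
 * (0,1,2,2,1,0,0,1,...), which pairs heavy and light queues more evenly when
 * the input is sorted by load; without it the index simply wraps
 * (0,1,2,0,1,2,...).  The struct is a placeholder for 'struct rr_numa'. */
#include <stdbool.h>

struct toy_rr { int cur; bool inc; int n; };

/* Returns the current index, then advances the walk. */
static int
toy_rr_next(struct toy_rr *rr, bool updown)
{
    int idx = rr->cur;

    if (rr->inc) {
        if (rr->cur == rr->n - 1) {
            if (updown) {
                rr->inc = false;          /* bounce back down */
            } else {
                rr->cur = 0;              /* wrap around */
            }
        } else {
            rr->cur++;
        }
    } else {
        if (rr->cur == 0) {
            rr->inc = true;               /* bounce back up */
        } else {
            rr->cur--;
        }
    }
    return idx;
}
/* E.g. starting from { .cur = 0, .inc = true, .n = 3 }, repeated calls with
 * updown == true yield 0,1,2,2,1,0,0,1,... */
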
4503 static void
4504 rr_numa_list_destroy(struct rr_numa_list *rr)
4505 {
4506 struct rr_numa *numa;
4507
4508 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4509 free(numa->pmds);
4510 free(numa);
4511 }
4512 hmap_destroy(&rr->numas);
4513 }
4514
4515 /* Sort Rx Queues by the processing cycles they are consuming. */
4516 static int
4517 compare_rxq_cycles(const void *a, const void *b)
4518 {
4519 struct dp_netdev_rxq *qa;
4520 struct dp_netdev_rxq *qb;
4521 uint64_t cycles_qa, cycles_qb;
4522
4523 qa = *(struct dp_netdev_rxq **) a;
4524 qb = *(struct dp_netdev_rxq **) b;
4525
4526 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4527 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4528
4529 if (cycles_qa != cycles_qb) {
4530 return (cycles_qa < cycles_qb) ? 1 : -1;
4531 } else {
4532 /* Cycles are the same so tiebreak on port/queue id.
4533 * Tiebreaking (as opposed to return 0) ensures consistent
4534 * sort results across multiple OS's. */
4535 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4536 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4537 if (port_qa != port_qb) {
4538 return port_qa > port_qb ? 1 : -1;
4539 } else {
4540 return netdev_rxq_get_queue_id(qa->rx)
4541 - netdev_rxq_get_queue_id(qb->rx);
4542 }
4543 }
4544 }
4545
4546 /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
4547  * queues and mark the pmds as isolated.  Otherwise, assign non-isolated
4548  * pmds to unpinned queues.
4549 *
4550 * The function doesn't touch the pmd threads, it just stores the assignment
4551 * in the 'pmd' member of each rxq. */
4552 static void
4553 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4554 {
4555 struct dp_netdev_port *port;
4556 struct rr_numa_list rr;
4557 struct rr_numa *non_local_numa = NULL;
4558 struct dp_netdev_rxq ** rxqs = NULL;
4559 int n_rxqs = 0;
4560 struct rr_numa *numa = NULL;
4561 int numa_id;
4562 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4563
4564 HMAP_FOR_EACH (port, node, &dp->ports) {
4565 if (!netdev_is_pmd(port->netdev)) {
4566 continue;
4567 }
4568
4569 for (int qid = 0; qid < port->n_rxq; qid++) {
4570 struct dp_netdev_rxq *q = &port->rxqs[qid];
4571
4572 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4573 struct dp_netdev_pmd_thread *pmd;
4574
4575 pmd = dp_netdev_get_pmd(dp, q->core_id);
4576 if (!pmd) {
4577 VLOG_WARN("There is no PMD thread on core %d. Queue "
4578 "%d on port \'%s\' will not be polled.",
4579 q->core_id, qid, netdev_get_name(port->netdev));
4580 } else {
4581 q->pmd = pmd;
4582 pmd->isolated = true;
4583 dp_netdev_pmd_unref(pmd);
4584 }
4585 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4586 uint64_t cycle_hist = 0;
4587
4588 if (n_rxqs == 0) {
4589 rxqs = xmalloc(sizeof *rxqs);
4590 } else {
4591 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4592 }
4593
4594 if (assign_cyc) {
4595 /* Sum the queue intervals and store the cycle history. */
4596 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4597 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4598 }
4599 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4600 cycle_hist);
4601 }
4602 /* Store the queue. */
4603 rxqs[n_rxqs++] = q;
4604 }
4605 }
4606 }
4607
4608 if (n_rxqs > 1 && assign_cyc) {
4609 /* Sort the queues in order of the processing cycles
4610 * they consumed during their last pmd interval. */
4611 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4612 }
4613
4614 rr_numa_list_populate(dp, &rr);
4615 /* Assign the sorted queues to pmds in round robin. */
4616 for (int i = 0; i < n_rxqs; i++) {
4617 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4618 numa = rr_numa_list_lookup(&rr, numa_id);
4619 if (!numa) {
4620 /* There are no pmds on the queue's local NUMA node.
4621              * Round robin on the NUMA nodes that do have pmds. */
4622 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4623 if (!non_local_numa) {
4624 VLOG_ERR("There is no available (non-isolated) pmd "
4625 "thread for port \'%s\' queue %d. This queue "
4626 "will not be polled. Is pmd-cpu-mask set to "
4627 "zero? Or are all PMDs isolated to other "
4628 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4629 netdev_rxq_get_queue_id(rxqs[i]->rx));
4630 continue;
4631 }
4632 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
4633 VLOG_WARN("There's no available (non-isolated) pmd thread "
4634 "on numa node %d. Queue %d on port \'%s\' will "
4635 "be assigned to the pmd on core %d "
4636 "(numa node %d). Expect reduced performance.",
4637 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4638 netdev_rxq_get_name(rxqs[i]->rx),
4639 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4640 } else {
4641 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4642 if (assign_cyc) {
4643 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4644 "rx queue %d "
4645 "(measured processing cycles %"PRIu64").",
4646 rxqs[i]->pmd->core_id, numa_id,
4647 netdev_rxq_get_name(rxqs[i]->rx),
4648 netdev_rxq_get_queue_id(rxqs[i]->rx),
4649 dp_netdev_rxq_get_cycles(rxqs[i],
4650 RXQ_CYCLES_PROC_HIST));
4651 } else {
4652 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4653 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4654 netdev_rxq_get_name(rxqs[i]->rx),
4655 netdev_rxq_get_queue_id(rxqs[i]->rx));
4656 }
4657 }
4658 }
4659
4660 rr_numa_list_destroy(&rr);
4661 free(rxqs);
4662 }
4663
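/* A standalone sketch of the high-level policy implemented by
 * compare_rxq_cycles() and rxq_scheduling() above: sort the unpinned queues
 * by measured processing cycles, most expensive first, then hand them out to
 * the available PMDs in round-robin order so each PMD receives a comparable
 * mix of heavy and light queues.  NUMA locality, pinning and the up/down walk
 * are deliberately left out; the types are placeholders. */
#include <stdint.h>
#include <stdlib.h>

struct toy_rxq { uint64_t cycles; int assigned_pmd; };

/* Orders queues by descending cycle count. */
static int
toy_cmp_rxq_desc(const void *a_, const void *b_)
{
    const struct toy_rxq *a = a_;
    const struct toy_rxq *b = b_;

    return (a->cycles < b->cycles) - (a->cycles > b->cycles);
}

/* 'n_pmds' must be greater than zero. */
static void
toy_schedule(struct toy_rxq *rxqs, size_t n_rxqs, int n_pmds)
{
    qsort(rxqs, n_rxqs, sizeof *rxqs, toy_cmp_rxq_desc);

    for (size_t i = 0; i < n_rxqs; i++) {
        rxqs[i].assigned_pmd = (int) (i % (size_t) n_pmds);
    }
}
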
4664 static void
4665 reload_affected_pmds(struct dp_netdev *dp)
4666 {
4667 struct dp_netdev_pmd_thread *pmd;
4668
4669 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4670 if (pmd->need_reload) {
4671 flow_mark_flush(pmd);
4672 dp_netdev_reload_pmd__(pmd);
4673 }
4674 }
4675
4676 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4677 if (pmd->need_reload) {
4678 if (pmd->core_id != NON_PMD_CORE_ID) {
4679 bool reload;
4680
4681 do {
4682 atomic_read_explicit(&pmd->reload, &reload,
4683 memory_order_acquire);
4684 } while (reload);
4685 }
4686 pmd->need_reload = false;
4687 }
4688 }
4689 }
4690
4691 static void
4692 reconfigure_pmd_threads(struct dp_netdev *dp)
4693 OVS_REQUIRES(dp->port_mutex)
4694 {
4695 struct dp_netdev_pmd_thread *pmd;
4696 struct ovs_numa_dump *pmd_cores;
4697 struct ovs_numa_info_core *core;
4698 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4699 struct hmapx_node *node;
4700 bool changed = false;
4701 bool need_to_adjust_static_tx_qids = false;
4702
4703 /* The pmd threads should be started only if there's a pmd port in the
4704 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4705 * NR_PMD_THREADS per numa node. */
4706 if (!has_pmd_port(dp)) {
4707 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4708 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4709 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4710 } else {
4711 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4712 }
4713
4714 /* We need to adjust 'static_tx_qid's only if we're reducing number of
4715 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4716 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4717 /* Adjustment is required to keep 'static_tx_qid's sequential and
4718 * avoid possible issues, for example, imbalanced tx queue usage
4719 * and unnecessary locking caused by remapping on netdev level. */
4720 need_to_adjust_static_tx_qids = true;
4721 }
4722
4723 /* Check for unwanted pmd threads */
4724 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4725 if (pmd->core_id == NON_PMD_CORE_ID) {
4726 continue;
4727 }
4728 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4729 pmd->core_id)) {
4730 hmapx_add(&to_delete, pmd);
4731 } else if (need_to_adjust_static_tx_qids) {
4732 atomic_store_relaxed(&pmd->reload_tx_qid, true);
4733 pmd->need_reload = true;
4734 }
4735 }
4736
4737 HMAPX_FOR_EACH (node, &to_delete) {
4738 pmd = (struct dp_netdev_pmd_thread *) node->data;
4739 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4740 pmd->numa_id, pmd->core_id);
4741 dp_netdev_del_pmd(dp, pmd);
4742 }
4743 changed = !hmapx_is_empty(&to_delete);
4744 hmapx_destroy(&to_delete);
4745
4746 if (need_to_adjust_static_tx_qids) {
4747 /* 'static_tx_qid's are not sequential now.
4748 * Reload remaining threads to fix this. */
4749 reload_affected_pmds(dp);
4750 }
4751
4752 /* Check for required new pmd threads */
4753 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4754 pmd = dp_netdev_get_pmd(dp, core->core_id);
4755 if (!pmd) {
4756 pmd = xzalloc(sizeof *pmd);
4757 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4758 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
4759 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4760 pmd->numa_id, pmd->core_id);
4761 changed = true;
4762 } else {
4763 dp_netdev_pmd_unref(pmd);
4764 }
4765 }
4766
4767 if (changed) {
4768 struct ovs_numa_info_numa *numa;
4769
4770 /* Log the number of pmd threads per numa node. */
4771 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4772 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4773 numa->n_cores, numa->numa_id);
4774 }
4775 }
4776
4777 ovs_numa_dump_destroy(pmd_cores);
4778 }
4779
4780 static void
4781 pmd_remove_stale_ports(struct dp_netdev *dp,
4782 struct dp_netdev_pmd_thread *pmd)
4783 OVS_EXCLUDED(pmd->port_mutex)
4784 OVS_REQUIRES(dp->port_mutex)
4785 {
4786 struct rxq_poll *poll, *poll_next;
4787 struct tx_port *tx, *tx_next;
4788
4789 ovs_mutex_lock(&pmd->port_mutex);
4790 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4791 struct dp_netdev_port *port = poll->rxq->port;
4792
4793 if (port->need_reconfigure
4794 || !hmap_contains(&dp->ports, &port->node)) {
4795 dp_netdev_del_rxq_from_pmd(pmd, poll);
4796 }
4797 }
4798 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4799 struct dp_netdev_port *port = tx->port;
4800
4801 if (port->need_reconfigure
4802 || !hmap_contains(&dp->ports, &port->node)) {
4803 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4804 }
4805 }
4806 ovs_mutex_unlock(&pmd->port_mutex);
4807 }
4808
4809 /* Must be called each time a port is added/removed or the cmask changes.
4810 * This creates and destroys pmd threads, reconfigures ports, opens their
4811 * rxqs and assigns all rxqs/txqs to pmd threads. */
4812 static void
4813 reconfigure_datapath(struct dp_netdev *dp)
4814 OVS_REQUIRES(dp->port_mutex)
4815 {
4816 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
4817 struct dp_netdev_pmd_thread *pmd;
4818 struct dp_netdev_port *port;
4819 int wanted_txqs;
4820
4821 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4822
4823 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4824 * on the system and the user configuration. */
4825 reconfigure_pmd_threads(dp);
4826
4827 wanted_txqs = cmap_count(&dp->poll_threads);
4828
4829 /* The number of pmd threads might have changed, or a port can be new:
4830 * adjust the txqs. */
4831 HMAP_FOR_EACH (port, node, &dp->ports) {
4832 netdev_set_tx_multiq(port->netdev, wanted_txqs);
4833 }
4834
4835 /* Step 2: Remove from the pmd threads ports that have been removed or
4836 * need reconfiguration. */
4837
4838 /* Check for all the ports that need reconfiguration. We cache this in
4839 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4840 * change at any time. */
4841 HMAP_FOR_EACH (port, node, &dp->ports) {
4842 if (netdev_is_reconf_required(port->netdev)) {
4843 port->need_reconfigure = true;
4844 }
4845 }
4846
4847 /* Remove from the pmd threads all the ports that have been deleted or
4848 * need reconfiguration. */
4849 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4850 pmd_remove_stale_ports(dp, pmd);
4851 }
4852
4853 /* Reload affected pmd threads. We must wait for the pmd threads before
4854 * reconfiguring the ports, because a port cannot be reconfigured while
4855 * it's being used. */
4856 reload_affected_pmds(dp);
4857
4858 /* Step 3: Reconfigure ports. */
4859
4860 /* We only reconfigure the ports that we determined above, because they're
4861 * not being used by any pmd thread at the moment. If a port fails to
4862 * reconfigure we remove it from the datapath. */
4863 struct dp_netdev_port *next_port;
4864 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
4865 int err;
4866
4867 if (!port->need_reconfigure) {
4868 continue;
4869 }
4870
4871 err = port_reconfigure(port);
4872 if (err) {
4873 hmap_remove(&dp->ports, &port->node);
4874 seq_change(dp->port_seq);
4875 port_destroy(port);
4876 } else {
4877 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
4878 }
4879 }
4880
4881 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4882 * for now, we just update the 'pmd' pointer in each rxq to point to the
4883 * wanted thread according to the scheduling policy. */
4884
4885 /* Reset all the pmd threads to non isolated. */
4886 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4887 pmd->isolated = false;
4888 }
4889
4890 /* Reset all the queues to unassigned */
4891 HMAP_FOR_EACH (port, node, &dp->ports) {
4892 for (int i = 0; i < port->n_rxq; i++) {
4893 port->rxqs[i].pmd = NULL;
4894 }
4895 }
4896
4897 /* Add pinned queues and mark pmd threads isolated. */
4898 rxq_scheduling(dp, true);
4899
4900 /* Add non-pinned queues. */
4901 rxq_scheduling(dp, false);
4902
4903 /* Step 5: Remove queues not compliant with new scheduling. */
4904
4905 /* Count all the threads that will have at least one queue to poll. */
4906 HMAP_FOR_EACH (port, node, &dp->ports) {
4907 for (int qid = 0; qid < port->n_rxq; qid++) {
4908 struct dp_netdev_rxq *q = &port->rxqs[qid];
4909
4910 if (q->pmd) {
4911 hmapx_add(&busy_threads, q->pmd);
4912 }
4913 }
4914 }
4915
4916 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4917 struct rxq_poll *poll, *poll_next;
4918
4919 ovs_mutex_lock(&pmd->port_mutex);
4920 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4921 if (poll->rxq->pmd != pmd) {
4922 dp_netdev_del_rxq_from_pmd(pmd, poll);
4923
4924 /* This pmd might sleep after this step if it has no rxq
4925 * remaining. Tell it to busy wait for new assignment if it
4926 * has at least one scheduled queue. */
4927 if (hmap_count(&pmd->poll_list) == 0 &&
4928 hmapx_contains(&busy_threads, pmd)) {
4929 atomic_store_relaxed(&pmd->wait_for_reload, true);
4930 }
4931 }
4932 }
4933 ovs_mutex_unlock(&pmd->port_mutex);
4934 }
4935
4936 hmapx_destroy(&busy_threads);
4937
4938 /* Reload affected pmd threads. We must wait for the pmd threads to remove
4939 * the old queues before readding them, otherwise a queue can be polled by
4940 * two threads at the same time. */
4941 reload_affected_pmds(dp);
4942
4943 /* Step 6: Add queues from scheduling, if they're not there already. */
4944 HMAP_FOR_EACH (port, node, &dp->ports) {
4945 if (!netdev_is_pmd(port->netdev)) {
4946 continue;
4947 }
4948
4949 for (int qid = 0; qid < port->n_rxq; qid++) {
4950 struct dp_netdev_rxq *q = &port->rxqs[qid];
4951
4952 if (q->pmd) {
4953 ovs_mutex_lock(&q->pmd->port_mutex);
4954 dp_netdev_add_rxq_to_pmd(q->pmd, q);
4955 ovs_mutex_unlock(&q->pmd->port_mutex);
4956 }
4957 }
4958 }
4959
4960 /* Add every port to the tx cache of every pmd thread, if it's not
4961 * there already and if this pmd has at least one rxq to poll. */
4962 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4963 ovs_mutex_lock(&pmd->port_mutex);
4964 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
4965 HMAP_FOR_EACH (port, node, &dp->ports) {
4966 dp_netdev_add_port_tx_to_pmd(pmd, port);
4967 }
4968 }
4969 ovs_mutex_unlock(&pmd->port_mutex);
4970 }
4971
4972 /* Reload affected pmd threads. */
4973 reload_affected_pmds(dp);
4974
4975 /* Check if PMD Auto LB is to be enabled */
4976 set_pmd_auto_lb(dp);
4977 }
4978
4979 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
4980 static bool
4981 ports_require_restart(const struct dp_netdev *dp)
4982 OVS_REQUIRES(dp->port_mutex)
4983 {
4984 struct dp_netdev_port *port;
4985
4986 HMAP_FOR_EACH (port, node, &dp->ports) {
4987 if (netdev_is_reconf_required(port->netdev)) {
4988 return true;
4989 }
4990 }
4991
4992 return false;
4993 }
4994
4995 /* Calculates the variance of the values stored in array 'a'.  'n' is the
4996  * number of elements in the array to be considered for calculating the
4997  * variance.  Usage example: data array 'a' contains the processing load of
4998  * each pmd and 'n' is the number of PMDs.  It returns the variance in the
4999  * processing load of the PMDs. */
5000 static uint64_t
5001 variance(uint64_t a[], int n)
5002 {
5003 /* Compute mean (average of elements). */
5004 uint64_t sum = 0;
5005 uint64_t mean = 0;
5006 uint64_t sqDiff = 0;
5007
5008 if (!n) {
5009 return 0;
5010 }
5011
5012 for (int i = 0; i < n; i++) {
5013 sum += a[i];
5014 }
5015
5016 if (sum) {
5017 mean = sum / n;
5018
5019 /* Compute sum squared differences with mean. */
5020 for (int i = 0; i < n; i++) {
5021 sqDiff += (a[i] - mean)*(a[i] - mean);
5022 }
5023 }
5024 return (sqDiff ? (sqDiff / n) : 0);
5025 }
5026
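/* A worked example of variance() above, using the same integer arithmetic:
 * two PMD loads of { 90, 10 } give mean 50 and variance
 * ((90-50)^2 + (10-50)^2) / 2 = 1600, while { 50, 50 } gives variance 0.
 * Moving from the first distribution to the second would therefore be a 100%
 * improvement, well above ALB_ACCEPTABLE_IMPROVEMENT. */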
5027
5028 /* Returns the variance in the PMDs usage as part of dry run of rxqs
5029 * assignment to PMDs. */
5030 static bool
5031 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5032 uint32_t num_pmds, uint64_t *predicted_variance)
5033 OVS_REQUIRES(dp->port_mutex)
5034 {
5035 struct dp_netdev_port *port;
5036 struct dp_netdev_pmd_thread *pmd;
5037 struct dp_netdev_rxq **rxqs = NULL;
5038 struct rr_numa *numa = NULL;
5039 struct rr_numa_list rr;
5040 int n_rxqs = 0;
5041 bool ret = false;
5042 uint64_t *pmd_usage;
5043
5044 if (!predicted_variance) {
5045 return ret;
5046 }
5047
5048 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5049
5050 HMAP_FOR_EACH (port, node, &dp->ports) {
5051 if (!netdev_is_pmd(port->netdev)) {
5052 continue;
5053 }
5054
5055 for (int qid = 0; qid < port->n_rxq; qid++) {
5056 struct dp_netdev_rxq *q = &port->rxqs[qid];
5057 uint64_t cycle_hist = 0;
5058
5059 if (q->pmd->isolated) {
5060 continue;
5061 }
5062
5063 if (n_rxqs == 0) {
5064 rxqs = xmalloc(sizeof *rxqs);
5065 } else {
5066 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5067 }
5068
5069 /* Sum the queue intervals and store the cycle history. */
5070 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5071 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5072 }
5073 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5074 cycle_hist);
5075 /* Store the queue. */
5076 rxqs[n_rxqs++] = q;
5077 }
5078 }
5079 if (n_rxqs > 1) {
5080 /* Sort the queues in order of the processing cycles
5081 * they consumed during their last pmd interval. */
5082 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5083 }
5084 rr_numa_list_populate(dp, &rr);
5085
5086 for (int i = 0; i < n_rxqs; i++) {
5087 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5088 numa = rr_numa_list_lookup(&rr, numa_id);
5089 if (!numa) {
5090 /* Abort if cross NUMA polling. */
5091 VLOG_DBG("PMD auto lb dry run."
5092 " Aborting due to cross-numa polling.");
5093 goto cleanup;
5094 }
5095
5096 pmd = rr_numa_get_pmd(numa, true);
5097 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5098 "to be assigned port \'%s\' rx queue %d "
5099 "(measured processing cycles %"PRIu64").",
5100 pmd->core_id, numa_id,
5101 netdev_rxq_get_name(rxqs[i]->rx),
5102 netdev_rxq_get_queue_id(rxqs[i]->rx),
5103 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5104
5105 for (int id = 0; id < num_pmds; id++) {
5106 if (pmd->core_id == core_list[id]) {
5107 /* Add the processing cycles of rxq to pmd polling it. */
5108 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5109 RXQ_CYCLES_PROC_HIST);
5110 }
5111 }
5112 }
5113
5114 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5115 uint64_t total_cycles = 0;
5116
5117 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5118 continue;
5119 }
5120
5121 /* Get the total pmd cycles for an interval. */
5122 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5123 /* Estimate the cycles to cover all intervals. */
5124 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5125 for (int id = 0; id < num_pmds; id++) {
5126 if (pmd->core_id == core_list[id]) {
5127 if (pmd_usage[id]) {
5128 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5129 }
5130 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5131 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5132 }
5133 }
5134 }
5135 *predicted_variance = variance(pmd_usage, num_pmds);
5136 ret = true;
5137
5138 cleanup:
5139 rr_numa_list_destroy(&rr);
5140 free(rxqs);
5141 free(pmd_usage);
5142 return ret;
5143 }
5144
5145 /* Does a dry run of Rxq assignment to PMDs and returns true if it gives
5146  * a better distribution of load across the PMDs. */
5147 static bool
5148 pmd_rebalance_dry_run(struct dp_netdev *dp)
5149 OVS_REQUIRES(dp->port_mutex)
5150 {
5151 struct dp_netdev_pmd_thread *pmd;
5152 uint64_t *curr_pmd_usage;
5153
5154 uint64_t curr_variance;
5155 uint64_t new_variance;
5156 uint64_t improvement = 0;
5157 uint32_t num_pmds;
5158 uint32_t *pmd_corelist;
5159 struct rxq_poll *poll;
5160 bool ret;
5161
5162 num_pmds = cmap_count(&dp->poll_threads);
5163
5164 if (num_pmds > 1) {
5165 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5166 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5167 } else {
5168 return false;
5169 }
5170
5171 num_pmds = 0;
5172 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5173 uint64_t total_cycles = 0;
5174 uint64_t total_proc = 0;
5175
5176 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5177 continue;
5178 }
5179
5180 /* Get the total pmd cycles for an interval. */
5181 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5182 /* Estimate the cycles to cover all intervals. */
5183 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5184
5185 ovs_mutex_lock(&pmd->port_mutex);
5186 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5187 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5188 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5189 }
5190 }
5191 ovs_mutex_unlock(&pmd->port_mutex);
5192
5193 if (total_proc) {
5194 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5195 }
5196
5197 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5198 pmd->core_id, curr_pmd_usage[num_pmds]);
5199
5200 if (atomic_count_get(&pmd->pmd_overloaded)) {
5201 atomic_count_set(&pmd->pmd_overloaded, 0);
5202 }
5203
5204 pmd_corelist[num_pmds] = pmd->core_id;
5205 num_pmds++;
5206 }
5207
5208 curr_variance = variance(curr_pmd_usage, num_pmds);
5209 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5210
5211 if (ret) {
5212 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5213 " Predicted PMD variance: %"PRIu64"",
5214 curr_variance, new_variance);
5215
5216 if (new_variance < curr_variance) {
5217 improvement =
5218 ((curr_variance - new_variance) * 100) / curr_variance;
5219 }
5220 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5221 ret = false;
5222 }
5223 }
5224
5225 free(curr_pmd_usage);
5226 free(pmd_corelist);
5227 return ret;
5228 }
5229
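/* A standalone sketch of the accept/reject decision at the end of
 * pmd_rebalance_dry_run() above: the predicted variance must be lower than
 * the current one, and the relative improvement,
 * (current - predicted) * 100 / current, must reach the configured threshold
 * (ALB_ACCEPTABLE_IMPROVEMENT in this file) before a reassignment is
 * requested. */
#include <stdbool.h>
#include <stdint.h>

static bool
toy_rebalance_worthwhile(uint64_t curr_variance, uint64_t new_variance,
                         uint64_t min_improvement_pct)
{
    if (!curr_variance || new_variance >= curr_variance) {
        return false;
    }
    return (curr_variance - new_variance) * 100 / curr_variance
           >= min_improvement_pct;
}
/* E.g. toy_rebalance_worthwhile(1600, 0, 25) is true, while
 * toy_rebalance_worthwhile(1600, 1400, 25) is false (only 12%). */
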
5230
5231 /* Return true if needs to revalidate datapath flows. */
5232 static bool
5233 dpif_netdev_run(struct dpif *dpif)
5234 {
5235 struct dp_netdev_port *port;
5236 struct dp_netdev *dp = get_dp_netdev(dpif);
5237 struct dp_netdev_pmd_thread *non_pmd;
5238 uint64_t new_tnl_seq;
5239 bool need_to_flush = true;
5240 bool pmd_rebalance = false;
5241 long long int now = time_msec();
5242 struct dp_netdev_pmd_thread *pmd;
5243
5244 ovs_mutex_lock(&dp->port_mutex);
5245 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5246 if (non_pmd) {
5247 ovs_mutex_lock(&dp->non_pmd_mutex);
5248 HMAP_FOR_EACH (port, node, &dp->ports) {
5249 if (!netdev_is_pmd(port->netdev)) {
5250 int i;
5251
5252 if (port->emc_enabled) {
5253 atomic_read_relaxed(&dp->emc_insert_min,
5254 &non_pmd->ctx.emc_insert_min);
5255 } else {
5256 non_pmd->ctx.emc_insert_min = 0;
5257 }
5258
5259 for (i = 0; i < port->n_rxq; i++) {
5260
5261 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5262 continue;
5263 }
5264
5265 if (dp_netdev_process_rxq_port(non_pmd,
5266 &port->rxqs[i],
5267 port->port_no)) {
5268 need_to_flush = false;
5269 }
5270 }
5271 }
5272 }
5273 if (need_to_flush) {
5274             /* We didn't receive anything in the process loop.
5275              * Check if we need to send something.
5276              * There were no time updates in the current iteration. */
5277 pmd_thread_ctx_time_update(non_pmd);
5278 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5279 }
5280
5281 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5282 ovs_mutex_unlock(&dp->non_pmd_mutex);
5283
5284 dp_netdev_pmd_unref(non_pmd);
5285 }
5286
5287 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5288 if (pmd_alb->is_enabled) {
5289 if (!pmd_alb->rebalance_poll_timer) {
5290 pmd_alb->rebalance_poll_timer = now;
5291 } else if ((pmd_alb->rebalance_poll_timer +
5292 pmd_alb->rebalance_intvl) < now) {
5293 pmd_alb->rebalance_poll_timer = now;
5294 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5295 if (atomic_count_get(&pmd->pmd_overloaded) >=
5296 PMD_RXQ_INTERVAL_MAX) {
5297 pmd_rebalance = true;
5298 break;
5299 }
5300 }
5301
5302 if (pmd_rebalance &&
5303 !dp_netdev_is_reconf_required(dp) &&
5304 !ports_require_restart(dp) &&
5305 pmd_rebalance_dry_run(dp)) {
5306 VLOG_INFO("PMD auto lb dry run."
5307 " requesting datapath reconfigure.");
5308 dp_netdev_request_reconfigure(dp);
5309 }
5310 }
5311 }
5312
5313 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5314 reconfigure_datapath(dp);
5315 }
5316 ovs_mutex_unlock(&dp->port_mutex);
5317
5318 tnl_neigh_cache_run();
5319 tnl_port_map_run();
5320 new_tnl_seq = seq_read(tnl_conf_seq);
5321
5322 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5323 dp->last_tnl_conf_seq = new_tnl_seq;
5324 return true;
5325 }
5326 return false;
5327 }
5328
5329 static void
5330 dpif_netdev_wait(struct dpif *dpif)
5331 {
5332 struct dp_netdev_port *port;
5333 struct dp_netdev *dp = get_dp_netdev(dpif);
5334
5335 ovs_mutex_lock(&dp_netdev_mutex);
5336 ovs_mutex_lock(&dp->port_mutex);
5337 HMAP_FOR_EACH (port, node, &dp->ports) {
5338 netdev_wait_reconf_required(port->netdev);
5339 if (!netdev_is_pmd(port->netdev)) {
5340 int i;
5341
5342 for (i = 0; i < port->n_rxq; i++) {
5343 netdev_rxq_wait(port->rxqs[i].rx);
5344 }
5345 }
5346 }
5347 ovs_mutex_unlock(&dp->port_mutex);
5348 ovs_mutex_unlock(&dp_netdev_mutex);
5349 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5350 }
5351
5352 static void
5353 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5354 {
5355 struct tx_port *tx_port_cached;
5356
5357 /* Flush all the queued packets. */
5358 dp_netdev_pmd_flush_output_packets(pmd, true);
5359 /* Free all used tx queue ids. */
5360 dpif_netdev_xps_revalidate_pmd(pmd, true);
5361
5362 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5363 free(tx_port_cached);
5364 }
5365 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5366 free(tx_port_cached);
5367 }
5368 }
5369
5370 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5371 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5372 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5373 * one txq. */
5374 static void
5375 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5376 OVS_REQUIRES(pmd->port_mutex)
5377 {
5378 struct tx_port *tx_port, *tx_port_cached;
5379
5380 pmd_free_cached_ports(pmd);
5381 hmap_shrink(&pmd->send_port_cache);
5382 hmap_shrink(&pmd->tnl_port_cache);
5383
5384 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5385 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5386 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5387 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5388 hash_port_no(tx_port_cached->port->port_no));
5389 }
5390
5391 if (netdev_n_txq(tx_port->port->netdev)) {
5392 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5393 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5394 hash_port_no(tx_port_cached->port->port_no));
5395 }
5396 }
5397 }
5398
5399 static void
5400 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5401 {
5402 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5403 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5404 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5405 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5406 }
5407 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5408
5409 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5410 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5411 }
5412
5413 static void
5414 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5415 {
5416 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5417 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5418 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5419 }
5420
5421 static int
5422 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5423 struct polled_queue **ppoll_list)
5424 {
5425 struct polled_queue *poll_list = *ppoll_list;
5426 struct rxq_poll *poll;
5427 int i;
5428
5429 ovs_mutex_lock(&pmd->port_mutex);
5430 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5431 * sizeof *poll_list);
5432
5433 i = 0;
5434 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5435 poll_list[i].rxq = poll->rxq;
5436 poll_list[i].port_no = poll->rxq->port->port_no;
5437 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5438 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5439 poll_list[i].change_seq =
5440 netdev_get_change_seq(poll->rxq->port->netdev);
5441 i++;
5442 }
5443
5444 pmd_load_cached_ports(pmd);
5445
5446 ovs_mutex_unlock(&pmd->port_mutex);
5447
5448 *ppoll_list = poll_list;
5449 return i;
5450 }
5451
5452 static void *
5453 pmd_thread_main(void *f_)
5454 {
5455 struct dp_netdev_pmd_thread *pmd = f_;
5456 struct pmd_perf_stats *s = &pmd->perf_stats;
5457 unsigned int lc = 0;
5458 struct polled_queue *poll_list;
5459 bool wait_for_reload = false;
5460 bool reload_tx_qid;
5461 bool exiting;
5462 bool reload;
5463 int poll_cnt;
5464 int i;
5465 int process_packets = 0;
5466
5467 poll_list = NULL;
5468
5469 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5470 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5471 ovs_numa_thread_setaffinity_core(pmd->core_id);
5472 dpdk_set_lcore_id(pmd->core_id);
5473 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5474 dfc_cache_init(&pmd->flow_cache);
5475 pmd_alloc_static_tx_qid(pmd);
5476
5477 reload:
5478 atomic_count_init(&pmd->pmd_overloaded, 0);
5479
5480 /* List port/core affinity */
5481 for (i = 0; i < poll_cnt; i++) {
5482 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5483 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5484 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5485 /* Reset the rxq current cycles counter. */
5486 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5487 }
5488
5489 if (!poll_cnt) {
5490 if (wait_for_reload) {
5491 /* Don't sleep, control thread will ask for a reload shortly. */
5492 do {
5493 atomic_read_explicit(&pmd->reload, &reload,
5494 memory_order_acquire);
5495 } while (!reload);
5496 } else {
5497 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5498 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5499 poll_block();
5500 }
5501 }
5502 }
5503
5504 pmd->intrvl_tsc_prev = 0;
5505 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5506 cycles_counter_update(s);
5507 /* Protect pmd stats from external clearing while polling. */
5508 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5509 for (;;) {
5510 uint64_t rx_packets = 0, tx_packets = 0;
5511
5512 pmd_perf_start_iteration(s);
5513
5514 for (i = 0; i < poll_cnt; i++) {
5515
5516 if (!poll_list[i].rxq_enabled) {
5517 continue;
5518 }
5519
5520 if (poll_list[i].emc_enabled) {
5521 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5522 &pmd->ctx.emc_insert_min);
5523 } else {
5524 pmd->ctx.emc_insert_min = 0;
5525 }
5526
5527 process_packets =
5528 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5529 poll_list[i].port_no);
5530 rx_packets += process_packets;
5531 }
5532
5533 if (!rx_packets) {
5534 /* We didn't receive anything in the process loop.
5535 * Check if we need to send something.
5536 * There were no time updates on the current iteration. */
5537 pmd_thread_ctx_time_update(pmd);
5538 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5539 }
5540
5541 if (lc++ > 1024) {
5542 lc = 0;
5543
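/* Periodic housekeeping, roughly every 1024 iterations: clear coverage
 * counters, let the pmd re-prioritize its dpcls subtables, try to quiesce
 * for RCU (doing an incremental EMC sweep instead if that fails), and
 * refresh each rxq's enabled state when its change_seq has moved. */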
5544 coverage_try_clear();
5545 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
5546 if (!ovsrcu_try_quiesce()) {
5547 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
5548 }
5549
5550 for (i = 0; i < poll_cnt; i++) {
5551 uint64_t current_seq =
5552 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5553 if (poll_list[i].change_seq != current_seq) {
5554 poll_list[i].change_seq = current_seq;
5555 poll_list[i].rxq_enabled =
5556 netdev_rxq_enabled(poll_list[i].rxq->rx);
5557 }
5558 }
5559 }
5560
5561 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5562 if (OVS_UNLIKELY(reload)) {
5563 break;
5564 }
5565
5566 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5567 pmd_perf_metrics_enabled(pmd));
5568 }
5569 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
5570
5571 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5572 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
5573 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
5574 atomic_read_relaxed(&pmd->exit, &exiting);
5575 /* Signal here to make sure the pmd finishes
5576 * reloading the updated configuration. */
5577 dp_netdev_pmd_reload_done(pmd);
5578
5579 if (reload_tx_qid) {
5580 pmd_free_static_tx_qid(pmd);
5581 pmd_alloc_static_tx_qid(pmd);
5582 }
5583
5584 if (!exiting) {
5585 goto reload;
5586 }
5587
5588 pmd_free_static_tx_qid(pmd);
5589 dfc_cache_uninit(&pmd->flow_cache);
5590 free(poll_list);
5591 pmd_free_cached_ports(pmd);
5592 return NULL;
5593 }
5594
5595 static void
5596 dp_netdev_disable_upcall(struct dp_netdev *dp)
5597 OVS_ACQUIRES(dp->upcall_rwlock)
5598 {
5599 fat_rwlock_wrlock(&dp->upcall_rwlock);
5600 }
5601
5602 \f
5603 /* Meters */
5604 static void
5605 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5606 struct ofputil_meter_features *features)
5607 {
5608 features->max_meters = MAX_METERS;
5609 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5610 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5611 features->max_bands = MAX_BANDS;
5612 features->max_color = 0;
5613 }
5614
5615 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5616 * that exceed a band are dropped in-place. */
5617 static void
5618 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5619 uint32_t meter_id, long long int now)
5620 {
5621 struct dp_meter *meter;
5622 struct dp_meter_band *band;
5623 struct dp_packet *packet;
5624 long long int long_delta_t; /* msec */
5625 uint32_t delta_t; /* msec */
5626 const size_t cnt = dp_packet_batch_size(packets_);
5627 uint32_t bytes, volume;
5628 int exceeded_band[NETDEV_MAX_BURST];
5629 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5630 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5631
5632 if (meter_id >= MAX_METERS) {
5633 return;
5634 }
5635
5636 meter_lock(dp, meter_id);
5637 meter = dp->meters[meter_id];
5638 if (!meter) {
5639 goto out;
5640 }
5641
5642 /* Initialize as negative values. */
5643 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5644 /* Initialize as zeroes. */
5645 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5646
5647 /* All packets will hit the meter at the same time. */
5648 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
5649
5650 /* Make sure delta_t will not be too large, so that bucket will not
5651 * wrap around below. */
5652 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5653 ? meter->max_delta_t : (uint32_t)long_delta_t;
5654
5655 /* Update meter stats. */
5656 meter->used = now;
5657 meter->packet_count += cnt;
5658 bytes = 0;
5659 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5660 bytes += dp_packet_size(packet);
5661 }
5662 meter->byte_count += bytes;
5663
5664 /* Meters can operate in terms of packets per second or kilobits per
5665 * second. */
5666 if (meter->flags & OFPMF13_PKTPS) {
5667 /* Rate in packets/second, bucket 1/1000 packets. */
5668 /* msec * packets/sec = 1/1000 packets. */
5669 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5670 } else {
5671 /* Rate in kbps, bucket in bits. */
5672 /* msec * kbps = bits */
5673 volume = bytes * 8;
5674 }
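/* For example (illustrative numbers only): a batch of 32 packets drains
 * 32 * 1000 = 32000 units from a packets-per-second bucket, while a batch
 * totalling 4000 bytes drains 4000 * 8 = 32000 bits from a kbps bucket. */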
5675
5676 /* Update all bands and find the one hit with the highest rate for each
5677 * packet (if any). */
5678 for (int m = 0; m < meter->n_bands; ++m) {
5679 band = &meter->bands[m];
5680
5681 /* Update band's bucket. */
5682 band->bucket += delta_t * band->up.rate;
5683 if (band->bucket > band->up.burst_size) {
5684 band->bucket = band->up.burst_size;
5685 }
5686
5687 /* Drain the bucket for all the packets, if possible. */
5688 if (band->bucket >= volume) {
5689 band->bucket -= volume;
5690 } else {
5691 int band_exceeded_pkt;
5692
5693 /* Band limit hit, must process packet-by-packet. */
5694 if (meter->flags & OFPMF13_PKTPS) {
5695 band_exceeded_pkt = band->bucket / 1000;
5696 band->bucket %= 1000; /* Remainder stays in bucket. */
5697
5698 /* Update the exceeding band for each exceeding packet.
5699 * (Only one band will be fired by a packet, and that
5700 * can be different for each packet.) */
5701 for (int i = band_exceeded_pkt; i < cnt; i++) {
5702 if (band->up.rate > exceeded_rate[i]) {
5703 exceeded_rate[i] = band->up.rate;
5704 exceeded_band[i] = m;
5705 }
5706 }
5707 } else {
5708 /* Packet sizes differ, must process one-by-one. */
5709 band_exceeded_pkt = cnt;
5710 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5711 uint32_t bits = dp_packet_size(packet) * 8;
5712
5713 if (band->bucket >= bits) {
5714 band->bucket -= bits;
5715 } else {
5716 if (i < band_exceeded_pkt) {
5717 band_exceeded_pkt = i;
5718 }
5719 /* Update the exceeding band for the exceeding packet.
5720 * (Only one band will be fired by a packet, and that
5721 * can be different for each packet.) */
5722 if (band->up.rate > exceeded_rate[i]) {
5723 exceeded_rate[i] = band->up.rate;
5724 exceeded_band[i] = m;
5725 }
5726 }
5727 }
5728 }
5729 /* Remember the first exceeding packet. */
5730 if (exceeded_pkt > band_exceeded_pkt) {
5731 exceeded_pkt = band_exceeded_pkt;
5732 }
5733 }
5734 }
5735
5736 /* Fire the highest rate band exceeded by each packet, and drop
5737 * packets if needed. */
5738 size_t j;
5739 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
5740 if (exceeded_band[j] >= 0) {
5741 /* Meter drop packet. */
5742 band = &meter->bands[exceeded_band[j]];
5743 band->packet_count += 1;
5744 band->byte_count += dp_packet_size(packet);
5745
5746 dp_packet_delete(packet);
5747 } else {
5748 /* Meter accepts packet. */
5749 dp_packet_batch_refill(packets_, packet, j);
5750 }
5751 }
5752 out:
5753 meter_unlock(dp, meter_id);
5754 }
5755
5756 /* Meter set/get/del processing is still single-threaded. */
5757 static int
5758 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
5759 struct ofputil_meter_config *config)
5760 {
5761 struct dp_netdev *dp = get_dp_netdev(dpif);
5762 uint32_t mid = meter_id.uint32;
5763 struct dp_meter *meter;
5764 int i;
5765
5766 if (mid >= MAX_METERS) {
5767 return EFBIG; /* Meter_id out of range. */
5768 }
5769
5770 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
5771 return EBADF; /* Unsupported flags set */
5772 }
5773
5774 if (config->n_bands > MAX_BANDS) {
5775 return EINVAL;
5776 }
5777
5778 for (i = 0; i < config->n_bands; ++i) {
5779 switch (config->bands[i].type) {
5780 case OFPMBT13_DROP:
5781 break;
5782 default:
5783 return ENODEV; /* Unsupported band type */
5784 }
5785 }
5786
5787 /* Allocate meter */
5788 meter = xzalloc(sizeof *meter
5789 + config->n_bands * sizeof(struct dp_meter_band));
5790
5791 meter->flags = config->flags;
5792 meter->n_bands = config->n_bands;
5793 meter->max_delta_t = 0;
5794 meter->used = time_usec();
5795
5796 /* Set up the bands. */
5797 for (i = 0; i < config->n_bands; ++i) {
5798 uint32_t band_max_delta_t;
5799
5800 /* Set burst size to a workable value if none specified. */
5801 if (config->bands[i].burst_size == 0) {
5802 config->bands[i].burst_size = config->bands[i].rate;
5803 }
5804
5805 meter->bands[i].up = config->bands[i];
5806 /* Convert burst size to the bucket units: */
5807 /* pkts => 1/1000 packets, kilobits => bits. */
5808 meter->bands[i].up.burst_size *= 1000;
5809 /* Initialize bucket to empty. */
5810 meter->bands[i].bucket = 0;
5811
5812 /* Figure out max delta_t that is enough to fill any bucket. */
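/* E.g., assuming a 500 kbps band with a 1000 kb burst (a bucket of
 * 1,000,000 bits after conversion), an empty bucket refills in
 * 1000000 / 500 = 2000 ms. */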
5813 band_max_delta_t
5814 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5815 if (band_max_delta_t > meter->max_delta_t) {
5816 meter->max_delta_t = band_max_delta_t;
5817 }
5818 }
5819
5820 meter_lock(dp, mid);
5821 dp_delete_meter(dp, mid); /* Free existing meter, if any */
5822 dp->meters[mid] = meter;
5823 meter_unlock(dp, mid);
5824
5825 return 0;
5826 }
5827
5828 static int
5829 dpif_netdev_meter_get(const struct dpif *dpif,
5830 ofproto_meter_id meter_id_,
5831 struct ofputil_meter_stats *stats, uint16_t n_bands)
5832 {
5833 const struct dp_netdev *dp = get_dp_netdev(dpif);
5834 uint32_t meter_id = meter_id_.uint32;
5835 int retval = 0;
5836
5837 if (meter_id >= MAX_METERS) {
5838 return EFBIG;
5839 }
5840
5841 meter_lock(dp, meter_id);
5842 const struct dp_meter *meter = dp->meters[meter_id];
5843 if (!meter) {
5844 retval = ENOENT;
5845 goto done;
5846 }
5847 if (stats) {
5848 int i = 0;
5849
5850 stats->packet_in_count = meter->packet_count;
5851 stats->byte_in_count = meter->byte_count;
5852
5853 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5854 stats->bands[i].packet_count = meter->bands[i].packet_count;
5855 stats->bands[i].byte_count = meter->bands[i].byte_count;
5856 }
5857
5858 stats->n_bands = i;
5859 }
5860
5861 done:
5862 meter_unlock(dp, meter_id);
5863 return retval;
5864 }
5865
5866 static int
5867 dpif_netdev_meter_del(struct dpif *dpif,
5868 ofproto_meter_id meter_id_,
5869 struct ofputil_meter_stats *stats, uint16_t n_bands)
5870 {
5871 struct dp_netdev *dp = get_dp_netdev(dpif);
5872 int error;
5873
5874 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5875 if (!error) {
5876 uint32_t meter_id = meter_id_.uint32;
5877
5878 meter_lock(dp, meter_id);
5879 dp_delete_meter(dp, meter_id);
5880 meter_unlock(dp, meter_id);
5881 }
5882 return error;
5883 }
5884
5885 \f
5886 static void
5887 dpif_netdev_disable_upcall(struct dpif *dpif)
5888 OVS_NO_THREAD_SAFETY_ANALYSIS
5889 {
5890 struct dp_netdev *dp = get_dp_netdev(dpif);
5891 dp_netdev_disable_upcall(dp);
5892 }
5893
5894 static void
5895 dp_netdev_enable_upcall(struct dp_netdev *dp)
5896 OVS_RELEASES(dp->upcall_rwlock)
5897 {
5898 fat_rwlock_unlock(&dp->upcall_rwlock);
5899 }
5900
5901 static void
5902 dpif_netdev_enable_upcall(struct dpif *dpif)
5903 OVS_NO_THREAD_SAFETY_ANALYSIS
5904 {
5905 struct dp_netdev *dp = get_dp_netdev(dpif);
5906 dp_netdev_enable_upcall(dp);
5907 }
5908
5909 static void
5910 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
5911 {
5912 atomic_store_relaxed(&pmd->wait_for_reload, false);
5913 atomic_store_relaxed(&pmd->reload_tx_qid, false);
5914 pmd->last_reload_seq = seq_read(pmd->reload_seq);
5915 atomic_store_explicit(&pmd->reload, false, memory_order_release);
5916 }
5917
5918 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
5919 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
5920 * 'core_id' is NON_PMD_CORE_ID).
5921 *
5922 * The caller must unref the returned reference. */
5923 static struct dp_netdev_pmd_thread *
5924 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
5925 {
5926 struct dp_netdev_pmd_thread *pmd;
5927 const struct cmap_node *pnode;
5928
5929 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
5930 if (!pnode) {
5931 return NULL;
5932 }
5933 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
5934
5935 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
5936 }
5937
5938 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
5939 static void
5940 dp_netdev_set_nonpmd(struct dp_netdev *dp)
5941 OVS_REQUIRES(dp->port_mutex)
5942 {
5943 struct dp_netdev_pmd_thread *non_pmd;
5944
5945 non_pmd = xzalloc(sizeof *non_pmd);
5946 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
5947 }
5948
5949 /* Caller must have valid pointer to 'pmd'. */
5950 static bool
5951 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5952 {
5953 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5954 }
5955
5956 static void
5957 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5958 {
5959 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5960 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5961 }
5962 }
5963
5964 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
5965 * fails, keeps checking the next node until reaching the end of the cmap.
5966 *
5967 * The caller must unref the returned reference. */
5968 static struct dp_netdev_pmd_thread *
5969 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5970 {
5971 struct dp_netdev_pmd_thread *next;
5972
5973 do {
5974 struct cmap_node *node;
5975
5976 node = cmap_next_position(&dp->poll_threads, pos);
5977 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5978 : NULL;
5979 } while (next && !dp_netdev_pmd_try_ref(next));
5980
5981 return next;
5982 }
5983
5984 /* Configures the 'pmd' based on the input argument. */
5985 static void
5986 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
5987 unsigned core_id, int numa_id)
5988 {
5989 pmd->dp = dp;
5990 pmd->core_id = core_id;
5991 pmd->numa_id = numa_id;
5992 pmd->need_reload = false;
5993 pmd->n_output_batches = 0;
5994
5995 ovs_refcount_init(&pmd->ref_cnt);
5996 atomic_init(&pmd->exit, false);
5997 pmd->reload_seq = seq_create();
5998 pmd->last_reload_seq = seq_read(pmd->reload_seq);
5999 atomic_init(&pmd->reload, false);
6000 ovs_mutex_init(&pmd->flow_mutex);
6001 ovs_mutex_init(&pmd->port_mutex);
6002 cmap_init(&pmd->flow_table);
6003 cmap_init(&pmd->classifiers);
6004 pmd->ctx.last_rxq = NULL;
6005 pmd_thread_ctx_time_update(pmd);
6006 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6007 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6008 hmap_init(&pmd->poll_list);
6009 hmap_init(&pmd->tx_ports);
6010 hmap_init(&pmd->tnl_port_cache);
6011 hmap_init(&pmd->send_port_cache);
6012 /* Initialize the 'flow_cache' here since there is no
6013 * actual thread created for NON_PMD_CORE_ID. */
6014 if (core_id == NON_PMD_CORE_ID) {
6015 dfc_cache_init(&pmd->flow_cache);
6016 pmd_alloc_static_tx_qid(pmd);
6017 }
6018 pmd_perf_stats_init(&pmd->perf_stats);
6019 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6020 hash_int(core_id, 0));
6021 }
6022
6023 static void
6024 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6025 {
6026 struct dpcls *cls;
6027
6028 dp_netdev_pmd_flow_flush(pmd);
6029 hmap_destroy(&pmd->send_port_cache);
6030 hmap_destroy(&pmd->tnl_port_cache);
6031 hmap_destroy(&pmd->tx_ports);
6032 hmap_destroy(&pmd->poll_list);
6033 /* All flows (including their dpcls_rules) have been deleted already */
6034 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6035 dpcls_destroy(cls);
6036 ovsrcu_postpone(free, cls);
6037 }
6038 cmap_destroy(&pmd->classifiers);
6039 cmap_destroy(&pmd->flow_table);
6040 ovs_mutex_destroy(&pmd->flow_mutex);
6041 seq_destroy(pmd->reload_seq);
6042 ovs_mutex_destroy(&pmd->port_mutex);
6043 free(pmd);
6044 }
6045
6046 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6047 * and unrefs the struct. */
6048 static void
6049 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6050 {
6051 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6052 * but extra cleanup is necessary */
6053 if (pmd->core_id == NON_PMD_CORE_ID) {
6054 ovs_mutex_lock(&dp->non_pmd_mutex);
6055 dfc_cache_uninit(&pmd->flow_cache);
6056 pmd_free_cached_ports(pmd);
6057 pmd_free_static_tx_qid(pmd);
6058 ovs_mutex_unlock(&dp->non_pmd_mutex);
6059 } else {
6060 atomic_store_relaxed(&pmd->exit, true);
6061 dp_netdev_reload_pmd__(pmd);
6062 xpthread_join(pmd->thread, NULL);
6063 }
6064
6065 dp_netdev_pmd_clear_ports(pmd);
6066
6067 /* Purges the 'pmd''s flows after stopping the thread, but before
6068 * destroying the flows, so that the flow stats can be collected. */
6069 if (dp->dp_purge_cb) {
6070 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6071 }
6072 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6073 dp_netdev_pmd_unref(pmd);
6074 }
6075
6076 /* Destroys all pmd threads. If 'non_pmd' is true, it also destroys the
6077 * non-pmd thread. */
6078 static void
6079 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6080 {
6081 struct dp_netdev_pmd_thread *pmd;
6082 struct dp_netdev_pmd_thread **pmd_list;
6083 size_t k = 0, n_pmds;
6084
6085 n_pmds = cmap_count(&dp->poll_threads);
6086 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6087
6088 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6089 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6090 continue;
6091 }
6092 /* We cannot call dp_netdev_del_pmd(), since it alters
6093 * 'dp->poll_threads' (while we're iterating it) and it
6094 * might quiesce. */
6095 ovs_assert(k < n_pmds);
6096 pmd_list[k++] = pmd;
6097 }
6098
6099 for (size_t i = 0; i < k; i++) {
6100 dp_netdev_del_pmd(dp, pmd_list[i]);
6101 }
6102 free(pmd_list);
6103 }
6104
6105 /* Deletes all rx queues from pmd->poll_list and all the ports from
6106 * pmd->tx_ports. */
6107 static void
6108 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6109 {
6110 struct rxq_poll *poll;
6111 struct tx_port *port;
6112
6113 ovs_mutex_lock(&pmd->port_mutex);
6114 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6115 free(poll);
6116 }
6117 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6118 free(port);
6119 }
6120 ovs_mutex_unlock(&pmd->port_mutex);
6121 }
6122
6123 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6124 static void
6125 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6126 struct dp_netdev_rxq *rxq)
6127 OVS_REQUIRES(pmd->port_mutex)
6128 {
6129 int qid = netdev_rxq_get_queue_id(rxq->rx);
6130 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6131 struct rxq_poll *poll;
6132
6133 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6134 if (poll->rxq == rxq) {
6135 /* 'rxq' is already polled by this thread. Do nothing. */
6136 return;
6137 }
6138 }
6139
6140 poll = xmalloc(sizeof *poll);
6141 poll->rxq = rxq;
6142 hmap_insert(&pmd->poll_list, &poll->node, hash);
6143
6144 pmd->need_reload = true;
6145 }
6146
6147 /* Delete 'poll' from poll_list of PMD thread. */
6148 static void
6149 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6150 struct rxq_poll *poll)
6151 OVS_REQUIRES(pmd->port_mutex)
6152 {
6153 hmap_remove(&pmd->poll_list, &poll->node);
6154 free(poll);
6155
6156 pmd->need_reload = true;
6157 }
6158
6159 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6160 * changes to take effect. */
6161 static void
6162 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6163 struct dp_netdev_port *port)
6164 OVS_REQUIRES(pmd->port_mutex)
6165 {
6166 struct tx_port *tx;
6167
6168 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6169 if (tx) {
6170 /* 'port' is already in this thread's tx cache. Do nothing. */
6171 return;
6172 }
6173
6174 tx = xzalloc(sizeof *tx);
6175
6176 tx->port = port;
6177 tx->qid = -1;
6178 tx->flush_time = 0LL;
6179 dp_packet_batch_init(&tx->output_pkts);
6180
6181 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6182 pmd->need_reload = true;
6183 }
6184
6185 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6186 * changes to take effect. */
6187 static void
6188 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6189 struct tx_port *tx)
6190 OVS_REQUIRES(pmd->port_mutex)
6191 {
6192 hmap_remove(&pmd->tx_ports, &tx->node);
6193 free(tx);
6194 pmd->need_reload = true;
6195 }
6196 \f
6197 static char *
6198 dpif_netdev_get_datapath_version(void)
6199 {
6200 return xstrdup("<built-in>");
6201 }
6202
6203 static void
6204 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6205 uint16_t tcp_flags, long long now)
6206 {
6207 uint16_t flags;
6208
6209 atomic_store_relaxed(&netdev_flow->stats.used, now);
6210 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6211 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6212 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6213 flags |= tcp_flags;
6214 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6215 }
6216
6217 static int
6218 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6219 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6220 enum dpif_upcall_type type, const struct nlattr *userdata,
6221 struct ofpbuf *actions, struct ofpbuf *put_actions)
6222 {
6223 struct dp_netdev *dp = pmd->dp;
6224
6225 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6226 return ENODEV;
6227 }
6228
6229 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6230 struct ds ds = DS_EMPTY_INITIALIZER;
6231 char *packet_str;
6232 struct ofpbuf key;
6233 struct odp_flow_key_parms odp_parms = {
6234 .flow = flow,
6235 .mask = wc ? &wc->masks : NULL,
6236 .support = dp_netdev_support,
6237 };
6238
6239 ofpbuf_init(&key, 0);
6240 odp_flow_key_from_flow(&odp_parms, &key);
6241 packet_str = ofp_dp_packet_to_string(packet_);
6242
6243 odp_flow_key_format(key.data, key.size, &ds);
6244
6245 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6246 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6247
6248 ofpbuf_uninit(&key);
6249 free(packet_str);
6250
6251 ds_destroy(&ds);
6252 }
6253
6254 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6255 actions, wc, put_actions, dp->upcall_aux);
6256 }
6257
6258 static inline uint32_t
6259 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6260 const struct miniflow *mf)
6261 {
6262 uint32_t hash;
6263
6264 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6265 hash = dp_packet_get_rss_hash(packet);
6266 } else {
6267 hash = miniflow_hash_5tuple(mf, 0);
6268 dp_packet_set_rss_hash(packet, hash);
6269 }
6270
6271 return hash;
6272 }
6273
6274 static inline uint32_t
6275 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6276 const struct miniflow *mf)
6277 {
6278 uint32_t hash, recirc_depth;
6279
6280 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6281 hash = dp_packet_get_rss_hash(packet);
6282 } else {
6283 hash = miniflow_hash_5tuple(mf, 0);
6284 dp_packet_set_rss_hash(packet, hash);
6285 }
6286
6287 /* The RSS hash must account for the recirculation depth to avoid
6288 * collisions in the exact match cache. */
6289 recirc_depth = *recirc_depth_get_unsafe();
6290 if (OVS_UNLIKELY(recirc_depth)) {
6291 hash = hash_finish(hash, recirc_depth);
6292 dp_packet_set_rss_hash(packet, hash);
6293 }
6294 return hash;
6295 }
6296
6297 struct packet_batch_per_flow {
6298 unsigned int byte_count;
6299 uint16_t tcp_flags;
6300 struct dp_netdev_flow *flow;
6301
6302 struct dp_packet_batch array;
6303 };
6304
6305 static inline void
6306 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6307 struct dp_packet *packet,
6308 uint16_t tcp_flags)
6309 {
6310 batch->byte_count += dp_packet_size(packet);
6311 batch->tcp_flags |= tcp_flags;
6312 batch->array.packets[batch->array.count++] = packet;
6313 }
6314
6315 static inline void
6316 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6317 struct dp_netdev_flow *flow)
6318 {
6319 flow->batch = batch;
6320
6321 batch->flow = flow;
6322 dp_packet_batch_init(&batch->array);
6323 batch->byte_count = 0;
6324 batch->tcp_flags = 0;
6325 }
6326
6327 static inline void
6328 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6329 struct dp_netdev_pmd_thread *pmd)
6330 {
6331 struct dp_netdev_actions *actions;
6332 struct dp_netdev_flow *flow = batch->flow;
6333
6334 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
6335 batch->tcp_flags, pmd->ctx.now / 1000);
6336
6337 actions = dp_netdev_flow_get_actions(flow);
6338
6339 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6340 actions->actions, actions->size);
6341 }
6342
6343 static inline void
6344 dp_netdev_queue_batches(struct dp_packet *pkt,
6345 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6346 struct packet_batch_per_flow *batches,
6347 size_t *n_batches)
6348 {
6349 struct packet_batch_per_flow *batch = flow->batch;
6350
6351 if (OVS_UNLIKELY(!batch)) {
6352 batch = &batches[(*n_batches)++];
6353 packet_batch_per_flow_init(batch, flow);
6354 }
6355
6356 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6357 }
6358
6359 static inline void
6360 packet_enqueue_to_flow_map(struct dp_packet *packet,
6361 struct dp_netdev_flow *flow,
6362 uint16_t tcp_flags,
6363 struct dp_packet_flow_map *flow_map,
6364 size_t index)
6365 {
6366 struct dp_packet_flow_map *map = &flow_map[index];
6367 map->flow = flow;
6368 map->packet = packet;
6369 map->tcp_flags = tcp_flags;
6370 }
6371
6372 /* SMC lookup function for a batch of packets.
6373 * By batching SMC lookups, we can use prefetching
6374 * to hide memory access latency.
6375 */
6376 static inline void
6377 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6378 struct netdev_flow_key *keys,
6379 struct netdev_flow_key **missed_keys,
6380 struct dp_packet_batch *packets_,
6381 const int cnt,
6382 struct dp_packet_flow_map *flow_map,
6383 uint8_t *index_map)
6384 {
6385 int i;
6386 struct dp_packet *packet;
6387 size_t n_smc_hit = 0, n_missed = 0;
6388 struct dfc_cache *cache = &pmd->flow_cache;
6389 struct smc_cache *smc_cache = &cache->smc_cache;
6390 const struct cmap_node *flow_node;
6391 int recv_idx;
6392 uint16_t tcp_flags;
6393
6394 /* Prefetch buckets for all packets */
6395 for (i = 0; i < cnt; i++) {
6396 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6397 }
6398
6399 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6400 struct dp_netdev_flow *flow = NULL;
6401 flow_node = smc_entry_get(pmd, keys[i].hash);
6402 bool hit = false;
6403 /* Get the original order of this packet in the received batch. */
6404 recv_idx = index_map[i];
6405
6406 if (OVS_LIKELY(flow_node != NULL)) {
6407 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6408 /* Since we don't have a per-port megaflow to check the port
6409 * number, we need to verify that the input ports match. */
6410 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6411 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6412 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6413
6414 /* SMC hit and EMC miss, so insert into the EMC. */
6415 keys[i].len =
6416 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6417 emc_probabilistic_insert(pmd, &keys[i], flow);
6418 /* Add these packets into the flow map in the same order
6419 * as received.
6420 */
6421 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6422 flow_map, recv_idx);
6423 n_smc_hit++;
6424 hit = true;
6425 break;
6426 }
6427 }
6428 if (hit) {
6429 continue;
6430 }
6431 }
6432
6433 /* SMC missed. Group missed packets together at
6434 * the beginning of the 'packets' array. */
6435 dp_packet_batch_refill(packets_, packet, i);
6436
6437 /* Preserve the order of packet for flow batching. */
6438 index_map[n_missed] = recv_idx;
6439
6440 /* Put missed keys into the pointer array returned to the caller. */
6441 missed_keys[n_missed++] = &keys[i];
6442 }
6443
6444 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6445 }
6446
6447 /* Tries to process all of the 'cnt' packets in 'packets_' using only the
6448 * datapath flow cache 'pmd->flow_cache'. If a flow is not found for a packet
6449 * 'packets[i]', the miniflow is copied into 'keys' and the packet pointer is
6450 * moved to the beginning of the 'packets' array. The pointers of missed keys
6451 * are put in the 'missed_keys' pointer array for future processing.
6452 *
6453 * The function returns the number of packets that need to be processed in the
6454 * 'packets' array (they have been moved to the beginning of the vector).
6455 *
6456 * For performance reasons a caller may choose not to initialize the metadata
6457 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6458 * is not valid and must be initialized by this function using 'port_no'.
6459 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6460 * will be ignored.
6461 */
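/* Lookup order: a flow mark carried by the packet (if any), then the EMC,
 * then a batched SMC lookup when enabled; whatever still misses is handed
 * to the dpcls megaflow classifier in fast_path_processing(). */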
6462 static inline size_t
6463 dfc_processing(struct dp_netdev_pmd_thread *pmd,
6464 struct dp_packet_batch *packets_,
6465 struct netdev_flow_key *keys,
6466 struct netdev_flow_key **missed_keys,
6467 struct packet_batch_per_flow batches[], size_t *n_batches,
6468 struct dp_packet_flow_map *flow_map,
6469 size_t *n_flows, uint8_t *index_map,
6470 bool md_is_valid, odp_port_t port_no)
6471 {
6472 struct netdev_flow_key *key = &keys[0];
6473 size_t n_missed = 0, n_emc_hit = 0;
6474 struct dfc_cache *cache = &pmd->flow_cache;
6475 struct dp_packet *packet;
6476 const size_t cnt = dp_packet_batch_size(packets_);
6477 uint32_t cur_min = pmd->ctx.emc_insert_min;
6478 int i;
6479 uint16_t tcp_flags;
6480 bool smc_enable_db;
6481 size_t map_cnt = 0;
6482 bool batch_enable = true;
6483
6484 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
6485 pmd_perf_update_counter(&pmd->perf_stats,
6486 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6487 cnt);
6488
6489 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6490 struct dp_netdev_flow *flow;
6491 uint32_t mark;
6492
6493 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6494 dp_packet_delete(packet);
6495 continue;
6496 }
6497
6498 if (i != cnt - 1) {
6499 struct dp_packet **packets = packets_->packets;
6500 /* Prefetch next packet data and metadata. */
6501 OVS_PREFETCH(dp_packet_data(packets[i+1]));
6502 pkt_metadata_prefetch_init(&packets[i+1]->md);
6503 }
6504
6505 if (!md_is_valid) {
6506 pkt_metadata_init(&packet->md, port_no);
6507 }
6508
6509 if ((*recirc_depth_get() == 0) &&
6510 dp_packet_has_flow_mark(packet, &mark)) {
6511 flow = mark_to_flow_find(pmd, mark);
6512 if (OVS_LIKELY(flow)) {
6513 tcp_flags = parse_tcp_flags(packet);
6514 if (OVS_LIKELY(batch_enable)) {
6515 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6516 n_batches);
6517 } else {
6518 /* Flow batching should be performed only after fast-path
6519 * processing is also completed for packets with an EMC miss,
6520 * or else it would result in reordering of packets with the
6521 * same datapath flow. */
6522 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6523 flow_map, map_cnt++);
6524 }
6525 continue;
6526 }
6527 }
6528
6529 miniflow_extract(packet, &key->mf);
6530 key->len = 0; /* Not computed yet. */
6531 key->hash =
6532 (md_is_valid == false)
6533 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6534 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6535
6536 /* If the EMC is disabled, skip emc_lookup. */
6537 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
6538 if (OVS_LIKELY(flow)) {
6539 tcp_flags = miniflow_get_tcp_flags(&key->mf);
6540 n_emc_hit++;
6541 if (OVS_LIKELY(batch_enable)) {
6542 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6543 n_batches);
6544 } else {
6545 /* Flow batching should be performed only after fast-path
6546 * processing is also completed for packets with an EMC miss,
6547 * or else it would result in reordering of packets with the
6548 * same datapath flow. */
6549 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6550 flow_map, map_cnt++);
6551 }
6552 } else {
6553 /* Exact match cache missed. Group missed packets together at
6554 * the beginning of the 'packets' array. */
6555 dp_packet_batch_refill(packets_, packet, i);
6556
6557 /* Preserve the order of packet for flow batching. */
6558 index_map[n_missed] = map_cnt;
6559 flow_map[map_cnt++].flow = NULL;
6560
6561 /* 'keys[n_missed]' contains the key of the current packet and it
6562 * will be passed to the SMC lookup. The next key should be extracted
6563 * to 'keys[n_missed + 1]'.
6564 * We also maintain a pointer array of keys that missed both the SMC and
6565 * the EMC, which will be returned to the caller for future processing. */
6566 missed_keys[n_missed] = key;
6567 key = &keys[++n_missed];
6568
6569 /* Skip batching for subsequent packets to avoid reordering. */
6570 batch_enable = false;
6571 }
6572 }
6573 /* Count of packets which are not flow batched. */
6574 *n_flows = map_cnt;
6575
6576 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6577
6578 if (!smc_enable_db) {
6579 return dp_packet_batch_size(packets_);
6580 }
6581
6582 /* Packets that missed the EMC will do a batch lookup in the SMC, if enabled. */
6583 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6584 n_missed, flow_map, index_map);
6585
6586 return dp_packet_batch_size(packets_);
6587 }
6588
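/* Handles one packet that missed every cache: performs a flow-miss upcall,
 * executes the returned actions on the packet and, unless the upcall ran out
 * of space (ENOSPC), installs the resulting flow and feeds it into the SMC
 * and EMC so that later packets take a cheaper path. */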
6589 static inline int
6590 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6591 struct dp_packet *packet,
6592 const struct netdev_flow_key *key,
6593 struct ofpbuf *actions, struct ofpbuf *put_actions)
6594 {
6595 struct ofpbuf *add_actions;
6596 struct dp_packet_batch b;
6597 struct match match;
6598 ovs_u128 ufid;
6599 int error;
6600 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
6601
6602 match.tun_md.valid = false;
6603 miniflow_expand(&key->mf, &match.flow);
6604
6605 ofpbuf_clear(actions);
6606 ofpbuf_clear(put_actions);
6607
6608 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
6609 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6610 &ufid, DPIF_UC_MISS, NULL, actions,
6611 put_actions);
6612 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6613 dp_packet_delete(packet);
6614 return error;
6615 }
6616
6617 /* The Netlink encoding of datapath flow keys cannot express
6618 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6619 * tag is interpreted as exact match on the fact that there is no
6620 * VLAN. Unless we refactor a lot of code that translates between
6621 * Netlink and struct flow representations, we have to do the same
6622 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
6623 if (!match.wc.masks.vlans[0].tci) {
6624 match.wc.masks.vlans[0].tci = htons(0xffff);
6625 }
6626
6627 /* We can't allow the packet batching in the next loop to execute
6628 * the actions. Otherwise, if there are any slow path actions,
6629 * we'll send the packet up twice. */
6630 dp_packet_batch_init_packet(&b, packet);
6631 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
6632 actions->data, actions->size);
6633
6634 add_actions = put_actions->size ? put_actions : actions;
6635 if (OVS_LIKELY(error != ENOSPC)) {
6636 struct dp_netdev_flow *netdev_flow;
6637
6638 /* XXX: There's a race window where a flow covering this packet
6639 * could have already been installed since we last did the flow
6640 * lookup before upcall. This could be solved by moving the
6641 * mutex lock outside the loop, but that's an awful long time
6642 * to be locking revalidators out of making flow modifications. */
6643 ovs_mutex_lock(&pmd->flow_mutex);
6644 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
6645 if (OVS_LIKELY(!netdev_flow)) {
6646 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6647 add_actions->data,
6648 add_actions->size);
6649 }
6650 ovs_mutex_unlock(&pmd->flow_mutex);
6651 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6652 smc_insert(pmd, key, hash);
6653 emc_probabilistic_insert(pmd, key, netdev_flow);
6654 }
6655 if (pmd_perf_metrics_enabled(pmd)) {
6656 /* Update upcall stats. */
6657 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6658 struct pmd_perf_stats *s = &pmd->perf_stats;
6659 s->current.upcalls++;
6660 s->current.upcall_cycles += cycles;
6661 histogram_add_sample(&s->cycles_per_upcall, cycles);
6662 }
6663 return error;
6664 }
6665
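/* Classifies packets that missed the EMC/SMC against the dpcls of their
 * input port. Packets that also miss the dpcls trigger upcalls (or are
 * dropped when upcalls are disabled); each matched packet is added to
 * 'flow_map' in its original receive order. */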
6666 static inline void
6667 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
6668 struct dp_packet_batch *packets_,
6669 struct netdev_flow_key **keys,
6670 struct dp_packet_flow_map *flow_map,
6671 uint8_t *index_map,
6672 odp_port_t in_port)
6673 {
6674 const size_t cnt = dp_packet_batch_size(packets_);
6675 #if !defined(__CHECKER__) && !defined(_WIN32)
6676 const size_t PKT_ARRAY_SIZE = cnt;
6677 #else
6678 /* Sparse or MSVC doesn't like variable length array. */
6679 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6680 #endif
6681 struct dp_packet *packet;
6682 struct dpcls *cls;
6683 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
6684 struct dp_netdev *dp = pmd->dp;
6685 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
6686 int lookup_cnt = 0, add_lookup_cnt;
6687 bool any_miss;
6688
6689 for (size_t i = 0; i < cnt; i++) {
6690 /* Key length is needed in all cases; the hash is computed on demand. */
6691 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
6692 }
6693 /* Get the classifier for the in_port */
6694 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6695 if (OVS_LIKELY(cls)) {
6696 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6697 rules, cnt, &lookup_cnt);
6698 } else {
6699 any_miss = true;
6700 memset(rules, 0, sizeof(rules));
6701 }
6702 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6703 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6704 struct ofpbuf actions, put_actions;
6705
6706 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6707 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6708
6709 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6710 struct dp_netdev_flow *netdev_flow;
6711
6712 if (OVS_LIKELY(rules[i])) {
6713 continue;
6714 }
6715
6716 /* It's possible that an earlier slow path execution installed
6717 * a rule covering this flow. In this case, it's a lot cheaper
6718 * to catch it here than execute a miss. */
6719 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
6720 &add_lookup_cnt);
6721 if (netdev_flow) {
6722 lookup_cnt += add_lookup_cnt;
6723 rules[i] = &netdev_flow->cr;
6724 continue;
6725 }
6726
6727 int error = handle_packet_upcall(pmd, packet, keys[i],
6728 &actions, &put_actions);
6729
6730 if (OVS_UNLIKELY(error)) {
6731 upcall_fail_cnt++;
6732 } else {
6733 upcall_ok_cnt++;
6734 }
6735 }
6736
6737 ofpbuf_uninit(&actions);
6738 ofpbuf_uninit(&put_actions);
6739 fat_rwlock_unlock(&dp->upcall_rwlock);
6740 } else if (OVS_UNLIKELY(any_miss)) {
6741 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6742 if (OVS_UNLIKELY(!rules[i])) {
6743 dp_packet_delete(packet);
6744 upcall_fail_cnt++;
6745 }
6746 }
6747 }
6748
6749 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6750 struct dp_netdev_flow *flow;
6751 /* Get the original order of this packet in the received batch. */
6752 int recv_idx = index_map[i];
6753 uint16_t tcp_flags;
6754
6755 if (OVS_UNLIKELY(!rules[i])) {
6756 continue;
6757 }
6758
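/* Feed the match back into the cheaper caches: the SMC, keyed by the
 * flow's ufid hash, and (probabilistically) the EMC. */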
6759 flow = dp_netdev_flow_cast(rules[i]);
6760 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
6761 smc_insert(pmd, keys[i], hash);
6762
6763 emc_probabilistic_insert(pmd, keys[i], flow);
6764 /* Add these packets into the flow map in the same order
6765 * as received.
6766 */
6767 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6768 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6769 flow_map, recv_idx);
6770 }
6771
6772 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6773 cnt - upcall_ok_cnt - upcall_fail_cnt);
6774 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6775 lookup_cnt);
6776 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6777 upcall_ok_cnt);
6778 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6779 upcall_fail_cnt);
6780 }
6781
6782 /* Packets enter the datapath from a port (or from recirculation) here.
6783 *
6784 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
6785 * When false the metadata in 'packets' need to be initialized. */
6786 static void
6787 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
6788 struct dp_packet_batch *packets,
6789 bool md_is_valid, odp_port_t port_no)
6790 {
6791 #if !defined(__CHECKER__) && !defined(_WIN32)
6792 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
6793 #else
6794 /* Sparse or MSVC doesn't like variable length array. */
6795 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6796 #endif
6797 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6798 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
6799 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
6800 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
6801 size_t n_batches;
6802 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6803 uint8_t index_map[PKT_ARRAY_SIZE];
6804 size_t n_flows, i;
6805
6806 odp_port_t in_port;
6807
6808 n_batches = 0;
6809 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
6810 flow_map, &n_flows, index_map, md_is_valid, port_no);
6811
6812 if (!dp_packet_batch_is_empty(packets)) {
6813 /* Get ingress port from first packet's metadata. */
6814 in_port = packets->packets[0]->md.in_port.odp_port;
6815 fast_path_processing(pmd, packets, missed_keys,
6816 flow_map, index_map, in_port);
6817 }
6818
6819 /* Batch rest of packets which are in flow map. */
6820 for (i = 0; i < n_flows; i++) {
6821 struct dp_packet_flow_map *map = &flow_map[i];
6822
6823 if (OVS_UNLIKELY(!map->flow)) {
6824 continue;
6825 }
6826 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6827 batches, &n_batches);
6828 }
6829
6830 /* All the flow batches need to be reset before any call to
6831 * packet_batch_per_flow_execute() as it could potentially trigger
6832 * recirculation. When a packet matching flow 'j' happens to be
6833 * recirculated, the nested call to dp_netdev_input__() could potentially
6834 * classify the packet as matching another flow - say 'k'. It could happen
6835 * that in the previous call to dp_netdev_input__() that same flow 'k'
6836 * already had its own batches[k] still waiting to be served. So if its
6837 * 'batch' member is not reset, the recirculated packet would be wrongly
6838 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
6839 for (i = 0; i < n_batches; i++) {
6840 batches[i].flow->batch = NULL;
6841 }
6842
6843 for (i = 0; i < n_batches; i++) {
6844 packet_batch_per_flow_execute(&batches[i], pmd);
6845 }
6846 }
6847
6848 static void
6849 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
6850 struct dp_packet_batch *packets,
6851 odp_port_t port_no)
6852 {
6853 dp_netdev_input__(pmd, packets, false, port_no);
6854 }
6855
6856 static void
6857 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
6858 struct dp_packet_batch *packets)
6859 {
6860 dp_netdev_input__(pmd, packets, true, 0);
6861 }
6862
6863 struct dp_netdev_execute_aux {
6864 struct dp_netdev_pmd_thread *pmd;
6865 const struct flow *flow;
6866 };
6867
6868 static void
6869 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6870 void *aux)
6871 {
6872 struct dp_netdev *dp = get_dp_netdev(dpif);
6873 dp->dp_purge_aux = aux;
6874 dp->dp_purge_cb = cb;
6875 }
6876
6877 static void
6878 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6879 void *aux)
6880 {
6881 struct dp_netdev *dp = get_dp_netdev(dpif);
6882 dp->upcall_aux = aux;
6883 dp->upcall_cb = cb;
6884 }
6885
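/* Releases dynamically assigned (XPS) tx queue ids that this pmd has not
 * used for at least XPS_TIMEOUT, or all of them if 'purge' is true. */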
6886 static void
6887 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
6888 bool purge)
6889 {
6890 struct tx_port *tx;
6891 struct dp_netdev_port *port;
6892 long long interval;
6893
6894 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
6895 if (!tx->port->dynamic_txqs) {
6896 continue;
6897 }
6898 interval = pmd->ctx.now - tx->last_used;
6899 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
6900 port = tx->port;
6901 ovs_mutex_lock(&port->txq_used_mutex);
6902 port->txq_used[tx->qid]--;
6903 ovs_mutex_unlock(&port->txq_used_mutex);
6904 tx->qid = -1;
6905 }
6906 }
6907 }
6908
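/* Returns a tx queue id for 'tx' on this pmd. A previously assigned qid is
 * reused while it is still fresh (used within XPS_TIMEOUT); otherwise the
 * least used txq of the port is picked under 'txq_used_mutex'. */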
6909 static int
6910 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
6911 struct tx_port *tx)
6912 {
6913 struct dp_netdev_port *port;
6914 long long interval;
6915 int i, min_cnt, min_qid;
6916
6917 interval = pmd->ctx.now - tx->last_used;
6918 tx->last_used = pmd->ctx.now;
6919
6920 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
6921 return tx->qid;
6922 }
6923
6924 port = tx->port;
6925
6926 ovs_mutex_lock(&port->txq_used_mutex);
6927 if (tx->qid >= 0) {
6928 port->txq_used[tx->qid]--;
6929 tx->qid = -1;
6930 }
6931
6932 min_cnt = -1;
6933 min_qid = 0;
6934 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
6935 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
6936 min_cnt = port->txq_used[i];
6937 min_qid = i;
6938 }
6939 }
6940
6941 port->txq_used[min_qid]++;
6942 tx->qid = min_qid;
6943
6944 ovs_mutex_unlock(&port->txq_used_mutex);
6945
6946 dpif_netdev_xps_revalidate_pmd(pmd, false);
6947
6948 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
6949 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
6950 return min_qid;
6951 }
6952
6953 static struct tx_port *
6954 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6955 odp_port_t port_no)
6956 {
6957 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
6958 }
6959
6960 static struct tx_port *
6961 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6962 odp_port_t port_no)
6963 {
6964 return tx_port_lookup(&pmd->send_port_cache, port_no);
6965 }
6966
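/* Looks up the tunnel port referenced by 'attr' and pushes the requested
 * tunnel header onto every packet in 'batch'. On failure the whole batch
 * is deleted and an error is returned. */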
6967 static int
6968 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
6969 const struct nlattr *attr,
6970 struct dp_packet_batch *batch)
6971 {
6972 struct tx_port *tun_port;
6973 const struct ovs_action_push_tnl *data;
6974 int err;
6975
6976 data = nl_attr_get(attr);
6977
6978 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
6979 if (!tun_port) {
6980 err = -EINVAL;
6981 goto error;
6982 }
6983 err = netdev_push_header(tun_port->port->netdev, batch, data);
6984 if (!err) {
6985 return 0;
6986 }
6987 error:
6988 dp_packet_delete_batch(batch, true);
6989 return err;
6990 }
6991
6992 static void
6993 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
6994 struct dp_packet *packet, bool should_steal,
6995 struct flow *flow, ovs_u128 *ufid,
6996 struct ofpbuf *actions,
6997 const struct nlattr *userdata)
6998 {
6999 struct dp_packet_batch b;
7000 int error;
7001
7002 ofpbuf_clear(actions);
7003
7004 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7005 DPIF_UC_ACTION, userdata, actions,
7006 NULL);
7007 if (!error || error == ENOSPC) {
7008 dp_packet_batch_init_packet(&b, packet);
7009 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7010 actions->data, actions->size);
7011 } else if (should_steal) {
7012 dp_packet_delete(packet);
7013 }
7014 }
7015
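/* Callback invoked by odp_execute_actions() for the actions that need
 * datapath involvement: output, tunnel push/pop, userspace, recirculation,
 * conntrack and meter. The remaining action types are handled by the
 * generic executor and must never reach this function (OVS_NOT_REACHED). */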
7016 static void
7017 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7018 const struct nlattr *a, bool should_steal)
7019 OVS_NO_THREAD_SAFETY_ANALYSIS
7020 {
7021 struct dp_netdev_execute_aux *aux = aux_;
7022 uint32_t *depth = recirc_depth_get();
7023 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7024 struct dp_netdev *dp = pmd->dp;
7025 int type = nl_attr_type(a);
7026 struct tx_port *p;
7027
7028 switch ((enum ovs_action_attr)type) {
7029 case OVS_ACTION_ATTR_OUTPUT:
7030 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
7031 if (OVS_LIKELY(p)) {
7032 struct dp_packet *packet;
7033 struct dp_packet_batch out;
7034
7035 if (!should_steal) {
7036 dp_packet_batch_clone(&out, packets_);
7037 dp_packet_batch_reset_cutlen(packets_);
7038 packets_ = &out;
7039 }
7040 dp_packet_batch_apply_cutlen(packets_);
7041
7042 #ifdef DPDK_NETDEV
7043 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7044 && packets_->packets[0]->source
7045 != p->output_pkts.packets[0]->source)) {
7046 /* XXX: netdev-dpdk assumes that all packets in a single
7047 * output batch have the same source. Flush here to
7048 * avoid memory access issues. */
7049 dp_netdev_pmd_flush_output_on_port(pmd, p);
7050 }
7051 #endif
7052 if (dp_packet_batch_size(&p->output_pkts)
7053 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7054 /* Flush here to avoid overflow. */
7055 dp_netdev_pmd_flush_output_on_port(pmd, p);
7056 }
7057
7058 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7059 pmd->n_output_batches++;
7060 }
7061
7062 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7063 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7064 pmd->ctx.last_rxq;
7065 dp_packet_batch_add(&p->output_pkts, packet);
7066 }
7067 return;
7068 }
7069 break;
7070
7071 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7072 if (should_steal) {
7073 /* We're requested to push a tunnel header and also to take
7074 * ownership of these packets. Thus, we can avoid performing
7075 * the action, because the caller will not use the result anyway.
7076 * Just break to free the batch. */
7077 break;
7078 }
7079 dp_packet_batch_apply_cutlen(packets_);
7080 push_tnl_action(pmd, a, packets_);
7081 return;
7082
7083 case OVS_ACTION_ATTR_TUNNEL_POP:
7084 if (*depth < MAX_RECIRC_DEPTH) {
7085 struct dp_packet_batch *orig_packets_ = packets_;
7086 odp_port_t portno = nl_attr_get_odp_port(a);
7087
7088 p = pmd_tnl_port_cache_lookup(pmd, portno);
7089 if (p) {
7090 struct dp_packet_batch tnl_pkt;
7091
7092 if (!should_steal) {
7093 dp_packet_batch_clone(&tnl_pkt, packets_);
7094 packets_ = &tnl_pkt;
7095 dp_packet_batch_reset_cutlen(orig_packets_);
7096 }
7097
7098 dp_packet_batch_apply_cutlen(packets_);
7099
7100 netdev_pop_header(p->port->netdev, packets_);
7101 if (dp_packet_batch_is_empty(packets_)) {
7102 return;
7103 }
7104
7105 struct dp_packet *packet;
7106 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7107 packet->md.in_port.odp_port = portno;
7108 }
7109
7110 (*depth)++;
7111 dp_netdev_recirculate(pmd, packets_);
7112 (*depth)--;
7113 return;
7114 }
7115 }
7116 break;
7117
7118 case OVS_ACTION_ATTR_USERSPACE:
7119 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7120 struct dp_packet_batch *orig_packets_ = packets_;
7121 const struct nlattr *userdata;
7122 struct dp_packet_batch usr_pkt;
7123 struct ofpbuf actions;
7124 struct flow flow;
7125 ovs_u128 ufid;
7126 bool clone = false;
7127
7128 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7129 ofpbuf_init(&actions, 0);
7130
7131 if (packets_->trunc) {
7132 if (!should_steal) {
7133 dp_packet_batch_clone(&usr_pkt, packets_);
7134 packets_ = &usr_pkt;
7135 clone = true;
7136 dp_packet_batch_reset_cutlen(orig_packets_);
7137 }
7138
7139 dp_packet_batch_apply_cutlen(packets_);
7140 }
7141
7142 struct dp_packet *packet;
7143 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7144 flow_extract(packet, &flow);
7145 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
7146 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7147 &ufid, &actions, userdata);
7148 }
7149
7150 if (clone) {
7151 dp_packet_delete_batch(packets_, true);
7152 }
7153
7154 ofpbuf_uninit(&actions);
7155 fat_rwlock_unlock(&dp->upcall_rwlock);
7156
7157 return;
7158 }
7159 break;
7160
7161 case OVS_ACTION_ATTR_RECIRC:
7162 if (*depth < MAX_RECIRC_DEPTH) {
7163 struct dp_packet_batch recirc_pkts;
7164
7165 if (!should_steal) {
7166 dp_packet_batch_clone(&recirc_pkts, packets_);
7167 packets_ = &recirc_pkts;
7168 }
7169
7170 struct dp_packet *packet;
7171 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7172 packet->md.recirc_id = nl_attr_get_u32(a);
7173 }
7174
7175 (*depth)++;
7176 dp_netdev_recirculate(pmd, packets_);
7177 (*depth)--;
7178
7179 return;
7180 }
7181
7182 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7183 break;
7184
7185 case OVS_ACTION_ATTR_CT: {
7186 const struct nlattr *b;
7187 bool force = false;
7188 bool commit = false;
7189 unsigned int left;
7190 uint16_t zone = 0;
7191 const char *helper = NULL;
7192 const uint32_t *setmark = NULL;
7193 const struct ovs_key_ct_labels *setlabel = NULL;
7194 struct nat_action_info_t nat_action_info;
7195 struct nat_action_info_t *nat_action_info_ref = NULL;
7196 bool nat_config = false;
7197
7198 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7199 nl_attr_get_size(a)) {
7200 enum ovs_ct_attr sub_type = nl_attr_type(b);
7201
7202 switch(sub_type) {
7203 case OVS_CT_ATTR_FORCE_COMMIT:
7204 force = true;
7205 /* fall through. */
7206 case OVS_CT_ATTR_COMMIT:
7207 commit = true;
7208 break;
7209 case OVS_CT_ATTR_ZONE:
7210 zone = nl_attr_get_u16(b);
7211 break;
7212 case OVS_CT_ATTR_HELPER:
7213 helper = nl_attr_get_string(b);
7214 break;
7215 case OVS_CT_ATTR_MARK:
7216 setmark = nl_attr_get(b);
7217 break;
7218 case OVS_CT_ATTR_LABELS:
7219 setlabel = nl_attr_get(b);
7220 break;
7221 case OVS_CT_ATTR_EVENTMASK:
7222 /* Silently ignored, as the userspace datapath does not generate
7223 * netlink events. */
7224 break;
7225 case OVS_CT_ATTR_NAT: {
7226 const struct nlattr *b_nest;
7227 unsigned int left_nest;
7228 bool ip_min_specified = false;
7229 bool proto_num_min_specified = false;
7230 bool ip_max_specified = false;
7231 bool proto_num_max_specified = false;
7232 memset(&nat_action_info, 0, sizeof nat_action_info);
7233 nat_action_info_ref = &nat_action_info;
7234
7235 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7236 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7237
7238 switch (sub_type_nest) {
7239 case OVS_NAT_ATTR_SRC:
7240 case OVS_NAT_ATTR_DST:
7241 nat_config = true;
7242 nat_action_info.nat_action |=
7243 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7244 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7245 break;
7246 case OVS_NAT_ATTR_IP_MIN:
7247 memcpy(&nat_action_info.min_addr,
7248 nl_attr_get(b_nest),
7249 nl_attr_get_size(b_nest));
7250 ip_min_specified = true;
7251 break;
7252 case OVS_NAT_ATTR_IP_MAX:
7253 memcpy(&nat_action_info.max_addr,
7254 nl_attr_get(b_nest),
7255 nl_attr_get_size(b_nest));
7256 ip_max_specified = true;
7257 break;
7258 case OVS_NAT_ATTR_PROTO_MIN:
7259 nat_action_info.min_port =
7260 nl_attr_get_u16(b_nest);
7261 proto_num_min_specified = true;
7262 break;
7263 case OVS_NAT_ATTR_PROTO_MAX:
7264 nat_action_info.max_port =
7265 nl_attr_get_u16(b_nest);
7266 proto_num_max_specified = true;
7267 break;
7268 case OVS_NAT_ATTR_PERSISTENT:
7269 case OVS_NAT_ATTR_PROTO_HASH:
7270 case OVS_NAT_ATTR_PROTO_RANDOM:
7271 break;
7272 case OVS_NAT_ATTR_UNSPEC:
7273 case __OVS_NAT_ATTR_MAX:
7274 OVS_NOT_REACHED();
7275 }
7276 }
7277
7278 if (ip_min_specified && !ip_max_specified) {
7279 nat_action_info.max_addr = nat_action_info.min_addr;
7280 }
7281 if (proto_num_min_specified && !proto_num_max_specified) {
7282 nat_action_info.max_port = nat_action_info.min_port;
7283 }
7284 if (proto_num_min_specified || proto_num_max_specified) {
7285 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7286 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7287 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7288 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7289 }
7290 }
7291 break;
7292 }
7293 case OVS_CT_ATTR_UNSPEC:
7294 case __OVS_CT_ATTR_MAX:
7295 OVS_NOT_REACHED();
7296 }
7297 }
7298
7299 /* We won't be able to function properly in this case, hence
7300 * complain loudly. */
7301 if (nat_config && !commit) {
7302 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7303 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7304 }
7305
7306 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7307 commit, zone, setmark, setlabel, aux->flow->tp_src,
7308 aux->flow->tp_dst, helper, nat_action_info_ref,
7309 pmd->ctx.now / 1000);
7310 break;
7311 }
7312
7313 case OVS_ACTION_ATTR_METER:
7314 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7315 pmd->ctx.now);
7316 break;
7317
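/* The action types below are either executed directly by
* odp_execute_actions() before this callback is ever invoked, or are
* invalid, so they must never reach this point. */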
7318 case OVS_ACTION_ATTR_PUSH_VLAN:
7319 case OVS_ACTION_ATTR_POP_VLAN:
7320 case OVS_ACTION_ATTR_PUSH_MPLS:
7321 case OVS_ACTION_ATTR_POP_MPLS:
7322 case OVS_ACTION_ATTR_SET:
7323 case OVS_ACTION_ATTR_SET_MASKED:
7324 case OVS_ACTION_ATTR_SAMPLE:
7325 case OVS_ACTION_ATTR_HASH:
7326 case OVS_ACTION_ATTR_UNSPEC:
7327 case OVS_ACTION_ATTR_TRUNC:
7328 case OVS_ACTION_ATTR_PUSH_ETH:
7329 case OVS_ACTION_ATTR_POP_ETH:
7330 case OVS_ACTION_ATTR_CLONE:
7331 case OVS_ACTION_ATTR_PUSH_NSH:
7332 case OVS_ACTION_ATTR_POP_NSH:
7333 case OVS_ACTION_ATTR_CT_CLEAR:
7334 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7335 case __OVS_ACTION_ATTR_MAX:
7336 OVS_NOT_REACHED();
7337 }
7338
7339 dp_packet_delete_batch(packets_, should_steal);
7340 }
7341
7342 static void
7343 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7344 struct dp_packet_batch *packets,
7345 bool should_steal, const struct flow *flow,
7346 const struct nlattr *actions, size_t actions_len)
7347 {
7348 struct dp_netdev_execute_aux aux = { pmd, flow };
7349
7350 odp_execute_actions(&aux, packets, should_steal, actions,
7351 actions_len, dp_execute_cb);
7352 }
7353
7354 struct dp_netdev_ct_dump {
7355 struct ct_dpif_dump_state up;
7356 struct conntrack_dump dump;
7357 struct conntrack *ct;
7358 struct dp_netdev *dp;
7359 };
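/* The callbacks below follow the usual dump life cycle: _dump_start()
* allocates the state and begins a conntrack dump, _dump_next() yields one
* entry per call, and _dump_done() releases the state. */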
7360
7361 static int
7362 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
7363 const uint16_t *pzone, int *ptot_bkts)
7364 {
7365 struct dp_netdev *dp = get_dp_netdev(dpif);
7366 struct dp_netdev_ct_dump *dump;
7367
7368 dump = xzalloc(sizeof *dump);
7369 dump->dp = dp;
7370 dump->ct = dp->conntrack;
7371
7372 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
7373
7374 *dump_ = &dump->up;
7375
7376 return 0;
7377 }
7378
7379 static int
7380 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7381 struct ct_dpif_dump_state *dump_,
7382 struct ct_dpif_entry *entry)
7383 {
7384 struct dp_netdev_ct_dump *dump;
7385
7386 INIT_CONTAINER(dump, dump_, up);
7387
7388 return conntrack_dump_next(&dump->dump, entry);
7389 }
7390
7391 static int
7392 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7393 struct ct_dpif_dump_state *dump_)
7394 {
7395 struct dp_netdev_ct_dump *dump;
7396 int err;
7397
7398 INIT_CONTAINER(dump, dump_, up);
7399
7400 err = conntrack_dump_done(&dump->dump);
7401
7402 free(dump);
7403
7404 return err;
7405 }
7406
7407 static int
7408 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7409 const struct ct_dpif_tuple *tuple)
7410 {
7411 struct dp_netdev *dp = get_dp_netdev(dpif);
7412
7413 if (tuple) {
7414 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
7415 }
7416 return conntrack_flush(dp->conntrack, zone);
7417 }
7418
7419 static int
7420 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7421 {
7422 struct dp_netdev *dp = get_dp_netdev(dpif);
7423
7424 return conntrack_set_maxconns(dp->conntrack, maxconns);
7425 }
7426
7427 static int
7428 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7429 {
7430 struct dp_netdev *dp = get_dp_netdev(dpif);
7431
7432 return conntrack_get_maxconns(dp->conntrack, maxconns);
7433 }
7434
7435 static int
7436 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7437 {
7438 struct dp_netdev *dp = get_dp_netdev(dpif);
7439
7440 return conntrack_get_nconns(dp->conntrack, nconns);
7441 }
7442
7443 static int
7444 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
7445 {
7446 struct dp_netdev *dp = get_dp_netdev(dpif);
7447 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
7448 }
7449
7450 static int
7451 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
7452 {
7453 struct dp_netdev *dp = get_dp_netdev(dpif);
7454 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
7455 }
7456
7457 static int
7458 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
7459 {
7460 struct dp_netdev *dp = get_dp_netdev(dpif);
7461 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
7462 }
7463
7464 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
7465 * diverge. */
7466 static int
7467 dpif_netdev_ipf_get_status(struct dpif *dpif,
7468 struct dpif_ipf_status *dpif_ipf_status)
7469 {
7470 struct dp_netdev *dp = get_dp_netdev(dpif);
7471 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
7472 (struct ipf_status *) dpif_ipf_status);
7473 return 0;
7474 }
7475
7476 static int
7477 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
7478 struct ipf_dump_ctx **ipf_dump_ctx)
7479 {
7480 return ipf_dump_start(ipf_dump_ctx);
7481 }
7482
7483 static int
7484 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
7485 {
7486 struct dp_netdev *dp = get_dp_netdev(dpif);
7487 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
7488 dump);
7489 }
7490
7491 static int
7492 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
7493 {
7494 return ipf_dump_done(ipf_dump_ctx);
7495
7496 }
7497
7498 const struct dpif_class dpif_netdev_class = {
7499 "netdev",
7500 true, /* cleanup_required */
7501 dpif_netdev_init,
7502 dpif_netdev_enumerate,
7503 dpif_netdev_port_open_type,
7504 dpif_netdev_open,
7505 dpif_netdev_close,
7506 dpif_netdev_destroy,
7507 dpif_netdev_run,
7508 dpif_netdev_wait,
7509 dpif_netdev_get_stats,
7510 dpif_netdev_port_add,
7511 dpif_netdev_port_del,
7512 dpif_netdev_port_set_config,
7513 dpif_netdev_port_query_by_number,
7514 dpif_netdev_port_query_by_name,
7515 NULL, /* port_get_pid */
7516 dpif_netdev_port_dump_start,
7517 dpif_netdev_port_dump_next,
7518 dpif_netdev_port_dump_done,
7519 dpif_netdev_port_poll,
7520 dpif_netdev_port_poll_wait,
7521 dpif_netdev_flow_flush,
7522 dpif_netdev_flow_dump_create,
7523 dpif_netdev_flow_dump_destroy,
7524 dpif_netdev_flow_dump_thread_create,
7525 dpif_netdev_flow_dump_thread_destroy,
7526 dpif_netdev_flow_dump_next,
7527 dpif_netdev_operate,
7528 NULL, /* recv_set */
7529 NULL, /* handlers_set */
7530 dpif_netdev_set_config,
7531 dpif_netdev_queue_to_priority,
7532 NULL, /* recv */
7533 NULL, /* recv_wait */
7534 NULL, /* recv_purge */
7535 dpif_netdev_register_dp_purge_cb,
7536 dpif_netdev_register_upcall_cb,
7537 dpif_netdev_enable_upcall,
7538 dpif_netdev_disable_upcall,
7539 dpif_netdev_get_datapath_version,
7540 dpif_netdev_ct_dump_start,
7541 dpif_netdev_ct_dump_next,
7542 dpif_netdev_ct_dump_done,
7543 dpif_netdev_ct_flush,
7544 dpif_netdev_ct_set_maxconns,
7545 dpif_netdev_ct_get_maxconns,
7546 dpif_netdev_ct_get_nconns,
7547 NULL, /* ct_set_limits */
7548 NULL, /* ct_get_limits */
7549 NULL, /* ct_del_limits */
7550 dpif_netdev_ipf_set_enabled,
7551 dpif_netdev_ipf_set_min_frag,
7552 dpif_netdev_ipf_set_max_nfrags,
7553 dpif_netdev_ipf_get_status,
7554 dpif_netdev_ipf_dump_start,
7555 dpif_netdev_ipf_dump_next,
7556 dpif_netdev_ipf_dump_done,
7557 dpif_netdev_meter_get_features,
7558 dpif_netdev_meter_set,
7559 dpif_netdev_meter_get,
7560 dpif_netdev_meter_del,
7561 };
7562
7563 static void
7564 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
7565 const char *argv[], void *aux OVS_UNUSED)
7566 {
7567 struct dp_netdev_port *port;
7568 struct dp_netdev *dp;
7569 odp_port_t port_no;
7570
7571 ovs_mutex_lock(&dp_netdev_mutex);
7572 dp = shash_find_data(&dp_netdevs, argv[1]);
7573 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
7574 ovs_mutex_unlock(&dp_netdev_mutex);
7575 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
7576 return;
7577 }
7578 ovs_refcount_ref(&dp->ref_cnt);
7579 ovs_mutex_unlock(&dp_netdev_mutex);
7580
7581 ovs_mutex_lock(&dp->port_mutex);
7582 if (get_port_by_name(dp, argv[2], &port)) {
7583 unixctl_command_reply_error(conn, "unknown port");
7584 goto exit;
7585 }
7586
7587 port_no = u32_to_odp(atoi(argv[3]));
7588 if (!port_no || port_no == ODPP_NONE) {
7589 unixctl_command_reply_error(conn, "bad port number");
7590 goto exit;
7591 }
7592 if (dp_netdev_lookup_port(dp, port_no)) {
7593 unixctl_command_reply_error(conn, "port number already in use");
7594 goto exit;
7595 }
7596
7597 /* Remove port. */
7598 hmap_remove(&dp->ports, &port->node);
7599 reconfigure_datapath(dp);
7600
7601 /* Reinsert with new port number. */
7602 port->port_no = port_no;
7603 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
7604 reconfigure_datapath(dp);
7605
7606 seq_change(dp->port_seq);
7607 unixctl_command_reply(conn, NULL);
7608
7609 exit:
7610 ovs_mutex_unlock(&dp->port_mutex);
7611 dp_netdev_unref(dp);
7612 }
7613
7614 static void
7615 dpif_dummy_register__(const char *type)
7616 {
7617 struct dpif_class *class;
7618
7619 class = xmalloc(sizeof *class);
7620 *class = dpif_netdev_class;
7621 class->type = xstrdup(type);
7622 dp_register_provider(class);
7623 }
7624
7625 static void
7626 dpif_dummy_override(const char *type)
7627 {
7628 int error;
7629
7630 /*
7631 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
7632 * a userland-only build. This is useful for the testsuite.
7633 */
7634 error = dp_unregister_provider(type);
7635 if (error == 0 || error == EAFNOSUPPORT) {
7636 dpif_dummy_register__(type);
7637 }
7638 }
7639
7640 void
7641 dpif_dummy_register(enum dummy_level level)
7642 {
7643 if (level == DUMMY_OVERRIDE_ALL) {
7644 struct sset types;
7645 const char *type;
7646
7647 sset_init(&types);
7648 dp_enumerate_types(&types);
7649 SSET_FOR_EACH (type, &types) {
7650 dpif_dummy_override(type);
7651 }
7652 sset_destroy(&types);
7653 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7654 dpif_dummy_override("system");
7655 }
7656
7657 dpif_dummy_register__("dummy");
7658
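/* The command registered below is invoked at runtime through ovs-appctl,
* e.g. "ovs-appctl dpif-dummy/change-port-number <dp> <port> <new-number>"
* (placeholders standing in for the "dp port new-number" usage string). */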
7659 unixctl_command_register("dpif-dummy/change-port-number",
7660 "dp port new-number",
7661 3, 3, dpif_dummy_change_port_number, NULL);
7662 }
7663 \f
7664 /* Datapath Classifier. */
7665
7666 static void
7667 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
7668 {
7669 cmap_destroy(&subtable->rules);
7670 ovsrcu_postpone(free, subtable);
7671 }
7672
7673 /* Initializes 'cls' as a classifier that initially contains no classification
7674 * rules. */
7675 static void
7676 dpcls_init(struct dpcls *cls)
7677 {
7678 cmap_init(&cls->subtables_map);
7679 pvector_init(&cls->subtables);
7680 }
7681
7682 static void
7683 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7684 {
7685 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
7686 pvector_remove(&cls->subtables, subtable);
7687 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7688 subtable->mask.hash);
7689 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
7690 }
7691
7692 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
7693 * caller's responsibility.
7694 * May only be called after all the readers have been terminated. */
7695 static void
7696 dpcls_destroy(struct dpcls *cls)
7697 {
7698 if (cls) {
7699 struct dpcls_subtable *subtable;
7700
7701 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
7702 ovs_assert(cmap_count(&subtable->rules) == 0);
7703 dpcls_destroy_subtable(cls, subtable);
7704 }
7705 cmap_destroy(&cls->subtables_map);
7706 pvector_destroy(&cls->subtables);
7707 }
7708 }
7709
7710 static struct dpcls_subtable *
7711 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7712 {
7713 struct dpcls_subtable *subtable;
7714
7715 /* Need to add one. */
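/* Size the allocation from the actual miniflow length of 'mask' rather than
* from sizeof *subtable alone, since the miniflow data stored for the mask
* varies with the number of fields it covers. */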
7716 subtable = xmalloc(sizeof *subtable
7717 - sizeof subtable->mask.mf + mask->len);
7718 cmap_init(&subtable->rules);
7719 subtable->hit_cnt = 0;
7720 netdev_flow_key_clone(&subtable->mask, mask);
7721
7722 /* Decide which hash/lookup/verify function to use. */
7723 subtable->lookup_func = dpcls_subtable_lookup_generic;
7724
7725 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
7726 /* Add the new subtable at the end of the pvector (with no hits yet). */
7727 pvector_insert(&cls->subtables, subtable, 0);
7728 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
7729 cmap_count(&cls->subtables_map), subtable, cls->in_port);
7730 pvector_publish(&cls->subtables);
7731
7732 return subtable;
7733 }
7734
7735 static inline struct dpcls_subtable *
7736 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7737 {
7738 struct dpcls_subtable *subtable;
7739
7740 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
7741 &cls->subtables_map) {
7742 if (netdev_flow_key_equal(&subtable->mask, mask)) {
7743 return subtable;
7744 }
7745 }
7746 return dpcls_create_subtable(cls, mask);
7747 }
7748
7749
7750 /* Periodically sort the dpcls subtable vectors according to hit counts. */
7751 static void
7752 dpcls_sort_subtable_vector(struct dpcls *cls)
7753 {
7754 struct pvector *pvec = &cls->subtables;
7755 struct dpcls_subtable *subtable;
7756
7757 PVECTOR_FOR_EACH (subtable, pvec) {
7758 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
7759 subtable->hit_cnt = 0;
7760 }
7761 pvector_publish(pvec);
7762 }
7763
7764 static inline void
7765 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
7766 struct polled_queue *poll_list, int poll_cnt)
7767 {
7768 struct dpcls *cls;
7769 uint64_t tot_idle = 0, tot_proc = 0;
7770 unsigned int pmd_load = 0;
7771
7772 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
7773 uint64_t curr_tsc;
7774 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
7775 if (pmd_alb->is_enabled && !pmd->isolated
7776 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
7777 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
7778 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
7779 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
7780 {
7781 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
7782 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
7783 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
7784 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
7785
7786 if (tot_proc) {
7787 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
7788 }
7789
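/* Worked example: tot_proc = 800 busy and tot_idle = 200 idle cycles give
* pmd_load = (800 * 100) / 1000 = 80. */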
7790 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
7791 atomic_count_inc(&pmd->pmd_overloaded);
7792 } else {
7793 atomic_count_set(&pmd->pmd_overloaded, 0);
7794 }
7795 }
7796
7797 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
7798 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
7799 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
7800 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
7801
7802 /* Get the cycles that were used to process each queue and store. */
7803 for (unsigned i = 0; i < poll_cnt; i++) {
7804 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
7805 RXQ_CYCLES_PROC_CURR);
7806 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
7807 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
7808 0);
7809 }
7810 curr_tsc = cycles_counter_update(&pmd->perf_stats);
7811 if (pmd->intrvl_tsc_prev) {
7812 /* There is a prev timestamp, store a new intrvl cycle count. */
7813 atomic_store_relaxed(&pmd->intrvl_cycles,
7814 curr_tsc - pmd->intrvl_tsc_prev);
7815 }
7816 pmd->intrvl_tsc_prev = curr_tsc;
7817 /* Start a new measuring interval. */
7818 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
7819 }
7820
7821 if (pmd->ctx.now > pmd->next_optimization) {
7822 /* Try to obtain the flow lock to block out revalidator threads.
7823 * If not possible, just try next time. */
7824 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
7825 /* Optimize each classifier. */
7826 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7827 dpcls_sort_subtable_vector(cls);
7828 }
7829 ovs_mutex_unlock(&pmd->flow_mutex);
7830 /* Start a new measuring interval. */
7831 pmd->next_optimization = pmd->ctx.now
7832 + DPCLS_OPTIMIZATION_INTERVAL;
7833 }
7834 }
7835 }
7836
7837 /* Insert 'rule' into 'cls'. */
7838 static void
7839 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
7840 const struct netdev_flow_key *mask)
7841 {
7842 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
7843
7844 /* Refer to subtable's mask, also for later removal. */
7845 rule->mask = &subtable->mask;
7846 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
7847 }
7848
7849 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
7850 static void
7851 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
7852 {
7853 struct dpcls_subtable *subtable;
7854
7855 ovs_assert(rule->mask);
7856
7857 /* Get subtable from reference in rule->mask. */
7858 INIT_CONTAINER(subtable, rule->mask, mask);
7859 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
7860 == 0) {
7861 /* Delete empty subtable. */
7862 dpcls_destroy_subtable(cls, subtable);
7863 pvector_publish(&cls->subtables);
7864 }
7865 }
7866
7867 /* Returns true if 'target' satisfies 'rule': for each 1-bit in the rule's
7868 * mask, the corresponding bits in the rule's key and in 'target' must match. */
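/* Illustrative 64-bit slice: a rule value of 0x00ff under a mask of 0x00ff
* matches a target value of 0xaaff, since (0xaaff & 0x00ff) == 0x00ff, but
* not 0xaa0f, since (0xaa0f & 0x00ff) == 0x000f. */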
7869 bool
7870 dpcls_rule_matches_key(const struct dpcls_rule *rule,
7871 const struct netdev_flow_key *target)
7872 {
7873 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
7874 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
7875 uint64_t value;
7876
7877 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
7878 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
7879 return false;
7880 }
7881 }
7882 return true;
7883 }
7884
7885 uint32_t
7886 dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
7887 uint32_t keys_map,
7888 const struct netdev_flow_key *keys[],
7889 struct dpcls_rule **rules)
7890 {
7891 int i;
7892 uint32_t found_map;
7893
7894 /* Compute hashes for the remaining keys. Each search-key is
7895 * masked with the subtable's mask to avoid hashing the wildcarded
7896 * bits. */
7897 uint32_t hashes[NETDEV_MAX_BURST];
7898 ULLONG_FOR_EACH_1 (i, keys_map) {
7899 hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
7900 &subtable->mask);
7901 }
7902
7903 /* Lookup. */
7904 const struct cmap_node *nodes[NETDEV_MAX_BURST];
7905 found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
7906
7907 /* Check results. When the i-th bit of found_map is set, it means
7908 * that a set of nodes with a matching hash value was found for the
7909 * i-th search-key. Due to possible hash collisions we need to check
7910 * which of the found rules, if any, really matches our masked
7911 * search-key. */
7912 ULLONG_FOR_EACH_1 (i, found_map) {
7913 struct dpcls_rule *rule;
7914
7915 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
7916 if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
7917 rules[i] = rule;
7918 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
7919 * within a one-second optimization interval. */
7920 subtable->hit_cnt++;
7921 goto next;
7922 }
7923 }
7924 /* None of the found rules was a match. Reset the i-th bit to
7925 * keep searching this key in the next subtable. */
7926 ULLONG_SET0(found_map, i); /* Did not match. */
7927 next:
7928 ; /* Keep Sparse happy. */
7929 }
7930
7931 return found_map;
7932 }
7933
7934 /* For each miniflow in 'keys' performs a classifier lookup writing the result
7935 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
7936 * NULL it is skipped.
7937 *
7938 * This function is optimized for use in the userspace datapath and therefore
7939 * does not implement a lot of features available in the standard
7940 * classifier_lookup() function. Specifically, it does not implement
7941 * priorities, instead returning any rule which matches the flow.
7942 *
7943 * Returns true if all miniflows found a corresponding rule. */
7944 static bool
7945 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
7946 struct dpcls_rule **rules, const size_t cnt,
7947 int *num_lookups_p)
7948 {
7949 /* The received 'cnt' miniflows are the search-keys that will be processed
7950 * to find a matching entry in the available subtables.
7951 * The per-key bitmaps used below must hold at least NETDEV_MAX_BURST bits. */
7952 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
7953 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
7954
7955 struct dpcls_subtable *subtable;
7956
7957 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
7958
7959 if (cnt != MAP_BITS) {
7960 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
7961 }
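/* For instance, cnt == 3 leaves keys_map == 0b111, i.e. only the bits for
* keys[0..2] remain set. */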
7962 memset(rules, 0, cnt * sizeof *rules);
7963
7964 int lookups_match = 0, subtable_pos = 1;
7965 uint32_t found_map;
7966
7967 /* The Datapath classifier - aka dpcls - is composed of subtables.
7968 * Subtables are dynamically created as needed when new rules are inserted.
7969 * Each subtable collects rules with matches on a specific subset of packet
7970 * fields as defined by the subtable's mask. We proceed to process every
7971 * search-key against each subtable, but when a match is found for a
7972 * search-key, the search for that key can stop because the rules are
7973 * non-overlapping. */
7974 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
7975 /* Call the subtable specific lookup function. */
7976 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
7977
7978 /* Count the number of subtables searched for this packet match. This
7979 * estimates the "spread" of subtables looked at per matched packet. */
7980 uint32_t pkts_matched = count_1bits(found_map);
7981 lookups_match += pkts_matched * subtable_pos;
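/* e.g., 4 packets matched in the first subtable and 2 in the second
* contribute 4 * 1 + 2 * 2 = 8 to lookups_match. */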
7982
7983 /* Clear the keys that were found, and return early if all packets have a match. */
7984 keys_map &= ~found_map;
7985 if (!keys_map) {
7986 if (num_lookups_p) {
7987 *num_lookups_p = lookups_match;
7988 }
7989 return true;
7990 }
7991 subtable_pos++;
7992 }
7993
7994 if (num_lookups_p) {
7995 *num_lookups_p = lookups_match;
7996 }
7997 return false;
7998 }