git.proxmox.com Git - mirror_ovs.git/blob - lib/dpif-netdev.c
dpif-netdev: Implement function pointers/subtable
1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <sys/types.h>
26 #include <netinet/in.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/ioctl.h>
31 #include <sys/socket.h>
32 #include <sys/stat.h>
33 #include <unistd.h>
34
35 #include "bitmap.h"
36 #include "cmap.h"
37 #include "conntrack.h"
38 #include "coverage.h"
39 #include "ct-dpif.h"
40 #include "csum.h"
41 #include "dp-packet.h"
42 #include "dpif.h"
43 #include "dpif-netdev-perf.h"
44 #include "dpif-provider.h"
45 #include "dummy.h"
46 #include "fat-rwlock.h"
47 #include "flow.h"
48 #include "hmapx.h"
49 #include "id-pool.h"
50 #include "ipf.h"
51 #include "netdev.h"
52 #include "netdev-offload.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
55 #include "netlink.h"
56 #include "odp-execute.h"
57 #include "odp-util.h"
58 #include "openvswitch/dynamic-string.h"
59 #include "openvswitch/list.h"
60 #include "openvswitch/match.h"
61 #include "openvswitch/ofp-parse.h"
62 #include "openvswitch/ofp-print.h"
63 #include "openvswitch/ofpbuf.h"
64 #include "openvswitch/shash.h"
65 #include "openvswitch/vlog.h"
66 #include "ovs-numa.h"
67 #include "ovs-rcu.h"
68 #include "packets.h"
69 #include "openvswitch/poll-loop.h"
70 #include "pvector.h"
71 #include "random.h"
72 #include "seq.h"
73 #include "smap.h"
74 #include "sset.h"
75 #include "timeval.h"
76 #include "tnl-neigh-cache.h"
77 #include "tnl-ports.h"
78 #include "unixctl.h"
79 #include "util.h"
80 #include "uuid.h"
81
82 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
83
84 /* Auto Load Balancing Defaults */
85 #define ALB_ACCEPTABLE_IMPROVEMENT 25
86 #define ALB_PMD_LOAD_THRESHOLD 95
87 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
88 #define MIN_TO_MSEC 60000
89
90 #define FLOW_DUMP_MAX_BATCH 50
91 /* Use per-thread recirc_depth to prevent recirculation loops. */
92 #define MAX_RECIRC_DEPTH 6
93 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
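/* Illustrative sketch of how the per-thread depth bounds recirculation; the
 * surrounding context is simplified and not taken verbatim from this file:
 *
 *     uint32_t *depth = recirc_depth_get();
 *
 *     if (*depth < MAX_RECIRC_DEPTH) {
 *         (*depth)++;
 *         dp_netdev_recirculate(pmd, &recirc_pkts);
 *         (*depth)--;
 *     } else {
 *         dp_packet_delete_batch(&recirc_pkts, should_steal);
 *     }
 *
 * Once the depth limit is reached the packets are dropped rather than
 * recirculated again, which is what breaks a potential loop.
 */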
94
95 /* Use instant packet send by default. */
96 #define DEFAULT_TX_FLUSH_INTERVAL 0
97
98 /* Configuration parameters. */
99 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
100 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
101 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
102 enum { N_METER_LOCKS = 64 };    /* Number of locks protecting the meters. */
103
104 /* Protects against changes to 'dp_netdevs'. */
105 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
106
107 /* Contains all 'struct dp_netdev's. */
108 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
109 = SHASH_INITIALIZER(&dp_netdevs);
110
111 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
112
113 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
114 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
115 | CS_SRC_NAT | CS_DST_NAT)
116 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
117
118 static struct odp_support dp_netdev_support = {
119 .max_vlan_headers = SIZE_MAX,
120 .max_mpls_depth = SIZE_MAX,
121 .recirc = true,
122 .ct_state = true,
123 .ct_zone = true,
124 .ct_mark = true,
125 .ct_label = true,
126 .ct_state_nat = true,
127 .ct_orig_tuple = true,
128 .ct_orig_tuple6 = true,
129 };
130
131 /* Stores a miniflow with inline values */
132
133 struct netdev_flow_key {
134 uint32_t hash; /* Hash function differs for different users. */
135 uint32_t len; /* Length of the following miniflow (incl. map). */
136 struct miniflow mf;
137 uint64_t buf[FLOW_MAX_PACKET_U64S];
138 };
139
140 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
141 *
142 * Exact match cache for frequently used flows
143 *
144 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
145 * search its entries for a miniflow that matches exactly the miniflow of the
146 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
147 *
148 * A cache entry holds a reference to its 'dp_netdev_flow'.
149 *
150 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
151 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
152 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
153 * value is the index of a cache entry where the miniflow could be.
154 *
155 *
156 * Signature match cache (SMC)
157 *
158  * This cache stores a 16-bit signature for each flow without storing keys, and
159  * stores the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
160  * Each flow thus occupies 32 bits, which is much more memory efficient than EMC.
161  * SMC uses a set-associative design in which each bucket contains
162  * SMC_ENTRY_PER_BUCKET entries.
163  * Since a 16-bit flow_table index is used, flows that cannot be indexed by a
164  * 16-bit value (beyond the first 2^16 dp_netdev_flow entries) will miss in SMC.
165 *
166 *
167 * Thread-safety
168 * =============
169 *
170 * Each pmd_thread has its own private exact match cache.
171 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
172 */
173
174 #define EM_FLOW_HASH_SHIFT 13
175 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
176 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
177 #define EM_FLOW_HASH_SEGS 2
178
179 /* SMC uses a set-associative design. A bucket contains a set of entries that
180 * a flow item can occupy. For now, it uses one hash function rather than two
181 * as for the EMC design. */
182 #define SMC_ENTRY_PER_BUCKET 4
183 #define SMC_ENTRIES (1u << 20)
184 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
185 #define SMC_MASK (SMC_BUCKET_CNT - 1)
186
187 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
188 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
189 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
190 DEFAULT_EM_FLOW_INSERT_INV_PROB)
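/* A minimal sketch of how 'emc_insert_min' gates EMC insertions, assuming an
 * emc_insert() helper like the one defined further down in the full file:
 *
 *     uint32_t min = pmd->ctx.emc_insert_min;
 *
 *     if (min && random_uint32() <= min) {
 *         emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
 *     }
 *
 * With the default of UINT32_MAX / 100, roughly one in a hundred eligible
 * packets triggers an insertion, which limits EMC thrashing when there are
 * many short-lived flows.
 */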
191
192 struct emc_entry {
193 struct dp_netdev_flow *flow;
194 struct netdev_flow_key key; /* key.hash used for emc hash value. */
195 };
196
197 struct emc_cache {
198 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
199 int sweep_idx; /* For emc_cache_slow_sweep(). */
200 };
201
202 struct smc_bucket {
203 uint16_t sig[SMC_ENTRY_PER_BUCKET];
204 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
205 };
206
207 /* Signature match cache, as distinct from the EMC cache. */
208 struct smc_cache {
209 struct smc_bucket buckets[SMC_BUCKET_CNT];
210 };
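/* Illustrative sketch of the SMC addressing scheme described above.  The
 * exact derivation of the 16-bit signature from the hash is an assumption
 * here, not taken from this file:
 *
 *     uint32_t bucket_idx = key->hash & SMC_MASK;
 *     uint16_t sig = key->hash >> 16;
 *     struct smc_bucket *b = &smc_cache->buckets[bucket_idx];
 *
 * A lookup then compares 'sig' against b->sig[0..SMC_ENTRY_PER_BUCKET-1];
 * a match yields b->flow_idx[i], a 16-bit index into the pmd's flow_table
 * (UINT16_MAX marks an empty slot).
 */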
211
212 struct dfc_cache {
213 struct emc_cache emc_cache;
214 struct smc_cache smc_cache;
215 };
216
217 /* Iterate through every entry in the exact match cache that might contain a
218  * miniflow with hash 'HASH'. */
219 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
220 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
221 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
222 i__ < EM_FLOW_HASH_SEGS; \
223 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
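/* A minimal usage sketch for the macro above, probing both candidate EMC
 * positions for a key.  keys_match() is a hypothetical stand-in for the full
 * miniflow comparison the real code performs:
 *
 *     struct emc_entry *entry;
 *
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, entry, key->hash) {
 *         if (entry->key.hash == key->hash
 *             && emc_entry_alive(entry)
 *             && keys_match(&entry->key, key)) {
 *             return entry->flow;
 *         }
 *     }
 *     return NULL;
 */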
224 \f
225 /* Simple non-wildcarding single-priority classifier. */
226
227 /* Time in microseconds between successive optimizations of the dpcls
228 * subtable vector */
229 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
230
231 /* Time in microseconds of the interval over which the rxq processing cycles
232  * used for rxq to pmd assignment are measured and stored. */
233 #define PMD_RXQ_INTERVAL_LEN 10000000LL
234
235 /* Number of intervals for which cycles are stored
236 * and used during rxq to pmd assignment. */
237 #define PMD_RXQ_INTERVAL_MAX 6
238
239 struct dpcls {
240 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
241 odp_port_t in_port;
242 struct cmap subtables_map;
243 struct pvector subtables;
244 };
245
246 /* A rule to be inserted to the classifier. */
247 struct dpcls_rule {
248 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
249 struct netdev_flow_key *mask; /* Subtable's mask. */
250 struct netdev_flow_key flow; /* Matching key. */
251 /* 'flow' must be the last field, additional space is allocated here. */
252 };
253
254 /* Data structure to keep packet order till fastpath processing. */
255 struct dp_packet_flow_map {
256 struct dp_packet *packet;
257 struct dp_netdev_flow *flow;
258 uint16_t tcp_flags;
259 };
260
261 static void dpcls_init(struct dpcls *);
262 static void dpcls_destroy(struct dpcls *);
263 static void dpcls_sort_subtable_vector(struct dpcls *);
264 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
265 const struct netdev_flow_key *mask);
266 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
267 static bool dpcls_lookup(struct dpcls *cls,
268 const struct netdev_flow_key *keys[],
269 struct dpcls_rule **rules, size_t cnt,
270 int *num_lookups_p);
271 static bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
272 const struct netdev_flow_key *target);
273 /* Set of supported meter flags */
274 #define DP_SUPPORTED_METER_FLAGS_MASK \
275 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
276
277 /* Set of supported meter band types */
278 #define DP_SUPPORTED_METER_BAND_TYPES \
279 ( 1 << OFPMBT13_DROP )
280
281 struct dp_meter_band {
282 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
283 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
284 uint64_t packet_count;
285 uint64_t byte_count;
286 };
287
288 struct dp_meter {
289 uint16_t flags;
290 uint16_t n_bands;
291 uint32_t max_delta_t;
292 uint64_t used;
293 uint64_t packet_count;
294 uint64_t byte_count;
295 struct dp_meter_band bands[];
296 };
297
298 struct pmd_auto_lb {
299 bool auto_lb_requested; /* Auto load balancing requested by user. */
300 bool is_enabled; /* Current status of Auto load balancing. */
301 uint64_t rebalance_intvl;
302 uint64_t rebalance_poll_timer;
303 };
304
305 /* Datapath based on the network device interface from netdev.h.
306 *
307 *
308 * Thread-safety
309 * =============
310 *
311 * Some members, marked 'const', are immutable. Accessing other members
312 * requires synchronization, as noted in more detail below.
313 *
314 * Acquisition order is, from outermost to innermost:
315 *
316 * dp_netdev_mutex (global)
317 * port_mutex
318 * non_pmd_mutex
319 */
320 struct dp_netdev {
321 const struct dpif_class *const class;
322 const char *const name;
323 struct dpif *dpif;
324 struct ovs_refcount ref_cnt;
325 atomic_flag destroyed;
326
327 /* Ports.
328 *
329 * Any lookup into 'ports' or any access to the dp_netdev_ports found
330 * through 'ports' requires taking 'port_mutex'. */
331 struct ovs_mutex port_mutex;
332 struct hmap ports;
333 struct seq *port_seq; /* Incremented whenever a port changes. */
334
335 /* The time that a packet can wait in output batch for sending. */
336 atomic_uint32_t tx_flush_interval;
337
338 /* Meters. */
339 struct ovs_mutex meter_locks[N_METER_LOCKS];
340 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
341
342     /* Probability of EMC insertions is proportional to 'emc_insert_min'. */
343 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
344 /* Enable collection of PMD performance metrics. */
345 atomic_bool pmd_perf_metrics;
346 /* Enable the SMC cache from ovsdb config */
347 atomic_bool smc_enable_db;
348
349 /* Protects access to ofproto-dpif-upcall interface during revalidator
350 * thread synchronization. */
351 struct fat_rwlock upcall_rwlock;
352 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
353 void *upcall_aux;
354
355     /* Callback function for notifying the purging of dp flows (during
356      * pmd thread reset or deletion). */
357 dp_purge_callback *dp_purge_cb;
358 void *dp_purge_aux;
359
360 /* Stores all 'struct dp_netdev_pmd_thread's. */
361 struct cmap poll_threads;
362 /* id pool for per thread static_tx_qid. */
363 struct id_pool *tx_qid_pool;
364 struct ovs_mutex tx_qid_pool_mutex;
365 /* Use measured cycles for rxq to pmd assignment. */
366 bool pmd_rxq_assign_cyc;
367
368 /* Protects the access of the 'struct dp_netdev_pmd_thread'
369 * instance for non-pmd thread. */
370 struct ovs_mutex non_pmd_mutex;
371
372 /* Each pmd thread will store its pointer to
373 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
374 ovsthread_key_t per_pmd_key;
375
376 struct seq *reconfigure_seq;
377 uint64_t last_reconfigure_seq;
378
379 /* Cpu mask for pin of pmd threads. */
380 char *pmd_cmask;
381
382 uint64_t last_tnl_conf_seq;
383
384 struct conntrack *conntrack;
385 struct pmd_auto_lb pmd_alb;
386 };
387
388 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
389 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
390 {
391 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
392 }
393
394 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
395 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
396 {
397 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
398 }
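/* A minimal sketch of the intended locking pattern around meter access; the
 * body of the critical section is illustrative:
 *
 *     meter_lock(dp, meter_id);
 *     struct dp_meter *meter = dp->meters[meter_id];
 *     if (meter) {
 *         ... read or update meter->bands[], meter->used, counters ...
 *     }
 *     meter_unlock(dp, meter_id);
 *
 * Note that different meter ids can map to the same lock
 * (meter_id % N_METER_LOCKS), so critical sections should stay short.
 */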
399
400
401 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
402 odp_port_t)
403 OVS_REQUIRES(dp->port_mutex);
404
405 enum rxq_cycles_counter_type {
406 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
407 processing packets during the current
408 interval. */
409 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
410 during rxq to pmd assignment. */
411 RXQ_N_CYCLES
412 };
413
414 enum {
415 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
416 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
417 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
418 };
419
420 struct dp_flow_offload_item {
421 struct dp_netdev_pmd_thread *pmd;
422 struct dp_netdev_flow *flow;
423 int op;
424 struct match match;
425 struct nlattr *actions;
426 size_t actions_len;
427
428 struct ovs_list node;
429 };
430
431 struct dp_flow_offload {
432 struct ovs_mutex mutex;
433 struct ovs_list list;
434 pthread_cond_t cond;
435 };
436
437 static struct dp_flow_offload dp_flow_offload = {
438 .mutex = OVS_MUTEX_INITIALIZER,
439 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
440 };
441
442 static struct ovsthread_once offload_thread_once
443 = OVSTHREAD_ONCE_INITIALIZER;
444
445 #define XPS_TIMEOUT 500000LL /* In microseconds. */
446
447 /* Contained by struct dp_netdev_port's 'rxqs' member. */
448 struct dp_netdev_rxq {
449 struct dp_netdev_port *port;
450 struct netdev_rxq *rx;
451 unsigned core_id; /* Core to which this queue should be
452 pinned. OVS_CORE_UNSPEC if the
453 queue doesn't need to be pinned to a
454 particular core. */
455 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
456 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
457 bool is_vhost; /* Is rxq of a vhost port. */
458
459 /* Counters of cycles spent successfully polling and processing pkts. */
460 atomic_ullong cycles[RXQ_N_CYCLES];
461 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
462 sum them to yield the cycles used for an rxq. */
463 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
464 };
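/* Illustrative sketch of how the stored intervals are folded into a single
 * cycle count for an rxq, as consumed by the rxq-to-pmd assignment logic
 * (the accessor matches a declaration further below; context simplified):
 *
 *     uint64_t total = 0;
 *
 *     for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
 *         total += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
 *     }
 */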
465
466 /* A port in a netdev-based datapath. */
467 struct dp_netdev_port {
468 odp_port_t port_no;
469 bool dynamic_txqs; /* If true XPS will be used. */
470 bool need_reconfigure; /* True if we should reconfigure netdev. */
471 struct netdev *netdev;
472 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
473 struct netdev_saved_flags *sf;
474 struct dp_netdev_rxq *rxqs;
475 unsigned n_rxq; /* Number of elements in 'rxqs' */
476 unsigned *txq_used; /* Number of threads that use each tx queue. */
477 struct ovs_mutex txq_used_mutex;
478 bool emc_enabled; /* If true EMC will be used. */
479 char *type; /* Port type as requested by user. */
480 char *rxq_affinity_list; /* Requested affinity of rx queues. */
481 };
482
483 /* Contained by struct dp_netdev_flow's 'stats' member. */
484 struct dp_netdev_flow_stats {
485 atomic_llong used; /* Last used time, in monotonic msecs. */
486 atomic_ullong packet_count; /* Number of packets matched. */
487 atomic_ullong byte_count; /* Number of bytes matched. */
488 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
489 };
490
491 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
492 *
493 *
494 * Thread-safety
495 * =============
496 *
497 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
498 * its pmd thread's classifier. The text below calls this classifier 'cls'.
499 *
500 * Motivation
501 * ----------
502 *
503 * The thread safety rules described here for "struct dp_netdev_flow" are
504 * motivated by two goals:
505 *
506 * - Prevent threads that read members of "struct dp_netdev_flow" from
507 * reading bad data due to changes by some thread concurrently modifying
508 * those members.
509 *
510 * - Prevent two threads making changes to members of a given "struct
511 * dp_netdev_flow" from interfering with each other.
512 *
513 *
514 * Rules
515 * -----
516 *
517 * A flow 'flow' may be accessed without a risk of being freed during an RCU
518 * grace period. Code that needs to hold onto a flow for a while
519 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
520 *
521 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
522 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
523 * from modification.
524 *
525 * Some members, marked 'const', are immutable. Accessing other members
526 * requires synchronization, as noted in more detail below.
527 */
528 struct dp_netdev_flow {
529 const struct flow flow; /* Unmasked flow that created this entry. */
530     /* Hash table node, indexed by the unmasked flow. */
531 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
532 /* 'flow_table'. */
533 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
534 const ovs_u128 ufid; /* Unique flow identifier. */
535 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
536 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
537 /* flow. */
538
539 /* Number of references.
540 * The classifier owns one reference.
541 * Any thread trying to keep a rule from being freed should hold its own
542 * reference. */
543 struct ovs_refcount ref_cnt;
544
545 bool dead;
546 uint32_t mark; /* Unique flow mark assigned to a flow */
547
548 /* Statistics. */
549 struct dp_netdev_flow_stats stats;
550
551 /* Actions. */
552 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
553
554 /* While processing a group of input packets, the datapath uses the next
555 * member to store a pointer to the output batch for the flow. It is
556 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
557 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
558 struct packet_batch_per_flow *batch;
559
560 /* Packet classification. */
561 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
562 /* 'cr' must be the last member. */
563 };
564
565 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
566 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
567 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
568 struct flow *, bool);
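/* A minimal sketch of the reference rules documented above, for code that
 * must keep a flow beyond the current RCU grace period (error handling
 * elided):
 *
 *     if (dp_netdev_flow_ref(flow)) {
 *         ... use 'flow'; it cannot be freed, but it may already have been
 *             removed from the classifier and its members may still change ...
 *         dp_netdev_flow_unref(flow);
 *     }
 *
 * Within a single RCU grace period no extra reference is needed.
 */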
569
570 /* A set of datapath actions within a "struct dp_netdev_flow".
571 *
572 *
573 * Thread-safety
574 * =============
575 *
576 * A struct dp_netdev_actions 'actions' is protected with RCU. */
577 struct dp_netdev_actions {
578 /* These members are immutable: they do not change during the struct's
579 * lifetime. */
580 unsigned int size; /* Size of 'actions', in bytes. */
581 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
582 };
583
584 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
585 size_t);
586 struct dp_netdev_actions *dp_netdev_flow_get_actions(
587 const struct dp_netdev_flow *);
588 static void dp_netdev_actions_free(struct dp_netdev_actions *);
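/* Illustrative sketch: reading a flow's actions under RCU protection using
 * the accessor declared above; the usage constraints are a restatement of
 * the thread-safety comment, not new rules:
 *
 *     struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
 *
 *     ... use actions->actions and actions->size only until the current
 *         thread quiesces; do not cache the pointer across quiescent states ...
 */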
589
590 struct polled_queue {
591 struct dp_netdev_rxq *rxq;
592 odp_port_t port_no;
593 bool emc_enabled;
594 bool rxq_enabled;
595 uint64_t change_seq;
596 };
597
598 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
599 struct rxq_poll {
600 struct dp_netdev_rxq *rxq;
601 struct hmap_node node;
602 };
603
604 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
605 * 'tnl_port_cache' or 'tx_ports'. */
606 struct tx_port {
607 struct dp_netdev_port *port;
608 int qid;
609 long long last_used;
610 struct hmap_node node;
611 long long flush_time;
612 struct dp_packet_batch output_pkts;
613 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
614 };
615
616 /* A set of properties for the current processing loop that is not directly
617 * associated with the pmd thread itself, but with the packets being
618 * processed or the short-term system configuration (for example, time).
619 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
620 struct dp_netdev_pmd_thread_ctx {
621 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
622 long long now;
623 /* RX queue from which last packet was received. */
624 struct dp_netdev_rxq *last_rxq;
625 /* EMC insertion probability context for the current processing cycle. */
626 uint32_t emc_insert_min;
627 };
628
629 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
630  * the performance overhead of interrupt processing.  Therefore netdev can
631  * not implement rx-wait for these devices.  dpif-netdev needs to poll
632  * these devices to check their receive buffers.  A pmd thread polls the
633  * devices assigned to it.
634  *
635  * DPDK uses PMDs to access NICs.
636  *
637  * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
638  * I/O of all non-pmd threads.  No actual thread is created for that
639  * instance.
640  *
641  * Each struct has its own flow cache and a classifier per managed ingress
642  * port.  For packets received on an ingress port, a lookup is done in the
643  * corresponding PMD thread's flow cache and, in case of a miss, in the
644  * corresponding classifier of that port.  In either case the packets are
645  * executed with the actions found.
646  */
647 struct dp_netdev_pmd_thread {
648 struct dp_netdev *dp;
649 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
650 struct cmap_node node; /* In 'dp->poll_threads'. */
651
652 /* Per thread exact-match cache. Note, the instance for cpu core
653      * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
654      * needs to be protected by 'non_pmd_mutex'.  Every other instance
655 * will only be accessed by its own pmd thread. */
656 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
657
658 /* Flow-Table and classifiers
659 *
660 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
661 * changes to 'classifiers' must be made while still holding the
662 * 'flow_mutex'.
663 */
664 struct ovs_mutex flow_mutex;
665 struct cmap flow_table OVS_GUARDED; /* Flow table. */
666
667 /* One classifier per in_port polled by the pmd */
668 struct cmap classifiers;
669 /* Periodically sort subtable vectors according to hit frequencies */
670 long long int next_optimization;
671 /* End of the next time interval for which processing cycles
672 are stored for each polled rxq. */
673 long long int rxq_next_cycle_store;
674
675 /* Last interval timestamp. */
676 uint64_t intrvl_tsc_prev;
677 /* Last interval cycles. */
678 atomic_ullong intrvl_cycles;
679
680 /* Current context of the PMD thread. */
681 struct dp_netdev_pmd_thread_ctx ctx;
682
683 struct seq *reload_seq;
684 uint64_t last_reload_seq;
685
686 /* These are atomic variables used as a synchronization and configuration
687 * points for thread reload/exit.
688 *
689 * 'reload' atomic is the main one and it's used as a memory
690 * synchronization point for all other knobs and data.
691 *
692 * For a thread that requests PMD reload:
693 *
694 * * All changes that should be visible to the PMD thread must be made
695 * before setting the 'reload'. These changes could use any memory
696 * ordering model including 'relaxed'.
697 * * Setting the 'reload' atomic should occur in the same thread where
698      *    all other PMD configuration options are updated.
699 * * Setting the 'reload' atomic should be done with 'release' memory
700 * ordering model or stricter. This will guarantee that all previous
701 * changes (including non-atomic and 'relaxed') will be visible to
702 * the PMD thread.
703      *  * To check that the reload is done, the thread should poll the 'reload'
704      *    atomic until it becomes 'false'.  Polling should be done with 'acquire'
705      *    memory ordering model or stricter.  This ensures that the PMD thread
706      *    has completed the reload process.
707 *
708 * For the PMD thread:
709 *
710 * * PMD thread should read 'reload' atomic with 'acquire' memory
711 * ordering model or stricter. This will guarantee that all changes
712 * made before setting the 'reload' in the requesting thread will be
713 * visible to the PMD thread.
714 * * All other configuration data could be read with any memory
715 * ordering model (including non-atomic and 'relaxed') but *only after*
716 * reading the 'reload' atomic set to 'true'.
717      *  * When the PMD reload is done, the PMD should (optionally) set all the
718      *    below knobs except the 'reload' to their default ('false') values and
719      *    (mandatory), as the last step, set 'reload' to 'false' using the
720      *    'release' memory ordering model or stricter.  This informs the
721      *    requesting thread that the PMD has completed a reload cycle.
722 */
723 atomic_bool reload; /* Do we need to reload ports? */
724 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
725 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
726 atomic_bool exit; /* For terminating the pmd thread. */
727
728 pthread_t thread;
729 unsigned core_id; /* CPU core id of this pmd thread. */
730 int numa_id; /* numa node id of this pmd thread. */
731 bool isolated;
732
733 /* Queue id used by this pmd thread to send packets on all netdevs if
734      * XPS is disabled for that netdev.  All static_tx_qid's are unique and less
735 * than 'cmap_count(dp->poll_threads)'. */
736 uint32_t static_tx_qid;
737
738 /* Number of filled output batches. */
739 int n_output_batches;
740
741 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
742 /* List of rx queues to poll. */
743 struct hmap poll_list OVS_GUARDED;
744 /* Map of 'tx_port's used for transmission. Written by the main thread,
745 * read by the pmd thread. */
746 struct hmap tx_ports OVS_GUARDED;
747
748 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
749 * ports (that support push_tunnel/pop_tunnel), the other contains ports
750 * with at least one txq (that support send). A port can be in both.
751 *
752 * There are two separate maps to make sure that we don't try to execute
753 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
754 *
755 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
756      * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
757 * other instance will only be accessed by its own pmd thread. */
758 struct hmap tnl_port_cache;
759 struct hmap send_port_cache;
760
761 /* Keep track of detailed PMD performance statistics. */
762 struct pmd_perf_stats perf_stats;
763
764 /* Stats from previous iteration used by automatic pmd
765 * load balance logic. */
766 uint64_t prev_stats[PMD_N_STATS];
767 atomic_count pmd_overloaded;
768
769 /* Set to true if the pmd thread needs to be reloaded. */
770 bool need_reload;
771 };
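/* A high-level sketch of the per-packet lookup cascade described in the
 * comment above (EMC, then SMC when enabled, then the per-in_port
 * classifier, then upcall).  The real code batches packets at every stage;
 * smc_lookup() and the classifier selection below are named or paraphrased
 * hypothetically:
 *
 *     flow = emc_lookup(&pmd->flow_cache.emc_cache, key);
 *     if (!flow && smc_enabled) {
 *         flow = smc_lookup(&pmd->flow_cache.smc_cache, key);
 *     }
 *     if (!flow) {
 *         const struct netdev_flow_key *keys[1] = { key };
 *         struct dpcls_rule *rule = NULL;
 *         struct dpcls *cls = ... classifier for the packet's in_port ...;
 *
 *         if (dpcls_lookup(cls, keys, &rule, 1, NULL)) {
 *             ... 'rule' belongs to the matching dp_netdev_flow ...
 *         } else {
 *             ... miss: invoke 'upcall_cb' and install the resulting flow ...
 *         }
 *     }
 */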
772
773 /* Interface to netdev-based datapath. */
774 struct dpif_netdev {
775 struct dpif dpif;
776 struct dp_netdev *dp;
777 uint64_t last_port_seq;
778 };
779
780 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
781 struct dp_netdev_port **portp)
782 OVS_REQUIRES(dp->port_mutex);
783 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
784 struct dp_netdev_port **portp)
785 OVS_REQUIRES(dp->port_mutex);
786 static void dp_netdev_free(struct dp_netdev *)
787 OVS_REQUIRES(dp_netdev_mutex);
788 static int do_add_port(struct dp_netdev *dp, const char *devname,
789 const char *type, odp_port_t port_no)
790 OVS_REQUIRES(dp->port_mutex);
791 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
792 OVS_REQUIRES(dp->port_mutex);
793 static int dpif_netdev_open(const struct dpif_class *, const char *name,
794 bool create, struct dpif **);
795 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
796 struct dp_packet_batch *,
797 bool should_steal,
798 const struct flow *flow,
799 const struct nlattr *actions,
800 size_t actions_len);
801 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
802 struct dp_packet_batch *, odp_port_t port_no);
803 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
804 struct dp_packet_batch *);
805
806 static void dp_netdev_disable_upcall(struct dp_netdev *);
807 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
808 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
809 struct dp_netdev *dp, unsigned core_id,
810 int numa_id);
811 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
812 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
813 OVS_REQUIRES(dp->port_mutex);
814
815 static void *pmd_thread_main(void *);
816 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
817 unsigned core_id);
818 static struct dp_netdev_pmd_thread *
819 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
820 static void dp_netdev_del_pmd(struct dp_netdev *dp,
821 struct dp_netdev_pmd_thread *pmd);
822 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
823 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
824 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
825 struct dp_netdev_port *port)
826 OVS_REQUIRES(pmd->port_mutex);
827 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
828 struct tx_port *tx)
829 OVS_REQUIRES(pmd->port_mutex);
830 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
831 struct dp_netdev_rxq *rxq)
832 OVS_REQUIRES(pmd->port_mutex);
833 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
834 struct rxq_poll *poll)
835 OVS_REQUIRES(pmd->port_mutex);
836 static int
837 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
838 bool force);
839
840 static void reconfigure_datapath(struct dp_netdev *dp)
841 OVS_REQUIRES(dp->port_mutex);
842 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
843 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
844 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
845 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
846 OVS_REQUIRES(pmd->port_mutex);
847 static inline void
848 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
849 struct polled_queue *poll_list, int poll_cnt);
850 static void
851 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
852 enum rxq_cycles_counter_type type,
853 unsigned long long cycles);
854 static uint64_t
855 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
856 enum rxq_cycles_counter_type type);
857 static void
858 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
859 unsigned long long cycles);
860 static uint64_t
861 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
862 static void
863 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
864 bool purge);
865 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
866 struct tx_port *tx);
867
868 static inline bool emc_entry_alive(struct emc_entry *ce);
869 static void emc_clear_entry(struct emc_entry *ce);
870 static void smc_clear_entry(struct smc_bucket *b, int idx);
871
872 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
873 static inline bool
874 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
875 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
876 struct dp_netdev_flow *flow);
877
878 static void
879 emc_cache_init(struct emc_cache *flow_cache)
880 {
881 int i;
882
883 flow_cache->sweep_idx = 0;
884 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
885 flow_cache->entries[i].flow = NULL;
886 flow_cache->entries[i].key.hash = 0;
887 flow_cache->entries[i].key.len = sizeof(struct miniflow);
888 flowmap_init(&flow_cache->entries[i].key.mf.map);
889 }
890 }
891
892 static void
893 smc_cache_init(struct smc_cache *smc_cache)
894 {
895 int i, j;
896 for (i = 0; i < SMC_BUCKET_CNT; i++) {
897 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
898 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
899 }
900 }
901 }
902
903 static void
904 dfc_cache_init(struct dfc_cache *flow_cache)
905 {
906 emc_cache_init(&flow_cache->emc_cache);
907 smc_cache_init(&flow_cache->smc_cache);
908 }
909
910 static void
911 emc_cache_uninit(struct emc_cache *flow_cache)
912 {
913 int i;
914
915 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
916 emc_clear_entry(&flow_cache->entries[i]);
917 }
918 }
919
920 static void
921 smc_cache_uninit(struct smc_cache *smc)
922 {
923 int i, j;
924
925 for (i = 0; i < SMC_BUCKET_CNT; i++) {
926 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
927 smc_clear_entry(&(smc->buckets[i]), j);
928 }
929 }
930 }
931
932 static void
933 dfc_cache_uninit(struct dfc_cache *flow_cache)
934 {
935 smc_cache_uninit(&flow_cache->smc_cache);
936 emc_cache_uninit(&flow_cache->emc_cache);
937 }
938
939 /* Check and clear dead flow references slowly (one entry at each
940 * invocation). */
941 static void
942 emc_cache_slow_sweep(struct emc_cache *flow_cache)
943 {
944 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
945
946 if (!emc_entry_alive(entry)) {
947 emc_clear_entry(entry);
948 }
949 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
950 }
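/* A minimal sketch of how the slow sweep is meant to be driven, e.g. from
 * the pmd main loop every so many iterations; the counter and threshold are
 * illustrative, not taken from this file:
 *
 *     if (lc++ > 1024) {
 *         lc = 0;
 *         emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
 *     }
 *
 * Sweeping one entry per call keeps the cost per iteration negligible while
 * still reclaiming entries whose flows have died.
 */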
951
952 /* Updates the time in the PMD thread's context and should be called in three cases:
953 *
954 * 1. PMD structure initialization:
955 * - dp_netdev_configure_pmd()
956 *
957 * 2. Before processing of the new packet batch:
958 * - dpif_netdev_execute()
959 * - dp_netdev_process_rxq_port()
960 *
961 * 3. At least once per polling iteration in main polling threads if no
962  *    packets were received in the current iteration:
963 * - dpif_netdev_run()
964 * - pmd_thread_main()
965 *
966 * 'pmd->ctx.now' should be used without update in all other cases if possible.
967 */
968 static inline void
969 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
970 {
971 pmd->ctx.now = time_usec();
972 }
973
974 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
975 bool
976 dpif_is_netdev(const struct dpif *dpif)
977 {
978 return dpif->dpif_class->open == dpif_netdev_open;
979 }
980
981 static struct dpif_netdev *
982 dpif_netdev_cast(const struct dpif *dpif)
983 {
984 ovs_assert(dpif_is_netdev(dpif));
985 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
986 }
987
988 static struct dp_netdev *
989 get_dp_netdev(const struct dpif *dpif)
990 {
991 return dpif_netdev_cast(dpif)->dp;
992 }
993 \f
994 enum pmd_info_type {
995 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
996 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
997 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
998 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
999 };
1000
1001 static void
1002 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1003 {
1004 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
1005 ? "main thread" : "pmd thread");
1006 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1007 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1008 }
1009 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1010 ds_put_format(reply, " core_id %u", pmd->core_id);
1011 }
1012 ds_put_cstr(reply, ":\n");
1013 }
1014
1015 static void
1016 pmd_info_show_stats(struct ds *reply,
1017 struct dp_netdev_pmd_thread *pmd)
1018 {
1019 uint64_t stats[PMD_N_STATS];
1020 uint64_t total_cycles, total_packets;
1021 double passes_per_pkt = 0;
1022 double lookups_per_hit = 0;
1023 double packets_per_batch = 0;
1024
1025 pmd_perf_read_counters(&pmd->perf_stats, stats);
1026 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1027 + stats[PMD_CYCLES_ITER_BUSY];
1028 total_packets = stats[PMD_STAT_RECV];
1029
1030 format_pmd_thread(reply, pmd);
1031
1032 if (total_packets > 0) {
1033 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1034 / (double) total_packets;
1035 }
1036 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1037 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1038 / (double) stats[PMD_STAT_MASKED_HIT];
1039 }
1040 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1041 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1042 / (double) stats[PMD_STAT_SENT_BATCHES];
1043 }
1044
1045 ds_put_format(reply,
1046 " packets received: %"PRIu64"\n"
1047 " packet recirculations: %"PRIu64"\n"
1048 " avg. datapath passes per packet: %.02f\n"
1049 " emc hits: %"PRIu64"\n"
1050 " smc hits: %"PRIu64"\n"
1051 " megaflow hits: %"PRIu64"\n"
1052 " avg. subtable lookups per megaflow hit: %.02f\n"
1053 " miss with success upcall: %"PRIu64"\n"
1054 " miss with failed upcall: %"PRIu64"\n"
1055 " avg. packets per output batch: %.02f\n",
1056 total_packets, stats[PMD_STAT_RECIRC],
1057 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1058 stats[PMD_STAT_SMC_HIT],
1059 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1060 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1061 packets_per_batch);
1062
1063 if (total_cycles == 0) {
1064 return;
1065 }
1066
1067 ds_put_format(reply,
1068 " idle cycles: %"PRIu64" (%.02f%%)\n"
1069 " processing cycles: %"PRIu64" (%.02f%%)\n",
1070 stats[PMD_CYCLES_ITER_IDLE],
1071 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1072 stats[PMD_CYCLES_ITER_BUSY],
1073 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1074
1075 if (total_packets == 0) {
1076 return;
1077 }
1078
1079 ds_put_format(reply,
1080 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1081 total_cycles / (double) total_packets,
1082 total_cycles, total_packets);
1083
1084 ds_put_format(reply,
1085 " avg processing cycles per packet: "
1086 "%.02f (%"PRIu64"/%"PRIu64")\n",
1087 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1088 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1089 }
1090
1091 static void
1092 pmd_info_show_perf(struct ds *reply,
1093 struct dp_netdev_pmd_thread *pmd,
1094 struct pmd_perf_params *par)
1095 {
1096 if (pmd->core_id != NON_PMD_CORE_ID) {
1097 char *time_str =
1098 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1099 long long now = time_msec();
1100 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1101
1102 ds_put_cstr(reply, "\n");
1103 ds_put_format(reply, "Time: %s\n", time_str);
1104 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1105 ds_put_cstr(reply, "\n");
1106 format_pmd_thread(reply, pmd);
1107 ds_put_cstr(reply, "\n");
1108 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1109 if (pmd_perf_metrics_enabled(pmd)) {
1110 /* Prevent parallel clearing of perf metrics. */
1111 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1112 if (par->histograms) {
1113 ds_put_cstr(reply, "\n");
1114 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1115 }
1116 if (par->iter_hist_len > 0) {
1117 ds_put_cstr(reply, "\n");
1118 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1119 par->iter_hist_len);
1120 }
1121 if (par->ms_hist_len > 0) {
1122 ds_put_cstr(reply, "\n");
1123 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1124 par->ms_hist_len);
1125 }
1126 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1127 }
1128 free(time_str);
1129 }
1130 }
1131
1132 static int
1133 compare_poll_list(const void *a_, const void *b_)
1134 {
1135 const struct rxq_poll *a = a_;
1136 const struct rxq_poll *b = b_;
1137
1138 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1139 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1140
1141 int cmp = strcmp(namea, nameb);
1142 if (!cmp) {
1143 return netdev_rxq_get_queue_id(a->rxq->rx)
1144 - netdev_rxq_get_queue_id(b->rxq->rx);
1145 } else {
1146 return cmp;
1147 }
1148 }
1149
1150 static void
1151 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1152 size_t *n)
1153 OVS_REQUIRES(pmd->port_mutex)
1154 {
1155 struct rxq_poll *ret, *poll;
1156 size_t i;
1157
1158 *n = hmap_count(&pmd->poll_list);
1159 if (!*n) {
1160 ret = NULL;
1161 } else {
1162 ret = xcalloc(*n, sizeof *ret);
1163 i = 0;
1164 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1165 ret[i] = *poll;
1166 i++;
1167 }
1168 ovs_assert(i == *n);
1169 qsort(ret, *n, sizeof *ret, compare_poll_list);
1170 }
1171
1172 *list = ret;
1173 }
1174
1175 static void
1176 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1177 {
1178 if (pmd->core_id != NON_PMD_CORE_ID) {
1179 struct rxq_poll *list;
1180 size_t n_rxq;
1181 uint64_t total_cycles = 0;
1182
1183 ds_put_format(reply,
1184 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1185 pmd->numa_id, pmd->core_id, (pmd->isolated)
1186 ? "true" : "false");
1187
1188 ovs_mutex_lock(&pmd->port_mutex);
1189 sorted_poll_list(pmd, &list, &n_rxq);
1190
1191 /* Get the total pmd cycles for an interval. */
1192 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1193 /* Estimate the cycles to cover all intervals. */
1194 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1195
1196 for (int i = 0; i < n_rxq; i++) {
1197 struct dp_netdev_rxq *rxq = list[i].rxq;
1198 const char *name = netdev_rxq_get_name(rxq->rx);
1199 uint64_t proc_cycles = 0;
1200
1201 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1202 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1203 }
1204 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1205 netdev_rxq_get_queue_id(list[i].rxq->rx));
1206 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1207 ? "(enabled) " : "(disabled)");
1208 ds_put_format(reply, " pmd usage: ");
1209 if (total_cycles) {
1210 ds_put_format(reply, "%2"PRIu64"",
1211 proc_cycles * 100 / total_cycles);
1212 ds_put_cstr(reply, " %");
1213 } else {
1214 ds_put_format(reply, "%s", "NOT AVAIL");
1215 }
1216 ds_put_cstr(reply, "\n");
1217 }
1218 ovs_mutex_unlock(&pmd->port_mutex);
1219 free(list);
1220 }
1221 }
1222
1223 static int
1224 compare_poll_thread_list(const void *a_, const void *b_)
1225 {
1226 const struct dp_netdev_pmd_thread *a, *b;
1227
1228 a = *(struct dp_netdev_pmd_thread **)a_;
1229 b = *(struct dp_netdev_pmd_thread **)b_;
1230
1231 if (a->core_id < b->core_id) {
1232 return -1;
1233 }
1234 if (a->core_id > b->core_id) {
1235 return 1;
1236 }
1237 return 0;
1238 }
1239
1240 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1241 * this list, as long as we do not go to quiescent state. */
1242 static void
1243 sorted_poll_thread_list(struct dp_netdev *dp,
1244 struct dp_netdev_pmd_thread ***list,
1245 size_t *n)
1246 {
1247 struct dp_netdev_pmd_thread *pmd;
1248 struct dp_netdev_pmd_thread **pmd_list;
1249 size_t k = 0, n_pmds;
1250
1251 n_pmds = cmap_count(&dp->poll_threads);
1252 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1253
1254 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1255 if (k >= n_pmds) {
1256 break;
1257 }
1258 pmd_list[k++] = pmd;
1259 }
1260
1261 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1262
1263 *list = pmd_list;
1264 *n = k;
1265 }
1266
1267 static void
1268 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1269 const char *argv[], void *aux OVS_UNUSED)
1270 {
1271 struct ds reply = DS_EMPTY_INITIALIZER;
1272 struct dp_netdev *dp = NULL;
1273
1274 ovs_mutex_lock(&dp_netdev_mutex);
1275
1276 if (argc == 2) {
1277 dp = shash_find_data(&dp_netdevs, argv[1]);
1278 } else if (shash_count(&dp_netdevs) == 1) {
1279 /* There's only one datapath */
1280 dp = shash_first(&dp_netdevs)->data;
1281 }
1282
1283 if (!dp) {
1284 ovs_mutex_unlock(&dp_netdev_mutex);
1285 unixctl_command_reply_error(conn,
1286 "please specify an existing datapath");
1287 return;
1288 }
1289
1290 dp_netdev_request_reconfigure(dp);
1291 ovs_mutex_unlock(&dp_netdev_mutex);
1292 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1293 unixctl_command_reply(conn, ds_cstr(&reply));
1294 ds_destroy(&reply);
1295 }
1296
1297 static void
1298 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1299 void *aux)
1300 {
1301 struct ds reply = DS_EMPTY_INITIALIZER;
1302 struct dp_netdev_pmd_thread **pmd_list;
1303 struct dp_netdev *dp = NULL;
1304 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1305 unsigned int core_id;
1306 bool filter_on_pmd = false;
1307 size_t n;
1308
1309 ovs_mutex_lock(&dp_netdev_mutex);
1310
1311 while (argc > 1) {
1312 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1313 if (str_to_uint(argv[2], 10, &core_id)) {
1314 filter_on_pmd = true;
1315 }
1316 argc -= 2;
1317 argv += 2;
1318 } else {
1319 dp = shash_find_data(&dp_netdevs, argv[1]);
1320 argc -= 1;
1321 argv += 1;
1322 }
1323 }
1324
1325 if (!dp) {
1326 if (shash_count(&dp_netdevs) == 1) {
1327 /* There's only one datapath */
1328 dp = shash_first(&dp_netdevs)->data;
1329 } else {
1330 ovs_mutex_unlock(&dp_netdev_mutex);
1331 unixctl_command_reply_error(conn,
1332 "please specify an existing datapath");
1333 return;
1334 }
1335 }
1336
1337 sorted_poll_thread_list(dp, &pmd_list, &n);
1338 for (size_t i = 0; i < n; i++) {
1339 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1340 if (!pmd) {
1341 break;
1342 }
1343 if (filter_on_pmd && pmd->core_id != core_id) {
1344 continue;
1345 }
1346 if (type == PMD_INFO_SHOW_RXQ) {
1347 pmd_info_show_rxq(&reply, pmd);
1348 } else if (type == PMD_INFO_CLEAR_STATS) {
1349 pmd_perf_stats_clear(&pmd->perf_stats);
1350 } else if (type == PMD_INFO_SHOW_STATS) {
1351 pmd_info_show_stats(&reply, pmd);
1352 } else if (type == PMD_INFO_PERF_SHOW) {
1353 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1354 }
1355 }
1356 free(pmd_list);
1357
1358 ovs_mutex_unlock(&dp_netdev_mutex);
1359
1360 unixctl_command_reply(conn, ds_cstr(&reply));
1361 ds_destroy(&reply);
1362 }
1363
1364 static void
1365 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1366 const char *argv[],
1367 void *aux OVS_UNUSED)
1368 {
1369 struct pmd_perf_params par;
1370 long int it_hist = 0, ms_hist = 0;
1371 par.histograms = true;
1372
1373 while (argc > 1) {
1374 if (!strcmp(argv[1], "-nh")) {
1375 par.histograms = false;
1376 argc -= 1;
1377 argv += 1;
1378 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1379 it_hist = strtol(argv[2], NULL, 10);
1380 if (it_hist < 0) {
1381 it_hist = 0;
1382 } else if (it_hist > HISTORY_LEN) {
1383 it_hist = HISTORY_LEN;
1384 }
1385 argc -= 2;
1386 argv += 2;
1387 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1388 ms_hist = strtol(argv[2], NULL, 10);
1389 if (ms_hist < 0) {
1390 ms_hist = 0;
1391 } else if (ms_hist > HISTORY_LEN) {
1392 ms_hist = HISTORY_LEN;
1393 }
1394 argc -= 2;
1395 argv += 2;
1396 } else {
1397 break;
1398 }
1399 }
1400 par.iter_hist_len = it_hist;
1401 par.ms_hist_len = ms_hist;
1402 par.command_type = PMD_INFO_PERF_SHOW;
1403 dpif_netdev_pmd_info(conn, argc, argv, &par);
1404 }
1405 \f
1406 static int
1407 dpif_netdev_init(void)
1408 {
1409 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1410 clear_aux = PMD_INFO_CLEAR_STATS,
1411 poll_aux = PMD_INFO_SHOW_RXQ;
1412
1413 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1414 0, 3, dpif_netdev_pmd_info,
1415 (void *)&show_aux);
1416 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1417 0, 3, dpif_netdev_pmd_info,
1418 (void *)&clear_aux);
1419 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1420 0, 3, dpif_netdev_pmd_info,
1421 (void *)&poll_aux);
1422 unixctl_command_register("dpif-netdev/pmd-perf-show",
1423 "[-nh] [-it iter-history-len]"
1424 " [-ms ms-history-len]"
1425 " [-pmd core] [dp]",
1426 0, 8, pmd_perf_show_cmd,
1427 NULL);
1428 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1429 0, 1, dpif_netdev_pmd_rebalance,
1430 NULL);
1431 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1432 "on|off [-b before] [-a after] [-e|-ne] "
1433 "[-us usec] [-q qlen]",
1434 0, 10, pmd_perf_log_set_cmd,
1435 NULL);
1436 return 0;
1437 }
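/* Example invocations of the commands registered above, via ovs-appctl:
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *     ovs-appctl dpif-netdev/pmd-perf-show -it 10 -ms 5
 *     ovs-appctl dpif-netdev/pmd-stats-clear
 *
 * The optional trailing [dp] argument selects a datapath when more than one
 * exists; with a single datapath it can be omitted.
 */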
1438
1439 static int
1440 dpif_netdev_enumerate(struct sset *all_dps,
1441 const struct dpif_class *dpif_class)
1442 {
1443 struct shash_node *node;
1444
1445 ovs_mutex_lock(&dp_netdev_mutex);
1446 SHASH_FOR_EACH(node, &dp_netdevs) {
1447 struct dp_netdev *dp = node->data;
1448 if (dpif_class != dp->class) {
1449 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1450 * If the class doesn't match, skip this dpif. */
1451 continue;
1452 }
1453 sset_add(all_dps, node->name);
1454 }
1455 ovs_mutex_unlock(&dp_netdev_mutex);
1456
1457 return 0;
1458 }
1459
1460 static bool
1461 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1462 {
1463 return class != &dpif_netdev_class;
1464 }
1465
1466 static const char *
1467 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1468 {
1469 return strcmp(type, "internal") ? type
1470 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1471 : "tap";
1472 }
1473
1474 static struct dpif *
1475 create_dpif_netdev(struct dp_netdev *dp)
1476 {
1477 uint16_t netflow_id = hash_string(dp->name, 0);
1478 struct dpif_netdev *dpif;
1479
1480 ovs_refcount_ref(&dp->ref_cnt);
1481
1482 dpif = xmalloc(sizeof *dpif);
1483 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1484 dpif->dp = dp;
1485 dpif->last_port_seq = seq_read(dp->port_seq);
1486
1487 return &dpif->dpif;
1488 }
1489
1490 /* Choose an unused, non-zero port number and return it on success.
1491 * Return ODPP_NONE on failure. */
1492 static odp_port_t
1493 choose_port(struct dp_netdev *dp, const char *name)
1494 OVS_REQUIRES(dp->port_mutex)
1495 {
1496 uint32_t port_no;
1497
1498 if (dp->class != &dpif_netdev_class) {
1499 const char *p;
1500 int start_no = 0;
1501
1502 /* If the port name begins with "br", start the number search at
1503 * 100 to make writing tests easier. */
1504 if (!strncmp(name, "br", 2)) {
1505 start_no = 100;
1506 }
1507
1508 /* If the port name contains a number, try to assign that port number.
1509 * This can make writing unit tests easier because port numbers are
1510 * predictable. */
1511 for (p = name; *p != '\0'; p++) {
1512 if (isdigit((unsigned char) *p)) {
1513 port_no = start_no + strtol(p, NULL, 10);
1514 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1515 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1516 return u32_to_odp(port_no);
1517 }
1518 break;
1519 }
1520 }
1521 }
1522
1523 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1524 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1525 return u32_to_odp(port_no);
1526 }
1527 }
1528
1529 return ODPP_NONE;
1530 }
1531
1532 static int
1533 create_dp_netdev(const char *name, const struct dpif_class *class,
1534 struct dp_netdev **dpp)
1535 OVS_REQUIRES(dp_netdev_mutex)
1536 {
1537 struct dp_netdev *dp;
1538 int error;
1539
1540 dp = xzalloc(sizeof *dp);
1541 shash_add(&dp_netdevs, name, dp);
1542
1543 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1544 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1545 ovs_refcount_init(&dp->ref_cnt);
1546 atomic_flag_clear(&dp->destroyed);
1547
1548 ovs_mutex_init(&dp->port_mutex);
1549 hmap_init(&dp->ports);
1550 dp->port_seq = seq_create();
1551 fat_rwlock_init(&dp->upcall_rwlock);
1552
1553 dp->reconfigure_seq = seq_create();
1554 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1555
1556 for (int i = 0; i < N_METER_LOCKS; ++i) {
1557 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1558 }
1559
1560 /* Disable upcalls by default. */
1561 dp_netdev_disable_upcall(dp);
1562 dp->upcall_aux = NULL;
1563 dp->upcall_cb = NULL;
1564
1565 dp->conntrack = conntrack_init();
1566
1567 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1568 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1569
1570 cmap_init(&dp->poll_threads);
1571 dp->pmd_rxq_assign_cyc = true;
1572
1573 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1574 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1575 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1576
1577 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1578 ovsthread_key_create(&dp->per_pmd_key, NULL);
1579
1580 ovs_mutex_lock(&dp->port_mutex);
1581 /* non-PMD will be created before all other threads and will
1582 * allocate static_tx_qid = 0. */
1583 dp_netdev_set_nonpmd(dp);
1584
1585 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1586 "internal"),
1587 ODPP_LOCAL);
1588 ovs_mutex_unlock(&dp->port_mutex);
1589 if (error) {
1590 dp_netdev_free(dp);
1591 return error;
1592 }
1593
1594 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1595 *dpp = dp;
1596 return 0;
1597 }
1598
1599 static void
1600 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1601 {
1602 seq_change(dp->reconfigure_seq);
1603 }
1604
1605 static bool
1606 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1607 {
1608 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1609 }
1610
1611 static int
1612 dpif_netdev_open(const struct dpif_class *class, const char *name,
1613 bool create, struct dpif **dpifp)
1614 {
1615 struct dp_netdev *dp;
1616 int error;
1617
1618 ovs_mutex_lock(&dp_netdev_mutex);
1619 dp = shash_find_data(&dp_netdevs, name);
1620 if (!dp) {
1621 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1622 } else {
1623 error = (dp->class != class ? EINVAL
1624 : create ? EEXIST
1625 : 0);
1626 }
1627 if (!error) {
1628 *dpifp = create_dpif_netdev(dp);
1629 dp->dpif = *dpifp;
1630 }
1631 ovs_mutex_unlock(&dp_netdev_mutex);
1632
1633 return error;
1634 }
1635
1636 static void
1637 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1638 OVS_NO_THREAD_SAFETY_ANALYSIS
1639 {
1640 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1641 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1642
1643 /* Before freeing a lock we should release it */
1644 fat_rwlock_unlock(&dp->upcall_rwlock);
1645 fat_rwlock_destroy(&dp->upcall_rwlock);
1646 }
1647
1648 static void
1649 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1650 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1651 {
1652 if (dp->meters[meter_id]) {
1653 free(dp->meters[meter_id]);
1654 dp->meters[meter_id] = NULL;
1655 }
1656 }
1657
1658 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1659 * through the 'dp_netdevs' shash while freeing 'dp'. */
1660 static void
1661 dp_netdev_free(struct dp_netdev *dp)
1662 OVS_REQUIRES(dp_netdev_mutex)
1663 {
1664 struct dp_netdev_port *port, *next;
1665
1666 shash_find_and_delete(&dp_netdevs, dp->name);
1667
1668 ovs_mutex_lock(&dp->port_mutex);
1669 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1670 do_del_port(dp, port);
1671 }
1672 ovs_mutex_unlock(&dp->port_mutex);
1673
1674 dp_netdev_destroy_all_pmds(dp, true);
1675 cmap_destroy(&dp->poll_threads);
1676
1677 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1678 id_pool_destroy(dp->tx_qid_pool);
1679
1680 ovs_mutex_destroy(&dp->non_pmd_mutex);
1681 ovsthread_key_delete(dp->per_pmd_key);
1682
1683 conntrack_destroy(dp->conntrack);
1684
1685
1686 seq_destroy(dp->reconfigure_seq);
1687
1688 seq_destroy(dp->port_seq);
1689 hmap_destroy(&dp->ports);
1690 ovs_mutex_destroy(&dp->port_mutex);
1691
1692 /* Upcalls must be disabled at this point */
1693 dp_netdev_destroy_upcall_lock(dp);
1694
1695 int i;
1696
1697 for (i = 0; i < MAX_METERS; ++i) {
1698 meter_lock(dp, i);
1699 dp_delete_meter(dp, i);
1700 meter_unlock(dp, i);
1701 }
1702 for (i = 0; i < N_METER_LOCKS; ++i) {
1703 ovs_mutex_destroy(&dp->meter_locks[i]);
1704 }
1705
1706 free(dp->pmd_cmask);
1707 free(CONST_CAST(char *, dp->name));
1708 free(dp);
1709 }
1710
1711 static void
1712 dp_netdev_unref(struct dp_netdev *dp)
1713 {
1714 if (dp) {
1715 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1716 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1717 ovs_mutex_lock(&dp_netdev_mutex);
1718 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1719 dp_netdev_free(dp);
1720 }
1721 ovs_mutex_unlock(&dp_netdev_mutex);
1722 }
1723 }
1724
1725 static void
1726 dpif_netdev_close(struct dpif *dpif)
1727 {
1728 struct dp_netdev *dp = get_dp_netdev(dpif);
1729
1730 dp_netdev_unref(dp);
1731 free(dpif);
1732 }
1733
1734 static int
1735 dpif_netdev_destroy(struct dpif *dpif)
1736 {
1737 struct dp_netdev *dp = get_dp_netdev(dpif);
1738
1739 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1740 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1741 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1742 OVS_NOT_REACHED();
1743 }
1744 }
1745
1746 return 0;
1747 }
1748
1749 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1750 * load/store semantics. While the increment is not atomic, the load and
1751 * store operations are, making it impossible to read inconsistent values.
1752 *
1753 * This is used to update thread-local stats counters. */
1754 static void
1755 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1756 {
1757 unsigned long long tmp;
1758
1759 atomic_read_relaxed(var, &tmp);
1760 tmp += n;
1761 atomic_store_relaxed(var, tmp);
1762 }
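
/* Usage sketch (illustrative; 'n_hits' and 'batch_size' are hypothetical
 * names): only the owning thread updates the counter, while any thread may
 * read it with atomic_read_relaxed() and sees a possibly stale, but never
 * torn, value.
 *
 *     static atomic_ullong n_hits = ATOMIC_VAR_INIT(0);
 *
 *     non_atomic_ullong_add(&n_hits, batch_size);     owner thread only
 *
 *     unsigned long long snapshot;
 *     atomic_read_relaxed(&n_hits, &snapshot);        any thread
 */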
1763
1764 static int
1765 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1766 {
1767 struct dp_netdev *dp = get_dp_netdev(dpif);
1768 struct dp_netdev_pmd_thread *pmd;
1769 uint64_t pmd_stats[PMD_N_STATS];
1770
1771 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1772 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1773 stats->n_flows += cmap_count(&pmd->flow_table);
1774 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1775 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1776 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1777 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1778 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1779 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1780 }
1781 stats->n_masks = UINT32_MAX;
1782 stats->n_mask_hit = UINT64_MAX;
1783
1784 return 0;
1785 }
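
/* Worked example with hypothetical numbers: two PMD threads reporting
 * (exact, SMC, masked) hits of (100, 20, 30) and (50, 0, 10) plus misses of
 * 5 and 1 aggregate to n_hit = 210 and n_missed = 6.  n_masks and n_mask_hit
 * are reported as "unknown" (UINT32_MAX and UINT64_MAX) because the
 * userspace datapath does not expose per-mask counters here. */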
1786
1787 static void
1788 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1789 {
1790 if (pmd->core_id == NON_PMD_CORE_ID) {
1791 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1792 ovs_mutex_lock(&pmd->port_mutex);
1793 pmd_load_cached_ports(pmd);
1794 ovs_mutex_unlock(&pmd->port_mutex);
1795 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1796 return;
1797 }
1798
1799 seq_change(pmd->reload_seq);
1800 atomic_store_explicit(&pmd->reload, true, memory_order_release);
1801 }
1802
1803 static uint32_t
1804 hash_port_no(odp_port_t port_no)
1805 {
1806 return hash_int(odp_to_u32(port_no), 0);
1807 }
1808
1809 static int
1810 port_create(const char *devname, const char *type,
1811 odp_port_t port_no, struct dp_netdev_port **portp)
1812 {
1813 struct netdev_saved_flags *sf;
1814 struct dp_netdev_port *port;
1815 enum netdev_flags flags;
1816 struct netdev *netdev;
1817 int error;
1818
1819 *portp = NULL;
1820
1821 /* Open and validate network device. */
1822 error = netdev_open(devname, type, &netdev);
1823 if (error) {
1824 return error;
1825 }
1826 /* XXX reject non-Ethernet devices */
1827
1828 netdev_get_flags(netdev, &flags);
1829 if (flags & NETDEV_LOOPBACK) {
1830 VLOG_ERR("%s: cannot add a loopback device", devname);
1831 error = EINVAL;
1832 goto out;
1833 }
1834
1835 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1836 if (error) {
1837 VLOG_ERR("%s: cannot set promisc flag", devname);
1838 goto out;
1839 }
1840
1841 port = xzalloc(sizeof *port);
1842 port->port_no = port_no;
1843 port->netdev = netdev;
1844 port->type = xstrdup(type);
1845 port->sf = sf;
1846 port->emc_enabled = true;
1847 port->need_reconfigure = true;
1848 ovs_mutex_init(&port->txq_used_mutex);
1849
1850 *portp = port;
1851
1852 return 0;
1853
1854 out:
1855 netdev_close(netdev);
1856 return error;
1857 }
1858
1859 static int
1860 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1861 odp_port_t port_no)
1862 OVS_REQUIRES(dp->port_mutex)
1863 {
1864 struct dp_netdev_port *port;
1865 int error;
1866
1867 /* Reject devices already in 'dp'. */
1868 if (!get_port_by_name(dp, devname, &port)) {
1869 return EEXIST;
1870 }
1871
1872 error = port_create(devname, type, port_no, &port);
1873 if (error) {
1874 return error;
1875 }
1876
1877 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1878 seq_change(dp->port_seq);
1879
1880 reconfigure_datapath(dp);
1881
1882 return 0;
1883 }
1884
1885 static int
1886 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1887 odp_port_t *port_nop)
1888 {
1889 struct dp_netdev *dp = get_dp_netdev(dpif);
1890 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1891 const char *dpif_port;
1892 odp_port_t port_no;
1893 int error;
1894
1895 ovs_mutex_lock(&dp->port_mutex);
1896 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1897 if (*port_nop != ODPP_NONE) {
1898 port_no = *port_nop;
1899 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1900 } else {
1901 port_no = choose_port(dp, dpif_port);
1902 error = port_no == ODPP_NONE ? EFBIG : 0;
1903 }
1904 if (!error) {
1905 *port_nop = port_no;
1906 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1907 }
1908 ovs_mutex_unlock(&dp->port_mutex);
1909
1910 return error;
1911 }
1912
1913 static int
1914 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1915 {
1916 struct dp_netdev *dp = get_dp_netdev(dpif);
1917 int error;
1918
1919 ovs_mutex_lock(&dp->port_mutex);
1920 if (port_no == ODPP_LOCAL) {
1921 error = EINVAL;
1922 } else {
1923 struct dp_netdev_port *port;
1924
1925 error = get_port_by_number(dp, port_no, &port);
1926 if (!error) {
1927 do_del_port(dp, port);
1928 }
1929 }
1930 ovs_mutex_unlock(&dp->port_mutex);
1931
1932 return error;
1933 }
1934
1935 static bool
1936 is_valid_port_number(odp_port_t port_no)
1937 {
1938 return port_no != ODPP_NONE;
1939 }
1940
1941 static struct dp_netdev_port *
1942 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1943 OVS_REQUIRES(dp->port_mutex)
1944 {
1945 struct dp_netdev_port *port;
1946
1947 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1948 if (port->port_no == port_no) {
1949 return port;
1950 }
1951 }
1952 return NULL;
1953 }
1954
1955 static int
1956 get_port_by_number(struct dp_netdev *dp,
1957 odp_port_t port_no, struct dp_netdev_port **portp)
1958 OVS_REQUIRES(dp->port_mutex)
1959 {
1960 if (!is_valid_port_number(port_no)) {
1961 *portp = NULL;
1962 return EINVAL;
1963 } else {
1964 *portp = dp_netdev_lookup_port(dp, port_no);
1965 return *portp ? 0 : ENODEV;
1966 }
1967 }
1968
1969 static void
1970 port_destroy(struct dp_netdev_port *port)
1971 {
1972 if (!port) {
1973 return;
1974 }
1975
1976 netdev_close(port->netdev);
1977 netdev_restore_flags(port->sf);
1978
1979 for (unsigned i = 0; i < port->n_rxq; i++) {
1980 netdev_rxq_close(port->rxqs[i].rx);
1981 }
1982 ovs_mutex_destroy(&port->txq_used_mutex);
1983 free(port->rxq_affinity_list);
1984 free(port->txq_used);
1985 free(port->rxqs);
1986 free(port->type);
1987 free(port);
1988 }
1989
1990 static int
1991 get_port_by_name(struct dp_netdev *dp,
1992 const char *devname, struct dp_netdev_port **portp)
1993 OVS_REQUIRES(dp->port_mutex)
1994 {
1995 struct dp_netdev_port *port;
1996
1997 HMAP_FOR_EACH (port, node, &dp->ports) {
1998 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1999 *portp = port;
2000 return 0;
2001 }
2002 }
2003
2004 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2005 * non-existent port. */
2006 return ENODEV;
2007 }
2008
2009 /* Returns 'true' if 'dp' has a port backed by a pmd netdev. */
2010 static bool
2011 has_pmd_port(struct dp_netdev *dp)
2012 OVS_REQUIRES(dp->port_mutex)
2013 {
2014 struct dp_netdev_port *port;
2015
2016 HMAP_FOR_EACH (port, node, &dp->ports) {
2017 if (netdev_is_pmd(port->netdev)) {
2018 return true;
2019 }
2020 }
2021
2022 return false;
2023 }
2024
2025 static void
2026 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2027 OVS_REQUIRES(dp->port_mutex)
2028 {
2029 hmap_remove(&dp->ports, &port->node);
2030 seq_change(dp->port_seq);
2031
2032 reconfigure_datapath(dp);
2033
2034 port_destroy(port);
2035 }
2036
2037 static void
2038 answer_port_query(const struct dp_netdev_port *port,
2039 struct dpif_port *dpif_port)
2040 {
2041 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2042 dpif_port->type = xstrdup(port->type);
2043 dpif_port->port_no = port->port_no;
2044 }
2045
2046 static int
2047 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2048 struct dpif_port *dpif_port)
2049 {
2050 struct dp_netdev *dp = get_dp_netdev(dpif);
2051 struct dp_netdev_port *port;
2052 int error;
2053
2054 ovs_mutex_lock(&dp->port_mutex);
2055 error = get_port_by_number(dp, port_no, &port);
2056 if (!error && dpif_port) {
2057 answer_port_query(port, dpif_port);
2058 }
2059 ovs_mutex_unlock(&dp->port_mutex);
2060
2061 return error;
2062 }
2063
2064 static int
2065 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2066 struct dpif_port *dpif_port)
2067 {
2068 struct dp_netdev *dp = get_dp_netdev(dpif);
2069 struct dp_netdev_port *port;
2070 int error;
2071
2072 ovs_mutex_lock(&dp->port_mutex);
2073 error = get_port_by_name(dp, devname, &port);
2074 if (!error && dpif_port) {
2075 answer_port_query(port, dpif_port);
2076 }
2077 ovs_mutex_unlock(&dp->port_mutex);
2078
2079 return error;
2080 }
2081
2082 static void
2083 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2084 {
2085 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2086 free(flow);
2087 }
2088
2089 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2090 {
2091 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2092 ovsrcu_postpone(dp_netdev_flow_free, flow);
2093 }
2094 }
2095
2096 static uint32_t
2097 dp_netdev_flow_hash(const ovs_u128 *ufid)
2098 {
2099 return ufid->u32[0];
2100 }
2101
2102 static inline struct dpcls *
2103 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2104 odp_port_t in_port)
2105 {
2106 struct dpcls *cls;
2107 uint32_t hash = hash_port_no(in_port);
2108 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2109 if (cls->in_port == in_port) {
2110 /* Port classifier exists already */
2111 return cls;
2112 }
2113 }
2114 return NULL;
2115 }
2116
2117 static inline struct dpcls *
2118 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2119 odp_port_t in_port)
2120 OVS_REQUIRES(pmd->flow_mutex)
2121 {
2122 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2123 uint32_t hash = hash_port_no(in_port);
2124
2125 if (!cls) {
2126 /* Create new classifier for in_port */
2127 cls = xmalloc(sizeof(*cls));
2128 dpcls_init(cls);
2129 cls->in_port = in_port;
2130 cmap_insert(&pmd->classifiers, &cls->node, hash);
2131 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2132 }
2133 return cls;
2134 }
2135
2136 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2137 #define INVALID_FLOW_MARK (UINT32_MAX)
2138
2139 struct megaflow_to_mark_data {
2140 const struct cmap_node node;
2141 ovs_u128 mega_ufid;
2142 uint32_t mark;
2143 };
2144
2145 struct flow_mark {
2146 struct cmap megaflow_to_mark;
2147 struct cmap mark_to_flow;
2148 struct id_pool *pool;
2149 };
2150
2151 static struct flow_mark flow_mark = {
2152 .megaflow_to_mark = CMAP_INITIALIZER,
2153 .mark_to_flow = CMAP_INITIALIZER,
2154 };
2155
2156 static uint32_t
2157 flow_mark_alloc(void)
2158 {
2159 uint32_t mark;
2160
2161 if (!flow_mark.pool) {
2162 /* Not initialized yet; do it here. */
2163 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2164 }
2165
2166 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2167 return mark;
2168 }
2169
2170 return INVALID_FLOW_MARK;
2171 }
2172
2173 static void
2174 flow_mark_free(uint32_t mark)
2175 {
2176 id_pool_free_id(flow_mark.pool, mark);
2177 }
2178
2179 /* Associates a megaflow with a mark (a 1:1 mapping). */
2180 static void
2181 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2182 {
2183 size_t hash = dp_netdev_flow_hash(mega_ufid);
2184 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2185
2186 data->mega_ufid = *mega_ufid;
2187 data->mark = mark;
2188
2189 cmap_insert(&flow_mark.megaflow_to_mark,
2190 CONST_CAST(struct cmap_node *, &data->node), hash);
2191 }
2192
2193 /* Disassociates a megaflow from its mark. */
2194 static void
2195 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2196 {
2197 size_t hash = dp_netdev_flow_hash(mega_ufid);
2198 struct megaflow_to_mark_data *data;
2199
2200 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2201 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2202 cmap_remove(&flow_mark.megaflow_to_mark,
2203 CONST_CAST(struct cmap_node *, &data->node), hash);
2204 ovsrcu_postpone(free, data);
2205 return;
2206 }
2207 }
2208
2209 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2210 UUID_ARGS((struct uuid *)mega_ufid));
2211 }
2212
2213 static inline uint32_t
2214 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2215 {
2216 size_t hash = dp_netdev_flow_hash(mega_ufid);
2217 struct megaflow_to_mark_data *data;
2218
2219 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2220 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2221 return data->mark;
2222 }
2223 }
2224
2225 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2226 UUID_ARGS((struct uuid *)mega_ufid));
2227 return INVALID_FLOW_MARK;
2228 }
2229
2230 /* Associates a mark with a flow (a 1:N mapping). */
2231 static void
2232 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2233 {
2234 dp_netdev_flow_ref(flow);
2235
2236 cmap_insert(&flow_mark.mark_to_flow,
2237 CONST_CAST(struct cmap_node *, &flow->mark_node),
2238 hash_int(mark, 0));
2239 flow->mark = mark;
2240
2241 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2242 }
2243
2244 static bool
2245 flow_mark_has_no_ref(uint32_t mark)
2246 {
2247 struct dp_netdev_flow *flow;
2248
2249 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2250 &flow_mark.mark_to_flow) {
2251 if (flow->mark == mark) {
2252 return false;
2253 }
2254 }
2255
2256 return true;
2257 }
2258
2259 static int
2260 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2261 struct dp_netdev_flow *flow)
2262 {
2263 int ret = 0;
2264 uint32_t mark = flow->mark;
2265 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2266 &flow->mark_node);
2267
2268 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2269 flow->mark = INVALID_FLOW_MARK;
2270
2271 /*
2272 * If no flow references the mark any more, remove the flow
2273 * from hardware and free the mark.
2274 */
2275 if (flow_mark_has_no_ref(mark)) {
2276 struct dp_netdev_port *port;
2277 odp_port_t in_port = flow->flow.in_port.odp_port;
2278
2279 ovs_mutex_lock(&pmd->dp->port_mutex);
2280 port = dp_netdev_lookup_port(pmd->dp, in_port);
2281 if (port) {
2282 ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2283 }
2284 ovs_mutex_unlock(&pmd->dp->port_mutex);
2285
2286 flow_mark_free(mark);
2287 VLOG_DBG("Freed flow mark %u\n", mark);
2288
2289 megaflow_to_mark_disassociate(&flow->mega_ufid);
2290 }
2291 dp_netdev_flow_unref(flow);
2292
2293 return ret;
2294 }
2295
2296 static void
2297 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2298 {
2299 struct dp_netdev_flow *flow;
2300
2301 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2302 if (flow->pmd_id == pmd->core_id) {
2303 queue_netdev_flow_del(pmd, flow);
2304 }
2305 }
2306 }
2307
2308 static struct dp_netdev_flow *
2309 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2310 const uint32_t mark)
2311 {
2312 struct dp_netdev_flow *flow;
2313
2314 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2315 &flow_mark.mark_to_flow) {
2316 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2317 flow->dead == false) {
2318 return flow;
2319 }
2320 }
2321
2322 return NULL;
2323 }
2324
2325 static struct dp_flow_offload_item *
2326 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2327 struct dp_netdev_flow *flow,
2328 int op)
2329 {
2330 struct dp_flow_offload_item *offload;
2331
2332 offload = xzalloc(sizeof(*offload));
2333 offload->pmd = pmd;
2334 offload->flow = flow;
2335 offload->op = op;
2336
2337 dp_netdev_flow_ref(flow);
2338 dp_netdev_pmd_try_ref(pmd);
2339
2340 return offload;
2341 }
2342
2343 static void
2344 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2345 {
2346 dp_netdev_pmd_unref(offload->pmd);
2347 dp_netdev_flow_unref(offload->flow);
2348
2349 free(offload->actions);
2350 free(offload);
2351 }
2352
2353 static void
2354 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2355 {
2356 ovs_mutex_lock(&dp_flow_offload.mutex);
2357 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2358 xpthread_cond_signal(&dp_flow_offload.cond);
2359 ovs_mutex_unlock(&dp_flow_offload.mutex);
2360 }
2361
2362 static int
2363 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2364 {
2365 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2366 }
2367
2368 /*
2369 * There are two flow offload operations here: addition and modification.
2370 *
2371 * For flow addition, this function does:
2372 * - allocate a new flow mark id
2373 * - perform hardware flow offload
2374 * - associate the flow mark with flow and mega flow
2375 *
2376 * For flow modification, both the flow mark and the associations are still
2377 * valid, so only the second step (hardware flow offload) is needed.
2378 */
2379 static int
2380 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2381 {
2382 struct dp_netdev_port *port;
2383 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2384 struct dp_netdev_flow *flow = offload->flow;
2385 odp_port_t in_port = flow->flow.in_port.odp_port;
2386 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2387 struct offload_info info;
2388 uint32_t mark;
2389 int ret;
2390
2391 if (flow->dead) {
2392 return -1;
2393 }
2394
2395 if (modification) {
2396 mark = flow->mark;
2397 ovs_assert(mark != INVALID_FLOW_MARK);
2398 } else {
2399 /*
2400 * If a mega flow has already been offloaded (from other PMD
2401 * instances), do not offload it again.
2402 */
2403 mark = megaflow_to_mark_find(&flow->mega_ufid);
2404 if (mark != INVALID_FLOW_MARK) {
2405 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2406 if (flow->mark != INVALID_FLOW_MARK) {
2407 ovs_assert(flow->mark == mark);
2408 } else {
2409 mark_to_flow_associate(mark, flow);
2410 }
2411 return 0;
2412 }
2413
2414 mark = flow_mark_alloc();
2415 if (mark == INVALID_FLOW_MARK) {
2416 VLOG_ERR("Failed to allocate flow mark!\n");
2417 }
2418 }
2419 info.flow_mark = mark;
2420
2421 ovs_mutex_lock(&pmd->dp->port_mutex);
2422 port = dp_netdev_lookup_port(pmd->dp, in_port);
2423 if (!port || netdev_vport_is_vport_class(port->netdev->netdev_class)) {
2424 ovs_mutex_unlock(&pmd->dp->port_mutex);
2425 goto err_free;
2426 }
2427 ret = netdev_flow_put(port->netdev, &offload->match,
2428 CONST_CAST(struct nlattr *, offload->actions),
2429 offload->actions_len, &flow->mega_ufid, &info,
2430 NULL);
2431 ovs_mutex_unlock(&pmd->dp->port_mutex);
2432
2433 if (ret) {
2434 goto err_free;
2435 }
2436
2437 if (!modification) {
2438 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2439 mark_to_flow_associate(mark, flow);
2440 }
2441 return 0;
2442
2443 err_free:
2444 if (!modification) {
2445 flow_mark_free(mark);
2446 } else {
2447 mark_to_flow_disassociate(pmd, flow);
2448 }
2449 return -1;
2450 }
2451
2452 static void *
2453 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2454 {
2455 struct dp_flow_offload_item *offload;
2456 struct ovs_list *list;
2457 const char *op;
2458 int ret;
2459
2460 for (;;) {
2461 ovs_mutex_lock(&dp_flow_offload.mutex);
2462 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2463 ovsrcu_quiesce_start();
2464 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2465 &dp_flow_offload.mutex);
2466 ovsrcu_quiesce_end();
2467 }
2468 list = ovs_list_pop_front(&dp_flow_offload.list);
2469 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2470 ovs_mutex_unlock(&dp_flow_offload.mutex);
2471
2472 switch (offload->op) {
2473 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2474 op = "add";
2475 ret = dp_netdev_flow_offload_put(offload);
2476 break;
2477 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2478 op = "modify";
2479 ret = dp_netdev_flow_offload_put(offload);
2480 break;
2481 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2482 op = "delete";
2483 ret = dp_netdev_flow_offload_del(offload);
2484 break;
2485 default:
2486 OVS_NOT_REACHED();
2487 }
2488
2489 VLOG_DBG("%s to %s netdev flow\n",
2490 ret == 0 ? "succeeded" : "failed", op);
2491 dp_netdev_free_flow_offload(offload);
2492 }
2493
2494 return NULL;
2495 }
2496
2497 static void
2498 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2499 struct dp_netdev_flow *flow)
2500 {
2501 struct dp_flow_offload_item *offload;
2502
2503 if (ovsthread_once_start(&offload_thread_once)) {
2504 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2505 ovs_thread_create("dp_netdev_flow_offload",
2506 dp_netdev_flow_offload_main, NULL);
2507 ovsthread_once_done(&offload_thread_once);
2508 }
2509
2510 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2511 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2512 dp_netdev_append_flow_offload(offload);
2513 }
2514
2515 static void
2516 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2517 struct dp_netdev_flow *flow, struct match *match,
2518 const struct nlattr *actions, size_t actions_len)
2519 {
2520 struct dp_flow_offload_item *offload;
2521 int op;
2522
2523 if (!netdev_is_flow_api_enabled()) {
2524 return;
2525 }
2526
2527 if (ovsthread_once_start(&offload_thread_once)) {
2528 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2529 ovs_thread_create("dp_netdev_flow_offload",
2530 dp_netdev_flow_offload_main, NULL);
2531 ovsthread_once_done(&offload_thread_once);
2532 }
2533
2534 if (flow->mark != INVALID_FLOW_MARK) {
2535 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2536 } else {
2537 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2538 }
2539 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2540 offload->match = *match;
2541 offload->actions = xmalloc(actions_len);
2542 memcpy(offload->actions, actions, actions_len);
2543 offload->actions_len = actions_len;
2544
2545 dp_netdev_append_flow_offload(offload);
2546 }
2547
2548 static void
2549 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2550 struct dp_netdev_flow *flow)
2551 OVS_REQUIRES(pmd->flow_mutex)
2552 {
2553 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2554 struct dpcls *cls;
2555 odp_port_t in_port = flow->flow.in_port.odp_port;
2556
2557 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2558 ovs_assert(cls != NULL);
2559 dpcls_remove(cls, &flow->cr);
2560 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2561 if (flow->mark != INVALID_FLOW_MARK) {
2562 queue_netdev_flow_del(pmd, flow);
2563 }
2564 flow->dead = true;
2565
2566 dp_netdev_flow_unref(flow);
2567 }
2568
2569 static void
2570 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2571 {
2572 struct dp_netdev_flow *netdev_flow;
2573
2574 ovs_mutex_lock(&pmd->flow_mutex);
2575 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2576 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2577 }
2578 ovs_mutex_unlock(&pmd->flow_mutex);
2579 }
2580
2581 static int
2582 dpif_netdev_flow_flush(struct dpif *dpif)
2583 {
2584 struct dp_netdev *dp = get_dp_netdev(dpif);
2585 struct dp_netdev_pmd_thread *pmd;
2586
2587 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2588 dp_netdev_pmd_flow_flush(pmd);
2589 }
2590
2591 return 0;
2592 }
2593
2594 struct dp_netdev_port_state {
2595 struct hmap_position position;
2596 char *name;
2597 };
2598
2599 static int
2600 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2601 {
2602 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2603 return 0;
2604 }
2605
2606 static int
2607 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2608 struct dpif_port *dpif_port)
2609 {
2610 struct dp_netdev_port_state *state = state_;
2611 struct dp_netdev *dp = get_dp_netdev(dpif);
2612 struct hmap_node *node;
2613 int retval;
2614
2615 ovs_mutex_lock(&dp->port_mutex);
2616 node = hmap_at_position(&dp->ports, &state->position);
2617 if (node) {
2618 struct dp_netdev_port *port;
2619
2620 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2621
2622 free(state->name);
2623 state->name = xstrdup(netdev_get_name(port->netdev));
2624 dpif_port->name = state->name;
2625 dpif_port->type = port->type;
2626 dpif_port->port_no = port->port_no;
2627
2628 retval = 0;
2629 } else {
2630 retval = EOF;
2631 }
2632 ovs_mutex_unlock(&dp->port_mutex);
2633
2634 return retval;
2635 }
2636
2637 static int
2638 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2639 {
2640 struct dp_netdev_port_state *state = state_;
2641 free(state->name);
2642 free(state);
2643 return 0;
2644 }
2645
2646 static int
2647 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2648 {
2649 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2650 uint64_t new_port_seq;
2651 int error;
2652
2653 new_port_seq = seq_read(dpif->dp->port_seq);
2654 if (dpif->last_port_seq != new_port_seq) {
2655 dpif->last_port_seq = new_port_seq;
2656 error = ENOBUFS;
2657 } else {
2658 error = EAGAIN;
2659 }
2660
2661 return error;
2662 }
2663
2664 static void
2665 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2666 {
2667 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2668
2669 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2670 }
2671
2672 static struct dp_netdev_flow *
2673 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2674 {
2675 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2676 }
2677
2678 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2679 {
2680 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2681 }
2682
2683 /* netdev_flow_key utilities.
2684 *
2685 * netdev_flow_key is basically a miniflow. We use these functions
2686 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2687 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2688 *
2689 * - Since we are dealing exclusively with miniflows created by
2690 * miniflow_extract(), if the map is different the miniflow is different.
2691 * Therefore we can be faster by comparing the map and the miniflow in a
2692 * single memcmp().
2693 * - These functions can be inlined by the compiler. */
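
/* Simplified sketch of the layout these helpers rely on (field comments are
 * paraphrased; FLOW_MAX_PACKET_U64S bounds the inline value storage):
 *
 *     struct netdev_flow_key {
 *         uint32_t hash;                        flow or mask hash
 *         uint32_t len;                         bytes of 'mf' plus its values
 *         struct miniflow mf;                   flowmap, values follow inline
 *         uint64_t buf[FLOW_MAX_PACKET_U64S];   storage for those values
 *     };
 *
 * Because the packed values follow 'mf' contiguously, comparing or copying
 * 'len' bytes starting at 'mf' covers both the map and the values. */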
2694
2695 /* Given the number of bits set in miniflow's maps, returns the size of the
2696 * 'netdev_flow_key.mf' */
2697 static inline size_t
2698 netdev_flow_key_size(size_t flow_u64s)
2699 {
2700 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2701 }
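
/* Worked example (assuming MINIFLOW_VALUES_SIZE() counts 8-byte values): a
 * miniflow with 5 bits set in its maps packs 5 uint64_t values, so
 * netdev_flow_key_size(5) is sizeof(struct miniflow) + 5 * 8 bytes. */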
2702
2703 static inline bool
2704 netdev_flow_key_equal(const struct netdev_flow_key *a,
2705 const struct netdev_flow_key *b)
2706 {
2707 /* 'b->len' may not be set yet. */
2708 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2709 }
2710
2711 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2712 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2713 * generated by miniflow_extract. */
2714 static inline bool
2715 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2716 const struct miniflow *mf)
2717 {
2718 return !memcmp(&key->mf, mf, key->len);
2719 }
2720
2721 static inline void
2722 netdev_flow_key_clone(struct netdev_flow_key *dst,
2723 const struct netdev_flow_key *src)
2724 {
2725 memcpy(dst, src,
2726 offsetof(struct netdev_flow_key, mf) + src->len);
2727 }
2728
2729 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2730 static inline void
2731 netdev_flow_mask_init(struct netdev_flow_key *mask,
2732 const struct match *match)
2733 {
2734 uint64_t *dst = miniflow_values(&mask->mf);
2735 struct flowmap fmap;
2736 uint32_t hash = 0;
2737 size_t idx;
2738
2739 /* Only check masks that make sense for the flow. */
2740 flow_wc_map(&match->flow, &fmap);
2741 flowmap_init(&mask->mf.map);
2742
2743 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2744 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2745
2746 if (mask_u64) {
2747 flowmap_set(&mask->mf.map, idx, 1);
2748 *dst++ = mask_u64;
2749 hash = hash_add64(hash, mask_u64);
2750 }
2751 }
2752
2753 map_t map;
2754
2755 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2756 hash = hash_add64(hash, map);
2757 }
2758
2759 size_t n = dst - miniflow_get_values(&mask->mf);
2760
2761 mask->hash = hash_finish(hash, n * 8);
2762 mask->len = netdev_flow_key_size(n);
2763 }
2764
2765 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2766 static inline void
2767 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2768 const struct flow *flow,
2769 const struct netdev_flow_key *mask)
2770 {
2771 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2772 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2773 uint32_t hash = 0;
2774 uint64_t value;
2775
2776 dst->len = mask->len;
2777 dst->mf = mask->mf; /* Copy maps. */
2778
2779 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2780 *dst_u64 = value & *mask_u64++;
2781 hash = hash_add64(hash, *dst_u64++);
2782 }
2783 dst->hash = hash_finish(hash,
2784 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2785 }
2786
2787 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
2788 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
2789 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
2790
2791 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2792 * 'mask'. */
2793 static inline uint32_t
2794 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2795 const struct netdev_flow_key *mask)
2796 {
2797 const uint64_t *p = miniflow_get_values(&mask->mf);
2798 uint32_t hash = 0;
2799 uint64_t value;
2800
2801 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2802 hash = hash_add64(hash, value & *p++);
2803 }
2804
2805 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2806 }
2807
2808 static inline bool
2809 emc_entry_alive(struct emc_entry *ce)
2810 {
2811 return ce->flow && !ce->flow->dead;
2812 }
2813
2814 static void
2815 emc_clear_entry(struct emc_entry *ce)
2816 {
2817 if (ce->flow) {
2818 dp_netdev_flow_unref(ce->flow);
2819 ce->flow = NULL;
2820 }
2821 }
2822
2823 static inline void
2824 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2825 const struct netdev_flow_key *key)
2826 {
2827 if (ce->flow != flow) {
2828 if (ce->flow) {
2829 dp_netdev_flow_unref(ce->flow);
2830 }
2831
2832 if (dp_netdev_flow_ref(flow)) {
2833 ce->flow = flow;
2834 } else {
2835 ce->flow = NULL;
2836 }
2837 }
2838 if (key) {
2839 netdev_flow_key_clone(&ce->key, key);
2840 }
2841 }
2842
2843 static inline void
2844 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2845 struct dp_netdev_flow *flow)
2846 {
2847 struct emc_entry *to_be_replaced = NULL;
2848 struct emc_entry *current_entry;
2849
2850 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2851 if (netdev_flow_key_equal(&current_entry->key, key)) {
2852 /* We found the entry with the 'mf' miniflow */
2853 emc_change_entry(current_entry, flow, NULL);
2854 return;
2855 }
2856
2857 /* Replacement policy: put the flow in an empty (not alive) entry, or,
2858 * failing that, in the probed entry with the lowest key hash. */
2859 if (!to_be_replaced
2860 || (emc_entry_alive(to_be_replaced)
2861 && !emc_entry_alive(current_entry))
2862 || current_entry->key.hash < to_be_replaced->key.hash) {
2863 to_be_replaced = current_entry;
2864 }
2865 }
2866 /* We didn't find the miniflow in the cache.
2867 * The 'to_be_replaced' entry is where the new flow will be stored */
2868
2869 emc_change_entry(to_be_replaced, flow, key);
2870 }
2871
2872 static inline void
2873 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2874 const struct netdev_flow_key *key,
2875 struct dp_netdev_flow *flow)
2876 {
2877 /* Insert an entry into the EMC based on probability value 'min'. By
2878 * default the value is UINT32_MAX / 100, which yields an insertion
2879 * probability of 1/100, i.e. 1%. */
2880
2881 uint32_t min = pmd->ctx.emc_insert_min;
2882
2883 if (min && random_uint32() <= min) {
2884 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2885 }
2886 }
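
/* Worked example: for a target insertion probability of 1/N the threshold is
 * UINT32_MAX / N, so a uniform random_uint32() lands at or below it roughly
 * once every N calls.  With the default N = 100:
 *
 *     uint32_t min = UINT32_MAX / 100;                   about 42949672
 *     bool insert = min && random_uint32() <= min;       true ~1% of the time
 *
 * Setting 'min' to zero disables EMC insertion entirely. */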
2887
2888 static inline struct dp_netdev_flow *
2889 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2890 {
2891 struct emc_entry *current_entry;
2892
2893 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2894 if (current_entry->key.hash == key->hash
2895 && emc_entry_alive(current_entry)
2896 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2897
2898 /* We found the entry with the 'key->mf' miniflow */
2899 return current_entry->flow;
2900 }
2901 }
2902
2903 return NULL;
2904 }
2905
2906 static inline const struct cmap_node *
2907 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2908 {
2909 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2910 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2911 uint16_t sig = hash >> 16;
2912 uint16_t index = UINT16_MAX;
2913
2914 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2915 if (bucket->sig[i] == sig) {
2916 index = bucket->flow_idx[i];
2917 break;
2918 }
2919 }
2920 if (index != UINT16_MAX) {
2921 return cmap_find_by_index(&pmd->flow_table, index);
2922 }
2923 return NULL;
2924 }
2925
2926 static void
2927 smc_clear_entry(struct smc_bucket *b, int idx)
2928 {
2929 b->flow_idx[idx] = UINT16_MAX;
2930 }
2931
2932 /* Insert the flow_table index into the SMC. Insertion may fail when 1) the
2933 * SMC is turned off, or 2) the flow_table index is larger than a uint16_t
2934 * can hold. If an SMC entry with the same signature already exists, its
2935 * index is updated. Otherwise, if an empty entry is available, that entry
2936 * is taken. If there is neither an empty entry nor one with a matching
2937 * signature, a random entry in the hashed bucket is evicted. */
2938 static inline void
2939 smc_insert(struct dp_netdev_pmd_thread *pmd,
2940 const struct netdev_flow_key *key,
2941 uint32_t hash)
2942 {
2943 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2944 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2945 uint16_t index;
2946 uint32_t cmap_index;
2947 bool smc_enable_db;
2948 int i;
2949
2950 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2951 if (!smc_enable_db) {
2952 return;
2953 }
2954
2955 cmap_index = cmap_find_index(&pmd->flow_table, hash);
2956 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2957
2958 /* If the index is larger than the SMC can handle (uint16_t), don't
2959 * insert. */
2960 if (index == UINT16_MAX) {
2961 return;
2962 }
2963
2964 /* If an entry with same signature already exists, update the index */
2965 uint16_t sig = key->hash >> 16;
2966 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2967 if (bucket->sig[i] == sig) {
2968 bucket->flow_idx[i] = index;
2969 return;
2970 }
2971 }
2972 /* If there is an empty entry, occupy it. */
2973 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2974 if (bucket->flow_idx[i] == UINT16_MAX) {
2975 bucket->sig[i] = sig;
2976 bucket->flow_idx[i] = index;
2977 return;
2978 }
2979 }
2980 /* Otherwise, pick a random entry. */
2981 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2982 bucket->sig[i] = sig;
2983 bucket->flow_idx[i] = index;
2984 }
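
/* Illustrative breakdown of how a packet hash indexes the SMC (the hash
 * value is made up):
 *
 *     uint32_t hash = 0xdeadbeef;
 *     struct smc_bucket *b = &cache->buckets[hash & SMC_MASK];
 *     uint16_t sig = hash >> 16;                          0xdead
 *
 * A probe touches a single bucket and compares SMC_ENTRY_PER_BUCKET 16-bit
 * signatures; on a match the flow is fetched from 'flow_table' by the stored
 * index via cmap_find_by_index(). */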
2985
2986 static struct dp_netdev_flow *
2987 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2988 const struct netdev_flow_key *key,
2989 int *lookup_num_p)
2990 {
2991 struct dpcls *cls;
2992 struct dpcls_rule *rule;
2993 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2994 in_port.odp_port));
2995 struct dp_netdev_flow *netdev_flow = NULL;
2996
2997 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2998 if (OVS_LIKELY(cls)) {
2999 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3000 netdev_flow = dp_netdev_flow_cast(rule);
3001 }
3002 return netdev_flow;
3003 }
3004
3005 static struct dp_netdev_flow *
3006 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3007 const ovs_u128 *ufidp, const struct nlattr *key,
3008 size_t key_len)
3009 {
3010 struct dp_netdev_flow *netdev_flow;
3011 struct flow flow;
3012 ovs_u128 ufid;
3013
3014 /* If a UFID is not provided, determine one based on the key. */
3015 if (!ufidp && key && key_len
3016 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3017 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
3018 ufidp = &ufid;
3019 }
3020
3021 if (ufidp) {
3022 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3023 &pmd->flow_table) {
3024 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3025 return netdev_flow;
3026 }
3027 }
3028 }
3029
3030 return NULL;
3031 }
3032
3033 static void
3034 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
3035 struct dpif_flow_stats *stats)
3036 {
3037 struct dp_netdev_flow *netdev_flow;
3038 unsigned long long n;
3039 long long used;
3040 uint16_t flags;
3041
3042 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3043
3044 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3045 stats->n_packets = n;
3046 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3047 stats->n_bytes = n;
3048 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3049 stats->used = used;
3050 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3051 stats->tcp_flags = flags;
3052 }
3053
3054 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3055 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3056 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3057 * protect them. */
3058 static void
3059 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
3060 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3061 struct dpif_flow *flow, bool terse)
3062 {
3063 if (terse) {
3064 memset(flow, 0, sizeof *flow);
3065 } else {
3066 struct flow_wildcards wc;
3067 struct dp_netdev_actions *actions;
3068 size_t offset;
3069 struct odp_flow_key_parms odp_parms = {
3070 .flow = &netdev_flow->flow,
3071 .mask = &wc.masks,
3072 .support = dp_netdev_support,
3073 };
3074
3075 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3076 /* in_port is exact matched, but we have left it out of the mask for
3077 * optimization reasons. Add in_port back to the mask. */
3078 wc.masks.in_port.odp_port = ODPP_NONE;
3079
3080 /* Key */
3081 offset = key_buf->size;
3082 flow->key = ofpbuf_tail(key_buf);
3083 odp_flow_key_from_flow(&odp_parms, key_buf);
3084 flow->key_len = key_buf->size - offset;
3085
3086 /* Mask */
3087 offset = mask_buf->size;
3088 flow->mask = ofpbuf_tail(mask_buf);
3089 odp_parms.key_buf = key_buf;
3090 odp_flow_key_from_mask(&odp_parms, mask_buf);
3091 flow->mask_len = mask_buf->size - offset;
3092
3093 /* Actions */
3094 actions = dp_netdev_flow_get_actions(netdev_flow);
3095 flow->actions = actions->actions;
3096 flow->actions_len = actions->size;
3097 }
3098
3099 flow->ufid = netdev_flow->ufid;
3100 flow->ufid_present = true;
3101 flow->pmd_id = netdev_flow->pmd_id;
3102 get_dpif_flow_stats(netdev_flow, &flow->stats);
3103
3104 flow->attrs.offloaded = false;
3105 flow->attrs.dp_layer = "ovs";
3106 }
3107
3108 static int
3109 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3110 const struct nlattr *mask_key,
3111 uint32_t mask_key_len, const struct flow *flow,
3112 struct flow_wildcards *wc, bool probe)
3113 {
3114 enum odp_key_fitness fitness;
3115
3116 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3117 if (fitness) {
3118 if (!probe) {
3119 /* This should not happen: it indicates that
3120 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3121 * disagree on the acceptable form of a mask. Log the problem
3122 * as an error, with enough details to enable debugging. */
3123 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3124
3125 if (!VLOG_DROP_ERR(&rl)) {
3126 struct ds s;
3127
3128 ds_init(&s);
3129 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3130 true);
3131 VLOG_ERR("internal error parsing flow mask %s (%s)",
3132 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3133 ds_destroy(&s);
3134 }
3135 }
3136
3137 return EINVAL;
3138 }
3139
3140 return 0;
3141 }
3142
3143 static int
3144 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3145 struct flow *flow, bool probe)
3146 {
3147 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3148 if (!probe) {
3149 /* This should not happen: it indicates that
3150 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3151 * the acceptable form of a flow. Log the problem as an error,
3152 * with enough details to enable debugging. */
3153 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3154
3155 if (!VLOG_DROP_ERR(&rl)) {
3156 struct ds s;
3157
3158 ds_init(&s);
3159 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3160 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3161 ds_destroy(&s);
3162 }
3163 }
3164
3165 return EINVAL;
3166 }
3167
3168 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3169 return EINVAL;
3170 }
3171
3172 return 0;
3173 }
3174
3175 static int
3176 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3177 {
3178 struct dp_netdev *dp = get_dp_netdev(dpif);
3179 struct dp_netdev_flow *netdev_flow;
3180 struct dp_netdev_pmd_thread *pmd;
3181 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3182 struct hmapx_node *node;
3183 int error = EINVAL;
3184
3185 if (get->pmd_id == PMD_ID_NULL) {
3186 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3187 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3188 dp_netdev_pmd_unref(pmd);
3189 }
3190 }
3191 } else {
3192 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3193 if (!pmd) {
3194 goto out;
3195 }
3196 hmapx_add(&to_find, pmd);
3197 }
3198
3199 if (!hmapx_count(&to_find)) {
3200 goto out;
3201 }
3202
3203 HMAPX_FOR_EACH (node, &to_find) {
3204 pmd = (struct dp_netdev_pmd_thread *) node->data;
3205 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3206 get->key_len);
3207 if (netdev_flow) {
3208 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
3209 get->flow, false);
3210 error = 0;
3211 break;
3212 } else {
3213 error = ENOENT;
3214 }
3215 }
3216
3217 HMAPX_FOR_EACH (node, &to_find) {
3218 pmd = (struct dp_netdev_pmd_thread *) node->data;
3219 dp_netdev_pmd_unref(pmd);
3220 }
3221 out:
3222 hmapx_destroy(&to_find);
3223 return error;
3224 }
3225
3226 static void
3227 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3228 {
3229 struct flow masked_flow;
3230 size_t i;
3231
3232 for (i = 0; i < sizeof(struct flow); i++) {
3233 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3234 ((uint8_t *)&match->wc)[i];
3235 }
3236 dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
3237 }
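
/* Illustrative property of the masking above (field and values are
 * hypothetical): flows that differ only in wildcarded bits produce identical
 * masked bytes and therefore the same mega ufid, which is what lets them
 * share a single offloaded megaflow and flow mark.
 *
 *     ovs_u128 id_a, id_b;
 *     match.flow.nw_src = htonl(0x0a000001);
 *     dp_netdev_get_mega_ufid(&match, &id_a);
 *     match.flow.nw_src = htonl(0x0a000002);
 *     dp_netdev_get_mega_ufid(&match, &id_b);
 *
 * id_a equals id_b whenever match.wc.masks.nw_src is zero. */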
3238
3239 static struct dp_netdev_flow *
3240 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3241 struct match *match, const ovs_u128 *ufid,
3242 const struct nlattr *actions, size_t actions_len)
3243 OVS_REQUIRES(pmd->flow_mutex)
3244 {
3245 struct dp_netdev_flow *flow;
3246 struct netdev_flow_key mask;
3247 struct dpcls *cls;
3248
3249 /* Make sure in_port is exact matched before we read it. */
3250 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3251 odp_port_t in_port = match->flow.in_port.odp_port;
3252
3253 /* As we select the dpcls based on the port number, each netdev flow
3254 * belonging to the same dpcls will have the same odp_port value.
3255 * For performance reasons we wildcard odp_port here in the mask. In the
3256 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3257 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3258 * will not be part of the subtable mask.
3259 * This will speed up the hash computation during dpcls_lookup() because
3260 * there is one less call to hash_add64() in this case. */
3261 match->wc.masks.in_port.odp_port = 0;
3262 netdev_flow_mask_init(&mask, match);
3263 match->wc.masks.in_port.odp_port = ODPP_NONE;
3264
3265 /* Make sure wc does not have metadata. */
3266 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3267 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3268
3269 /* Do not allocate extra space. */
3270 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3271 memset(&flow->stats, 0, sizeof flow->stats);
3272 flow->dead = false;
3273 flow->batch = NULL;
3274 flow->mark = INVALID_FLOW_MARK;
3275 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3276 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3277 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3278 ovs_refcount_init(&flow->ref_cnt);
3279 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3280
3281 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3282 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3283
3284 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3285 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3286 dpcls_insert(cls, &flow->cr, &mask);
3287
3288 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3289 dp_netdev_flow_hash(&flow->ufid));
3290
3291 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3292
3293 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3294 struct ds ds = DS_EMPTY_INITIALIZER;
3295 struct ofpbuf key_buf, mask_buf;
3296 struct odp_flow_key_parms odp_parms = {
3297 .flow = &match->flow,
3298 .mask = &match->wc.masks,
3299 .support = dp_netdev_support,
3300 };
3301
3302 ofpbuf_init(&key_buf, 0);
3303 ofpbuf_init(&mask_buf, 0);
3304
3305 odp_flow_key_from_flow(&odp_parms, &key_buf);
3306 odp_parms.key_buf = &key_buf;
3307 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3308
3309 ds_put_cstr(&ds, "flow_add: ");
3310 odp_format_ufid(ufid, &ds);
3311 ds_put_cstr(&ds, " ");
3312 odp_flow_format(key_buf.data, key_buf.size,
3313 mask_buf.data, mask_buf.size,
3314 NULL, &ds, false);
3315 ds_put_cstr(&ds, ", actions:");
3316 format_odp_actions(&ds, actions, actions_len, NULL);
3317
3318 VLOG_DBG("%s", ds_cstr(&ds));
3319
3320 ofpbuf_uninit(&key_buf);
3321 ofpbuf_uninit(&mask_buf);
3322
3323 /* Add a printout of the actual match installed. */
3324 struct match m;
3325 ds_clear(&ds);
3326 ds_put_cstr(&ds, "flow match: ");
3327 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3328 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3329 memset(&m.tun_md, 0, sizeof m.tun_md);
3330 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3331
3332 VLOG_DBG("%s", ds_cstr(&ds));
3333
3334 ds_destroy(&ds);
3335 }
3336
3337 return flow;
3338 }
3339
3340 static int
3341 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3342 struct netdev_flow_key *key,
3343 struct match *match,
3344 ovs_u128 *ufid,
3345 const struct dpif_flow_put *put,
3346 struct dpif_flow_stats *stats)
3347 {
3348 struct dp_netdev_flow *netdev_flow;
3349 int error = 0;
3350
3351 if (stats) {
3352 memset(stats, 0, sizeof *stats);
3353 }
3354
3355 ovs_mutex_lock(&pmd->flow_mutex);
3356 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3357 if (!netdev_flow) {
3358 if (put->flags & DPIF_FP_CREATE) {
3359 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
3360 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3361 put->actions_len);
3362 error = 0;
3363 } else {
3364 error = EFBIG;
3365 }
3366 } else {
3367 error = ENOENT;
3368 }
3369 } else {
3370 if (put->flags & DPIF_FP_MODIFY) {
3371 struct dp_netdev_actions *new_actions;
3372 struct dp_netdev_actions *old_actions;
3373
3374 new_actions = dp_netdev_actions_create(put->actions,
3375 put->actions_len);
3376
3377 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3378 ovsrcu_set(&netdev_flow->actions, new_actions);
3379
3380 queue_netdev_flow_put(pmd, netdev_flow, match,
3381 put->actions, put->actions_len);
3382
3383 if (stats) {
3384 get_dpif_flow_stats(netdev_flow, stats);
3385 }
3386 if (put->flags & DPIF_FP_ZERO_STATS) {
3387 /* XXX: The userspace datapath uses thread local statistics
3388 * (for flows), which should be updated only by the owning
3389 * thread. Since we cannot write on stats memory here,
3390 * we choose not to support this flag. Please note:
3391 * - This feature is currently used only by dpctl commands with
3392 * option --clear.
3393 * - Should the need arise, this operation can be implemented
3394 * by keeping a base value (to be updated here) for each
3395 * counter, and subtracting it before outputting the stats. */
3396 error = EOPNOTSUPP;
3397 }
3398
3399 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3400 } else if (put->flags & DPIF_FP_CREATE) {
3401 error = EEXIST;
3402 } else {
3403 /* Overlapping flow. */
3404 error = EINVAL;
3405 }
3406 }
3407 ovs_mutex_unlock(&pmd->flow_mutex);
3408 return error;
3409 }
3410
3411 static int
3412 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3413 {
3414 struct dp_netdev *dp = get_dp_netdev(dpif);
3415 struct netdev_flow_key key, mask;
3416 struct dp_netdev_pmd_thread *pmd;
3417 struct match match;
3418 ovs_u128 ufid;
3419 int error;
3420 bool probe = put->flags & DPIF_FP_PROBE;
3421
3422 if (put->stats) {
3423 memset(put->stats, 0, sizeof *put->stats);
3424 }
3425 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3426 probe);
3427 if (error) {
3428 return error;
3429 }
3430 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3431 put->mask, put->mask_len,
3432 &match.flow, &match.wc, probe);
3433 if (error) {
3434 return error;
3435 }
3436
3437 if (put->ufid) {
3438 ufid = *put->ufid;
3439 } else {
3440 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3441 }
3442
3443 /* The Netlink encoding of datapath flow keys cannot express
3444 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3445 * tag is interpreted as exact match on the fact that there is no
3446 * VLAN. Unless we refactor a lot of code that translates between
3447 * Netlink and struct flow representations, we have to do the same
3448 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3449 if (!match.wc.masks.vlans[0].tci) {
3450 match.wc.masks.vlans[0].tci = htons(0xffff);
3451 }
3452
3453 /* Must produce a netdev_flow_key for lookup.
3454 * Use the same method as employed to create the key when adding
3455 * the flow to the dpcls to make sure they match. */
3456 netdev_flow_mask_init(&mask, &match);
3457 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3458
3459 if (put->pmd_id == PMD_ID_NULL) {
3460 if (cmap_count(&dp->poll_threads) == 0) {
3461 return EINVAL;
3462 }
3463 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3464 struct dpif_flow_stats pmd_stats;
3465 int pmd_error;
3466
3467 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3468 &pmd_stats);
3469 if (pmd_error) {
3470 error = pmd_error;
3471 } else if (put->stats) {
3472 put->stats->n_packets += pmd_stats.n_packets;
3473 put->stats->n_bytes += pmd_stats.n_bytes;
3474 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3475 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3476 }
3477 }
3478 } else {
3479 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3480 if (!pmd) {
3481 return EINVAL;
3482 }
3483 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3484 dp_netdev_pmd_unref(pmd);
3485 }
3486
3487 return error;
3488 }
3489
3490 static int
3491 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3492 struct dpif_flow_stats *stats,
3493 const struct dpif_flow_del *del)
3494 {
3495 struct dp_netdev_flow *netdev_flow;
3496 int error = 0;
3497
3498 ovs_mutex_lock(&pmd->flow_mutex);
3499 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3500 del->key_len);
3501 if (netdev_flow) {
3502 if (stats) {
3503 get_dpif_flow_stats(netdev_flow, stats);
3504 }
3505 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3506 } else {
3507 error = ENOENT;
3508 }
3509 ovs_mutex_unlock(&pmd->flow_mutex);
3510
3511 return error;
3512 }
3513
3514 static int
3515 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3516 {
3517 struct dp_netdev *dp = get_dp_netdev(dpif);
3518 struct dp_netdev_pmd_thread *pmd;
3519 int error = 0;
3520
3521 if (del->stats) {
3522 memset(del->stats, 0, sizeof *del->stats);
3523 }
3524
3525 if (del->pmd_id == PMD_ID_NULL) {
3526 if (cmap_count(&dp->poll_threads) == 0) {
3527 return EINVAL;
3528 }
3529 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3530 struct dpif_flow_stats pmd_stats;
3531 int pmd_error;
3532
3533 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3534 if (pmd_error) {
3535 error = pmd_error;
3536 } else if (del->stats) {
3537 del->stats->n_packets += pmd_stats.n_packets;
3538 del->stats->n_bytes += pmd_stats.n_bytes;
3539 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3540 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3541 }
3542 }
3543 } else {
3544 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3545 if (!pmd) {
3546 return EINVAL;
3547 }
3548 error = flow_del_on_pmd(pmd, del->stats, del);
3549 dp_netdev_pmd_unref(pmd);
3550 }
3551
3552
3553 return error;
3554 }
3555
3556 struct dpif_netdev_flow_dump {
3557 struct dpif_flow_dump up;
3558 struct cmap_position poll_thread_pos;
3559 struct cmap_position flow_pos;
3560 struct dp_netdev_pmd_thread *cur_pmd;
3561 int status;
3562 struct ovs_mutex mutex;
3563 };
3564
3565 static struct dpif_netdev_flow_dump *
3566 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3567 {
3568 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3569 }
3570
3571 static struct dpif_flow_dump *
3572 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3573 struct dpif_flow_dump_types *types OVS_UNUSED)
3574 {
3575 struct dpif_netdev_flow_dump *dump;
3576
3577 dump = xzalloc(sizeof *dump);
3578 dpif_flow_dump_init(&dump->up, dpif_);
3579 dump->up.terse = terse;
3580 ovs_mutex_init(&dump->mutex);
3581
3582 return &dump->up;
3583 }
3584
3585 static int
3586 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3587 {
3588 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3589
3590 ovs_mutex_destroy(&dump->mutex);
3591 free(dump);
3592 return 0;
3593 }
3594
3595 struct dpif_netdev_flow_dump_thread {
3596 struct dpif_flow_dump_thread up;
3597 struct dpif_netdev_flow_dump *dump;
3598 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3599 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3600 };
3601
3602 static struct dpif_netdev_flow_dump_thread *
3603 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3604 {
3605 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3606 }
3607
3608 static struct dpif_flow_dump_thread *
3609 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3610 {
3611 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3612 struct dpif_netdev_flow_dump_thread *thread;
3613
3614 thread = xmalloc(sizeof *thread);
3615 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3616 thread->dump = dump;
3617 return &thread->up;
3618 }
3619
3620 static void
3621 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3622 {
3623 struct dpif_netdev_flow_dump_thread *thread
3624 = dpif_netdev_flow_dump_thread_cast(thread_);
3625
3626 free(thread);
3627 }
3628
3629 static int
3630 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3631 struct dpif_flow *flows, int max_flows)
3632 {
3633 struct dpif_netdev_flow_dump_thread *thread
3634 = dpif_netdev_flow_dump_thread_cast(thread_);
3635 struct dpif_netdev_flow_dump *dump = thread->dump;
3636 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3637 int n_flows = 0;
3638 int i;
3639
3640 ovs_mutex_lock(&dump->mutex);
3641 if (!dump->status) {
3642 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3643 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3644 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3645 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3646
3647             /* First call to dump_next() extracts the first pmd thread.
3648 * If there is no pmd thread, returns immediately. */
3649 if (!pmd) {
3650 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3651 if (!pmd) {
3652 ovs_mutex_unlock(&dump->mutex);
3653 return n_flows;
3654
3655 }
3656 }
3657
3658 do {
3659 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3660 struct cmap_node *node;
3661
3662 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3663 if (!node) {
3664 break;
3665 }
3666 netdev_flows[n_flows] = CONTAINER_OF(node,
3667 struct dp_netdev_flow,
3668 node);
3669 }
3670             /* When the current pmd thread is finished being dumped, move
3671              * on to the next one. */
3672 if (n_flows < flow_limit) {
3673 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3674 dp_netdev_pmd_unref(pmd);
3675 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3676 if (!pmd) {
3677 dump->status = EOF;
3678 break;
3679 }
3680 }
3681             /* Keep the reference for the next caller. */
3682 dump->cur_pmd = pmd;
3683
3684 /* If the current dump is empty, do not exit the loop, since the
3685              * remaining pmds could have flows to be dumped. Just dump again
3686 * on the new 'pmd'. */
3687 } while (!n_flows);
3688 }
3689 ovs_mutex_unlock(&dump->mutex);
3690
3691 for (i = 0; i < n_flows; i++) {
3692 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3693 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3694 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3695 struct dpif_flow *f = &flows[i];
3696 struct ofpbuf key, mask;
3697
3698 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3699 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3700 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
3701 dump->up.terse);
3702 }
3703
3704 return n_flows;
3705 }
3706
3707 static int
3708 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3709 OVS_NO_THREAD_SAFETY_ANALYSIS
3710 {
3711 struct dp_netdev *dp = get_dp_netdev(dpif);
3712 struct dp_netdev_pmd_thread *pmd;
3713 struct dp_packet_batch pp;
3714
3715 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3716 dp_packet_size(execute->packet) > UINT16_MAX) {
3717 return EINVAL;
3718 }
3719
3720 /* Tries finding the 'pmd'. If NULL is returned, that means
3721 * the current thread is a non-pmd thread and should use
3722 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3723 pmd = ovsthread_getspecific(dp->per_pmd_key);
3724 if (!pmd) {
3725 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3726 if (!pmd) {
3727 return EBUSY;
3728 }
3729 }
3730
3731 if (execute->probe) {
3732         /* If this is part of a probe, drop the packet, since executing
3733          * the action may actually cause spurious packets to be sent into
3734 * the network. */
3735 if (pmd->core_id == NON_PMD_CORE_ID) {
3736 dp_netdev_pmd_unref(pmd);
3737 }
3738 return 0;
3739 }
3740
3741     /* If the current thread is a non-pmd thread, acquire
3742 * the 'non_pmd_mutex'. */
3743 if (pmd->core_id == NON_PMD_CORE_ID) {
3744 ovs_mutex_lock(&dp->non_pmd_mutex);
3745 }
3746
3747 /* Update current time in PMD context. We don't care about EMC insertion
3748 * probability, because we are on a slow path. */
3749 pmd_thread_ctx_time_update(pmd);
3750
3751 /* The action processing expects the RSS hash to be valid, because
3752 * it's always initialized at the beginning of datapath processing.
3753 * In this case, though, 'execute->packet' may not have gone through
3754 * the datapath at all, it may have been generated by the upper layer
3755 * (OpenFlow packet-out, BFD frame, ...). */
3756 if (!dp_packet_rss_valid(execute->packet)) {
3757 dp_packet_set_rss_hash(execute->packet,
3758 flow_hash_5tuple(execute->flow, 0));
3759 }
3760
3761 dp_packet_batch_init_packet(&pp, execute->packet);
3762 pp.do_not_steal = true;
3763 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3764 execute->actions, execute->actions_len);
3765 dp_netdev_pmd_flush_output_packets(pmd, true);
3766
3767 if (pmd->core_id == NON_PMD_CORE_ID) {
3768 ovs_mutex_unlock(&dp->non_pmd_mutex);
3769 dp_netdev_pmd_unref(pmd);
3770 }
3771
3772 return 0;
3773 }
3774
3775 static void
3776 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3777 enum dpif_offload_type offload_type OVS_UNUSED)
3778 {
3779 size_t i;
3780
3781 for (i = 0; i < n_ops; i++) {
3782 struct dpif_op *op = ops[i];
3783
3784 switch (op->type) {
3785 case DPIF_OP_FLOW_PUT:
3786 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3787 break;
3788
3789 case DPIF_OP_FLOW_DEL:
3790 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3791 break;
3792
3793 case DPIF_OP_EXECUTE:
3794 op->error = dpif_netdev_execute(dpif, &op->execute);
3795 break;
3796
3797 case DPIF_OP_FLOW_GET:
3798 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3799 break;
3800 }
3801 }
3802 }
3803
3804 /* Enable or disable PMD auto load balancing. */
3805 static void
3806 set_pmd_auto_lb(struct dp_netdev *dp)
3807 {
3808 unsigned int cnt = 0;
3809 struct dp_netdev_pmd_thread *pmd;
3810 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3811
3812 bool enable_alb = false;
3813 bool multi_rxq = false;
3814 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3815
3816     /* Ensure that there are at least 2 non-isolated PMDs and
3817      * that one of them is polling more than one rxq. */
3818 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3819 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3820 continue;
3821 }
3822
3823 if (hmap_count(&pmd->poll_list) > 1) {
3824 multi_rxq = true;
3825 }
3826 if (cnt && multi_rxq) {
3827 enable_alb = true;
3828 break;
3829 }
3830 cnt++;
3831 }
3832
3833     /* Enable auto LB if it is requested and cycle-based assignment is enabled. */
3834 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
3835 pmd_alb->auto_lb_requested;
3836
3837 if (pmd_alb->is_enabled != enable_alb) {
3838 pmd_alb->is_enabled = enable_alb;
3839 if (pmd_alb->is_enabled) {
3840 VLOG_INFO("PMD auto load balance is enabled "
3841 "(with rebalance interval:%"PRIu64" msec)",
3842 pmd_alb->rebalance_intvl);
3843 } else {
3844 pmd_alb->rebalance_poll_timer = 0;
3845 VLOG_INFO("PMD auto load balance is disabled");
3846 }
3847 }
3848
3849 }
3850
3851 /* Applies datapath configuration from the database. Some of the changes are
3852 * actually applied in dpif_netdev_run(). */
3853 static int
3854 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3855 {
3856 struct dp_netdev *dp = get_dp_netdev(dpif);
3857 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3858 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3859 "cycles");
3860 unsigned long long insert_prob =
3861 smap_get_ullong(other_config, "emc-insert-inv-prob",
3862 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3863 uint32_t insert_min, cur_min;
3864 uint32_t tx_flush_interval, cur_tx_flush_interval;
3865 uint64_t rebalance_intvl;
3866
3867 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3868 DEFAULT_TX_FLUSH_INTERVAL);
3869 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3870 if (tx_flush_interval != cur_tx_flush_interval) {
3871 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3872 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3873 tx_flush_interval);
3874 }
3875
3876 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3877 free(dp->pmd_cmask);
3878 dp->pmd_cmask = nullable_xstrdup(cmask);
3879 dp_netdev_request_reconfigure(dp);
3880 }
3881
3882 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3883 if (insert_prob <= UINT32_MAX) {
3884 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3885 } else {
3886 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3887 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3888 }
3889
3890 if (insert_min != cur_min) {
3891 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3892 if (insert_min == 0) {
3893 VLOG_INFO("EMC insertion probability changed to zero");
3894 } else {
3895 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3896 insert_prob, (100 / (float)insert_prob));
3897 }
3898 }
3899
3900 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3901 bool cur_perf_enabled;
3902 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3903 if (perf_enabled != cur_perf_enabled) {
3904 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3905 if (perf_enabled) {
3906 VLOG_INFO("PMD performance metrics collection enabled");
3907 } else {
3908 VLOG_INFO("PMD performance metrics collection disabled");
3909 }
3910 }
3911
3912 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3913 bool cur_smc;
3914 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3915 if (smc_enable != cur_smc) {
3916 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3917 if (smc_enable) {
3918 VLOG_INFO("SMC cache is enabled");
3919 } else {
3920 VLOG_INFO("SMC cache is disabled");
3921 }
3922 }
3923
3924 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
3925 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
3926 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
3927 "Defaulting to 'cycles'.");
3928 pmd_rxq_assign_cyc = true;
3929 pmd_rxq_assign = "cycles";
3930 }
3931 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
3932 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
3933 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
3934 pmd_rxq_assign);
3935 dp_netdev_request_reconfigure(dp);
3936 }
3937
3938 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3939 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
3940 false);
3941
3942 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
3943 ALB_PMD_REBALANCE_POLL_INTERVAL);
3944
3945     /* Input is in minutes; convert it to msec. */
3946 rebalance_intvl =
3947 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
3948
3949 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
3950 pmd_alb->rebalance_intvl = rebalance_intvl;
3951 }
3952
3953 set_pmd_auto_lb(dp);
3954 return 0;
3955 }
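/* A minimal usage sketch of how the keys handled above reach this function,
 * assuming a caller that already holds a valid 'dpif': build an smap of
 * other_config entries and hand it to the set_config hook, e.g.
 *
 *     struct smap cfg = SMAP_INITIALIZER(&cfg);
 *     smap_add(&cfg, "pmd-auto-lb", "true");
 *     smap_add(&cfg, "emc-insert-inv-prob", "20");
 *     dpif_netdev_set_config(dpif, &cfg);
 *     smap_destroy(&cfg);
 *
 * An "emc-insert-inv-prob" of 20 translates to roughly a 5% EMC insertion
 * probability (insert_min = UINT32_MAX / 20). In a running switch these
 * values normally arrive from the Open_vSwitch table's other_config column
 * via the dpif layer. */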
3956
3957 /* Parses affinity list and returns result in 'core_ids'. */
3958 static int
3959 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3960 {
3961 unsigned i;
3962 char *list, *copy, *key, *value;
3963 int error = 0;
3964
3965 for (i = 0; i < n_rxq; i++) {
3966 core_ids[i] = OVS_CORE_UNSPEC;
3967 }
3968
3969 if (!affinity_list) {
3970 return 0;
3971 }
3972
3973 list = copy = xstrdup(affinity_list);
3974
3975 while (ofputil_parse_key_value(&list, &key, &value)) {
3976 int rxq_id, core_id;
3977
3978 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3979 || !str_to_int(value, 0, &core_id) || core_id < 0) {
3980 error = EINVAL;
3981 break;
3982 }
3983
3984 if (rxq_id < n_rxq) {
3985 core_ids[rxq_id] = core_id;
3986 }
3987 }
3988
3989 free(copy);
3990 return error;
3991 }
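/* A small illustrative sketch, assuming the comma-separated "rxq:core" pair
 * syntax parsed above and a port with two rx queues:
 *
 *     unsigned core_ids[2];
 *     int error = parse_affinity_list("0:3,1:7", core_ids, 2);
 *
 * On success 'error' is 0, core_ids[0] == 3 and core_ids[1] == 7; any rx
 * queue not mentioned in the list is left as OVS_CORE_UNSPEC. */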
3992
3993 /* Parses 'affinity_list' and applies configuration if it is valid. */
3994 static int
3995 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3996 const char *affinity_list)
3997 {
3998 unsigned *core_ids, i;
3999 int error = 0;
4000
4001 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4002 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4003 error = EINVAL;
4004 goto exit;
4005 }
4006
4007 for (i = 0; i < port->n_rxq; i++) {
4008 port->rxqs[i].core_id = core_ids[i];
4009 }
4010
4011 exit:
4012 free(core_ids);
4013 return error;
4014 }
4015
4016 /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4017  * of the given PMD thread. */
4018 static bool
4019 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4020 struct dp_netdev_port *port)
4021 OVS_EXCLUDED(pmd->port_mutex)
4022 {
4023 struct rxq_poll *poll;
4024 bool found = false;
4025
4026 ovs_mutex_lock(&pmd->port_mutex);
4027 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4028 if (port == poll->rxq->port) {
4029 found = true;
4030 break;
4031 }
4032 }
4033 ovs_mutex_unlock(&pmd->port_mutex);
4034 return found;
4035 }
4036
4037 /* Updates port configuration from the database. The changes are actually
4038 * applied in dpif_netdev_run(). */
4039 static int
4040 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4041 const struct smap *cfg)
4042 {
4043 struct dp_netdev *dp = get_dp_netdev(dpif);
4044 struct dp_netdev_port *port;
4045 int error = 0;
4046 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4047 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4048
4049 ovs_mutex_lock(&dp->port_mutex);
4050 error = get_port_by_number(dp, port_no, &port);
4051 if (error) {
4052 goto unlock;
4053 }
4054
4055 if (emc_enabled != port->emc_enabled) {
4056 struct dp_netdev_pmd_thread *pmd;
4057 struct ds ds = DS_EMPTY_INITIALIZER;
4058 uint32_t cur_min, insert_prob;
4059
4060 port->emc_enabled = emc_enabled;
4061         /* Mark for reload all the threads that poll this port and request
4062          * a reconfiguration for the actual reloading of threads. */
4063 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4064 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4065 pmd->need_reload = true;
4066 }
4067 }
4068 dp_netdev_request_reconfigure(dp);
4069
4070 ds_put_format(&ds, "%s: EMC has been %s.",
4071 netdev_get_name(port->netdev),
4072 (emc_enabled) ? "enabled" : "disabled");
4073 if (emc_enabled) {
4074 ds_put_cstr(&ds, " Current insertion probability is ");
4075 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4076 if (!cur_min) {
4077 ds_put_cstr(&ds, "zero.");
4078 } else {
4079 insert_prob = UINT32_MAX / cur_min;
4080 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4081 insert_prob, 100 / (float) insert_prob);
4082 }
4083 }
4084 VLOG_INFO("%s", ds_cstr(&ds));
4085 ds_destroy(&ds);
4086 }
4087
4088 /* Checking for RXq affinity changes. */
4089 if (!netdev_is_pmd(port->netdev)
4090 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4091 goto unlock;
4092 }
4093
4094 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4095 if (error) {
4096 goto unlock;
4097 }
4098 free(port->rxq_affinity_list);
4099 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4100
4101 dp_netdev_request_reconfigure(dp);
4102 unlock:
4103 ovs_mutex_unlock(&dp->port_mutex);
4104 return error;
4105 }
4106
4107 static int
4108 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4109 uint32_t queue_id, uint32_t *priority)
4110 {
4111 *priority = queue_id;
4112 return 0;
4113 }
4114
4115 \f
4116 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4117  * a copy of the 'size' bytes of the 'actions' input parameter. */
4118 struct dp_netdev_actions *
4119 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4120 {
4121 struct dp_netdev_actions *netdev_actions;
4122
4123 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4124 memcpy(netdev_actions->actions, actions, size);
4125 netdev_actions->size = size;
4126
4127 return netdev_actions;
4128 }
4129
4130 struct dp_netdev_actions *
4131 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4132 {
4133 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4134 }
4135
4136 static void
4137 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4138 {
4139 free(actions);
4140 }
4141 \f
4142 static void
4143 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4144 enum rxq_cycles_counter_type type,
4145 unsigned long long cycles)
4146 {
4147 atomic_store_relaxed(&rx->cycles[type], cycles);
4148 }
4149
4150 static void
4151 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4152 enum rxq_cycles_counter_type type,
4153 unsigned long long cycles)
4154 {
4155 non_atomic_ullong_add(&rx->cycles[type], cycles);
4156 }
4157
4158 static uint64_t
4159 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4160 enum rxq_cycles_counter_type type)
4161 {
4162 unsigned long long processing_cycles;
4163 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4164 return processing_cycles;
4165 }
4166
4167 static void
4168 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4169 unsigned long long cycles)
4170 {
4171 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4172 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4173 }
4174
4175 static uint64_t
4176 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4177 {
4178 unsigned long long processing_cycles;
4179 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4180 return processing_cycles;
4181 }
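/* A minimal sketch of how the interval ring above is typically consumed:
 * dp_netdev_rxq_set_intrvl_cycles() overwrites the oldest of the
 * PMD_RXQ_INTERVAL_MAX slots in turn, so summing the slots gives the recent
 * cycle history used for scheduling (see rxq_scheduling() and
 * get_dry_run_variance() below):
 *
 *     uint64_t cycle_hist = 0;
 *
 *     for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
 *         cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
 *     }
 *     dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST, cycle_hist);
 */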
4182
4183 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4184 static inline bool
4185 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4186 {
4187 bool pmd_perf_enabled;
4188 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4189 return pmd_perf_enabled;
4190 }
4191 #else
4192 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4193 * performance metrics are not available as locked access to 64 bit
4194 * integers would be prohibitively expensive. */
4195 static inline bool
4196 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4197 {
4198 return false;
4199 }
4200 #endif
4201
4202 static int
4203 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4204 struct tx_port *p)
4205 {
4206 int i;
4207 int tx_qid;
4208 int output_cnt;
4209 bool dynamic_txqs;
4210 struct cycle_timer timer;
4211 uint64_t cycles;
4212 uint32_t tx_flush_interval;
4213
4214 cycle_timer_start(&pmd->perf_stats, &timer);
4215
4216 dynamic_txqs = p->port->dynamic_txqs;
4217 if (dynamic_txqs) {
4218 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4219 } else {
4220 tx_qid = pmd->static_tx_qid;
4221 }
4222
4223 output_cnt = dp_packet_batch_size(&p->output_pkts);
4224 ovs_assert(output_cnt > 0);
4225
4226 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4227 dp_packet_batch_init(&p->output_pkts);
4228
4229 /* Update time of the next flush. */
4230 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4231 p->flush_time = pmd->ctx.now + tx_flush_interval;
4232
4233 ovs_assert(pmd->n_output_batches > 0);
4234 pmd->n_output_batches--;
4235
4236 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4237 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4238
4239 /* Distribute send cycles evenly among transmitted packets and assign to
4240 * their respective rx queues. */
4241 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4242 for (i = 0; i < output_cnt; i++) {
4243 if (p->output_pkts_rxqs[i]) {
4244 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4245 RXQ_CYCLES_PROC_CURR, cycles);
4246 }
4247 }
4248
4249 return output_cnt;
4250 }
4251
4252 static int
4253 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4254 bool force)
4255 {
4256 struct tx_port *p;
4257 int output_cnt = 0;
4258
4259 if (!pmd->n_output_batches) {
4260 return 0;
4261 }
4262
4263 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4264 if (!dp_packet_batch_is_empty(&p->output_pkts)
4265 && (force || pmd->ctx.now >= p->flush_time)) {
4266 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4267 }
4268 }
4269 return output_cnt;
4270 }
4271
4272 static int
4273 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4274 struct dp_netdev_rxq *rxq,
4275 odp_port_t port_no)
4276 {
4277 struct pmd_perf_stats *s = &pmd->perf_stats;
4278 struct dp_packet_batch batch;
4279 struct cycle_timer timer;
4280 int error;
4281 int batch_cnt = 0;
4282 int rem_qlen = 0, *qlen_p = NULL;
4283 uint64_t cycles;
4284
4285 /* Measure duration for polling and processing rx burst. */
4286 cycle_timer_start(&pmd->perf_stats, &timer);
4287
4288 pmd->ctx.last_rxq = rxq;
4289 dp_packet_batch_init(&batch);
4290
4291 /* Fetch the rx queue length only for vhostuser ports. */
4292 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4293 qlen_p = &rem_qlen;
4294 }
4295
4296 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4297 if (!error) {
4298 /* At least one packet received. */
4299 *recirc_depth_get() = 0;
4300 pmd_thread_ctx_time_update(pmd);
4301 batch_cnt = batch.count;
4302 if (pmd_perf_metrics_enabled(pmd)) {
4303 /* Update batch histogram. */
4304 s->current.batches++;
4305 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4306 /* Update the maximum vhost rx queue fill level. */
4307 if (rxq->is_vhost && rem_qlen >= 0) {
4308 uint32_t qfill = batch_cnt + rem_qlen;
4309 if (qfill > s->current.max_vhost_qfill) {
4310 s->current.max_vhost_qfill = qfill;
4311 }
4312 }
4313 }
4314 /* Process packet batch. */
4315 dp_netdev_input(pmd, &batch, port_no);
4316
4317 /* Assign processing cycles to rx queue. */
4318 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4319 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4320
4321 dp_netdev_pmd_flush_output_packets(pmd, false);
4322 } else {
4323 /* Discard cycles. */
4324 cycle_timer_stop(&pmd->perf_stats, &timer);
4325 if (error != EAGAIN && error != EOPNOTSUPP) {
4326 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4327
4328 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4329 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4330 }
4331 }
4332
4333 pmd->ctx.last_rxq = NULL;
4334
4335 return batch_cnt;
4336 }
4337
4338 static struct tx_port *
4339 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4340 {
4341 struct tx_port *tx;
4342
4343 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4344 if (tx->port->port_no == port_no) {
4345 return tx;
4346 }
4347 }
4348
4349 return NULL;
4350 }
4351
4352 static int
4353 port_reconfigure(struct dp_netdev_port *port)
4354 {
4355 struct netdev *netdev = port->netdev;
4356 int i, err;
4357
4358 /* Closes the existing 'rxq's. */
4359 for (i = 0; i < port->n_rxq; i++) {
4360 netdev_rxq_close(port->rxqs[i].rx);
4361 port->rxqs[i].rx = NULL;
4362 }
4363 unsigned last_nrxq = port->n_rxq;
4364 port->n_rxq = 0;
4365
4366 /* Allows 'netdev' to apply the pending configuration changes. */
4367 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4368 err = netdev_reconfigure(netdev);
4369 if (err && (err != EOPNOTSUPP)) {
4370 VLOG_ERR("Failed to set interface %s new configuration",
4371 netdev_get_name(netdev));
4372 return err;
4373 }
4374 }
4375 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4376 port->rxqs = xrealloc(port->rxqs,
4377 sizeof *port->rxqs * netdev_n_rxq(netdev));
4378 /* Realloc 'used' counters for tx queues. */
4379 free(port->txq_used);
4380 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4381
4382 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4383 bool new_queue = i >= last_nrxq;
4384 if (new_queue) {
4385 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4386 }
4387
4388 port->rxqs[i].port = port;
4389 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4390
4391 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4392 if (err) {
4393 return err;
4394 }
4395 port->n_rxq++;
4396 }
4397
4398 /* Parse affinity list to apply configuration for new queues. */
4399 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4400
4401     /* If reconfiguration was successful, mark it as such so we can use it. */
4402 port->need_reconfigure = false;
4403
4404 return 0;
4405 }
4406
4407 struct rr_numa_list {
4408 struct hmap numas; /* Contains 'struct rr_numa' */
4409 };
4410
4411 struct rr_numa {
4412 struct hmap_node node;
4413
4414 int numa_id;
4415
4416 /* Non isolated pmds on numa node 'numa_id' */
4417 struct dp_netdev_pmd_thread **pmds;
4418 int n_pmds;
4419
4420 int cur_index;
4421 bool idx_inc;
4422 };
4423
4424 static struct rr_numa *
4425 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4426 {
4427 struct rr_numa *numa;
4428
4429 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4430 if (numa->numa_id == numa_id) {
4431 return numa;
4432 }
4433 }
4434
4435 return NULL;
4436 }
4437
4438 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4439 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4440 * Returns NULL if 'rr' numa list is empty. */
4441 static struct rr_numa *
4442 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4443 {
4444 struct hmap_node *node = NULL;
4445
4446 if (numa) {
4447 node = hmap_next(&rr->numas, &numa->node);
4448 }
4449 if (!node) {
4450 node = hmap_first(&rr->numas);
4451 }
4452
4453 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4454 }
4455
4456 static void
4457 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4458 {
4459 struct dp_netdev_pmd_thread *pmd;
4460 struct rr_numa *numa;
4461
4462 hmap_init(&rr->numas);
4463
4464 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4465 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4466 continue;
4467 }
4468
4469 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4470 if (!numa) {
4471 numa = xzalloc(sizeof *numa);
4472 numa->numa_id = pmd->numa_id;
4473 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4474 }
4475 numa->n_pmds++;
4476 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4477 numa->pmds[numa->n_pmds - 1] = pmd;
4478         /* At least one pmd, so initialise cur_index and idx_inc. */
4479 numa->cur_index = 0;
4480 numa->idx_inc = true;
4481 }
4482 }
4483
4484 /*
4485 * Returns the next pmd from the numa node.
4486 *
4487 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4488 * either an up or down walk, switching between up/down when the first or last
4489 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4490 *
4491 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4492 * core reached. e.g. 1,2,3,1,2,3,1,2...
4493 */
4494 static struct dp_netdev_pmd_thread *
4495 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4496 {
4497 int numa_idx = numa->cur_index;
4498
4499 if (numa->idx_inc == true) {
4500 /* Incrementing through list of pmds. */
4501 if (numa->cur_index == numa->n_pmds-1) {
4502 /* Reached the last pmd. */
4503 if (updown) {
4504 numa->idx_inc = false;
4505 } else {
4506 numa->cur_index = 0;
4507 }
4508 } else {
4509 numa->cur_index++;
4510 }
4511 } else {
4512 /* Decrementing through list of pmds. */
4513 if (numa->cur_index == 0) {
4514 /* Reached the first pmd. */
4515 numa->idx_inc = true;
4516 } else {
4517 numa->cur_index--;
4518 }
4519 }
4520 return numa->pmds[numa_idx];
4521 }
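/* A short illustrative sketch, assuming a numa node whose 'pmds' array holds
 * three pmds at indices 0, 1 and 2, freshly populated by
 * rr_numa_list_populate():
 *
 *     successive rr_numa_get_pmd(numa, true)  calls: 0, 1, 2, 2, 1, 0, 0, ...
 *     successive rr_numa_get_pmd(numa, false) calls: 0, 1, 2, 0, 1, 2, ...
 *
 * i.e. 'updown' == true bounces between the ends of the pmd array, which
 * helps the cycle-based assignment spread the busiest queues across
 * different pmds. */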
4522
4523 static void
4524 rr_numa_list_destroy(struct rr_numa_list *rr)
4525 {
4526 struct rr_numa *numa;
4527
4528 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4529 free(numa->pmds);
4530 free(numa);
4531 }
4532 hmap_destroy(&rr->numas);
4533 }
4534
4535 /* Sort Rx Queues by the processing cycles they are consuming. */
4536 static int
4537 compare_rxq_cycles(const void *a, const void *b)
4538 {
4539 struct dp_netdev_rxq *qa;
4540 struct dp_netdev_rxq *qb;
4541 uint64_t cycles_qa, cycles_qb;
4542
4543 qa = *(struct dp_netdev_rxq **) a;
4544 qb = *(struct dp_netdev_rxq **) b;
4545
4546 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4547 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4548
4549 if (cycles_qa != cycles_qb) {
4550 return (cycles_qa < cycles_qb) ? 1 : -1;
4551 } else {
4552 /* Cycles are the same so tiebreak on port/queue id.
4553 * Tiebreaking (as opposed to return 0) ensures consistent
4554 * sort results across multiple OS's. */
4555 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4556 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4557 if (port_qa != port_qb) {
4558 return port_qa > port_qb ? 1 : -1;
4559 } else {
4560 return netdev_rxq_get_queue_id(qa->rx)
4561 - netdev_rxq_get_queue_id(qb->rx);
4562 }
4563 }
4564 }
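/* A brief usage sketch: because the comparator returns 1 when 'qa' consumed
 * fewer cycles than 'qb', qsort() produces a descending order, i.e. the
 * busiest rx queues come first:
 *
 *     qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
 *
 * after which rxqs[0] holds the queue with the largest RXQ_CYCLES_PROC_HIST
 * value (this is how rxq_scheduling() below uses it). */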
4565
4566 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4567 * queues and marks the pmds as isolated. Otherwise, assign non isolated
4568 * pmds to unpinned queues.
4569 *
4570 * The function doesn't touch the pmd threads, it just stores the assignment
4571 * in the 'pmd' member of each rxq. */
4572 static void
4573 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4574 {
4575 struct dp_netdev_port *port;
4576 struct rr_numa_list rr;
4577 struct rr_numa *non_local_numa = NULL;
4578 struct dp_netdev_rxq ** rxqs = NULL;
4579 int n_rxqs = 0;
4580 struct rr_numa *numa = NULL;
4581 int numa_id;
4582 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4583
4584 HMAP_FOR_EACH (port, node, &dp->ports) {
4585 if (!netdev_is_pmd(port->netdev)) {
4586 continue;
4587 }
4588
4589 for (int qid = 0; qid < port->n_rxq; qid++) {
4590 struct dp_netdev_rxq *q = &port->rxqs[qid];
4591
4592 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4593 struct dp_netdev_pmd_thread *pmd;
4594
4595 pmd = dp_netdev_get_pmd(dp, q->core_id);
4596 if (!pmd) {
4597 VLOG_WARN("There is no PMD thread on core %d. Queue "
4598 "%d on port \'%s\' will not be polled.",
4599 q->core_id, qid, netdev_get_name(port->netdev));
4600 } else {
4601 q->pmd = pmd;
4602 pmd->isolated = true;
4603 dp_netdev_pmd_unref(pmd);
4604 }
4605 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4606 uint64_t cycle_hist = 0;
4607
4608 if (n_rxqs == 0) {
4609 rxqs = xmalloc(sizeof *rxqs);
4610 } else {
4611 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4612 }
4613
4614 if (assign_cyc) {
4615 /* Sum the queue intervals and store the cycle history. */
4616 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4617 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4618 }
4619 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4620 cycle_hist);
4621 }
4622 /* Store the queue. */
4623 rxqs[n_rxqs++] = q;
4624 }
4625 }
4626 }
4627
4628 if (n_rxqs > 1 && assign_cyc) {
4629 /* Sort the queues in order of the processing cycles
4630 * they consumed during their last pmd interval. */
4631 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4632 }
4633
4634 rr_numa_list_populate(dp, &rr);
4635 /* Assign the sorted queues to pmds in round robin. */
4636 for (int i = 0; i < n_rxqs; i++) {
4637 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4638 numa = rr_numa_list_lookup(&rr, numa_id);
4639 if (!numa) {
4640 /* There are no pmds on the queue's local NUMA node.
4641 Round robin on the NUMA nodes that do have pmds. */
4642 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4643 if (!non_local_numa) {
4644 VLOG_ERR("There is no available (non-isolated) pmd "
4645 "thread for port \'%s\' queue %d. This queue "
4646 "will not be polled. Is pmd-cpu-mask set to "
4647 "zero? Or are all PMDs isolated to other "
4648 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4649 netdev_rxq_get_queue_id(rxqs[i]->rx));
4650 continue;
4651 }
4652 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
4653 VLOG_WARN("There's no available (non-isolated) pmd thread "
4654 "on numa node %d. Queue %d on port \'%s\' will "
4655 "be assigned to the pmd on core %d "
4656 "(numa node %d). Expect reduced performance.",
4657 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4658 netdev_rxq_get_name(rxqs[i]->rx),
4659 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4660 } else {
4661 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4662 if (assign_cyc) {
4663 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4664 "rx queue %d "
4665 "(measured processing cycles %"PRIu64").",
4666 rxqs[i]->pmd->core_id, numa_id,
4667 netdev_rxq_get_name(rxqs[i]->rx),
4668 netdev_rxq_get_queue_id(rxqs[i]->rx),
4669 dp_netdev_rxq_get_cycles(rxqs[i],
4670 RXQ_CYCLES_PROC_HIST));
4671 } else {
4672 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4673 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4674 netdev_rxq_get_name(rxqs[i]->rx),
4675 netdev_rxq_get_queue_id(rxqs[i]->rx));
4676 }
4677 }
4678 }
4679
4680 rr_numa_list_destroy(&rr);
4681 free(rxqs);
4682 }
4683
4684 static void
4685 reload_affected_pmds(struct dp_netdev *dp)
4686 {
4687 struct dp_netdev_pmd_thread *pmd;
4688
4689 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4690 if (pmd->need_reload) {
4691 flow_mark_flush(pmd);
4692 dp_netdev_reload_pmd__(pmd);
4693 }
4694 }
4695
4696 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4697 if (pmd->need_reload) {
4698 if (pmd->core_id != NON_PMD_CORE_ID) {
4699 bool reload;
4700
4701 do {
4702 atomic_read_explicit(&pmd->reload, &reload,
4703 memory_order_acquire);
4704 } while (reload);
4705 }
4706 pmd->need_reload = false;
4707 }
4708 }
4709 }
4710
4711 static void
4712 reconfigure_pmd_threads(struct dp_netdev *dp)
4713 OVS_REQUIRES(dp->port_mutex)
4714 {
4715 struct dp_netdev_pmd_thread *pmd;
4716 struct ovs_numa_dump *pmd_cores;
4717 struct ovs_numa_info_core *core;
4718 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4719 struct hmapx_node *node;
4720 bool changed = false;
4721 bool need_to_adjust_static_tx_qids = false;
4722
4723 /* The pmd threads should be started only if there's a pmd port in the
4724 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4725 * NR_PMD_THREADS per numa node. */
4726 if (!has_pmd_port(dp)) {
4727 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4728 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4729 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4730 } else {
4731 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4732 }
4733
4734     /* We need to adjust 'static_tx_qid's only if we're reducing the number
4735      * of PMD threads. Otherwise, new threads will allocate all the freed ids. */
4736 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4737 /* Adjustment is required to keep 'static_tx_qid's sequential and
4738 * avoid possible issues, for example, imbalanced tx queue usage
4739 * and unnecessary locking caused by remapping on netdev level. */
4740 need_to_adjust_static_tx_qids = true;
4741 }
4742
4743 /* Check for unwanted pmd threads */
4744 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4745 if (pmd->core_id == NON_PMD_CORE_ID) {
4746 continue;
4747 }
4748 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4749 pmd->core_id)) {
4750 hmapx_add(&to_delete, pmd);
4751 } else if (need_to_adjust_static_tx_qids) {
4752 atomic_store_relaxed(&pmd->reload_tx_qid, true);
4753 pmd->need_reload = true;
4754 }
4755 }
4756
4757 HMAPX_FOR_EACH (node, &to_delete) {
4758 pmd = (struct dp_netdev_pmd_thread *) node->data;
4759 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4760 pmd->numa_id, pmd->core_id);
4761 dp_netdev_del_pmd(dp, pmd);
4762 }
4763 changed = !hmapx_is_empty(&to_delete);
4764 hmapx_destroy(&to_delete);
4765
4766 if (need_to_adjust_static_tx_qids) {
4767 /* 'static_tx_qid's are not sequential now.
4768 * Reload remaining threads to fix this. */
4769 reload_affected_pmds(dp);
4770 }
4771
4772 /* Check for required new pmd threads */
4773 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4774 pmd = dp_netdev_get_pmd(dp, core->core_id);
4775 if (!pmd) {
4776 pmd = xzalloc(sizeof *pmd);
4777 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4778 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
4779 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4780 pmd->numa_id, pmd->core_id);
4781 changed = true;
4782 } else {
4783 dp_netdev_pmd_unref(pmd);
4784 }
4785 }
4786
4787 if (changed) {
4788 struct ovs_numa_info_numa *numa;
4789
4790 /* Log the number of pmd threads per numa node. */
4791 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4792 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4793 numa->n_cores, numa->numa_id);
4794 }
4795 }
4796
4797 ovs_numa_dump_destroy(pmd_cores);
4798 }
4799
4800 static void
4801 pmd_remove_stale_ports(struct dp_netdev *dp,
4802 struct dp_netdev_pmd_thread *pmd)
4803 OVS_EXCLUDED(pmd->port_mutex)
4804 OVS_REQUIRES(dp->port_mutex)
4805 {
4806 struct rxq_poll *poll, *poll_next;
4807 struct tx_port *tx, *tx_next;
4808
4809 ovs_mutex_lock(&pmd->port_mutex);
4810 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4811 struct dp_netdev_port *port = poll->rxq->port;
4812
4813 if (port->need_reconfigure
4814 || !hmap_contains(&dp->ports, &port->node)) {
4815 dp_netdev_del_rxq_from_pmd(pmd, poll);
4816 }
4817 }
4818 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4819 struct dp_netdev_port *port = tx->port;
4820
4821 if (port->need_reconfigure
4822 || !hmap_contains(&dp->ports, &port->node)) {
4823 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4824 }
4825 }
4826 ovs_mutex_unlock(&pmd->port_mutex);
4827 }
4828
4829 /* Must be called each time a port is added/removed or the cmask changes.
4830 * This creates and destroys pmd threads, reconfigures ports, opens their
4831 * rxqs and assigns all rxqs/txqs to pmd threads. */
4832 static void
4833 reconfigure_datapath(struct dp_netdev *dp)
4834 OVS_REQUIRES(dp->port_mutex)
4835 {
4836 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
4837 struct dp_netdev_pmd_thread *pmd;
4838 struct dp_netdev_port *port;
4839 int wanted_txqs;
4840
4841 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4842
4843 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4844 * on the system and the user configuration. */
4845 reconfigure_pmd_threads(dp);
4846
4847 wanted_txqs = cmap_count(&dp->poll_threads);
4848
4849 /* The number of pmd threads might have changed, or a port can be new:
4850 * adjust the txqs. */
4851 HMAP_FOR_EACH (port, node, &dp->ports) {
4852 netdev_set_tx_multiq(port->netdev, wanted_txqs);
4853 }
4854
4855 /* Step 2: Remove from the pmd threads ports that have been removed or
4856 * need reconfiguration. */
4857
4858 /* Check for all the ports that need reconfiguration. We cache this in
4859 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4860 * change at any time. */
4861 HMAP_FOR_EACH (port, node, &dp->ports) {
4862 if (netdev_is_reconf_required(port->netdev)) {
4863 port->need_reconfigure = true;
4864 }
4865 }
4866
4867 /* Remove from the pmd threads all the ports that have been deleted or
4868 * need reconfiguration. */
4869 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4870 pmd_remove_stale_ports(dp, pmd);
4871 }
4872
4873 /* Reload affected pmd threads. We must wait for the pmd threads before
4874 * reconfiguring the ports, because a port cannot be reconfigured while
4875 * it's being used. */
4876 reload_affected_pmds(dp);
4877
4878 /* Step 3: Reconfigure ports. */
4879
4880 /* We only reconfigure the ports that we determined above, because they're
4881 * not being used by any pmd thread at the moment. If a port fails to
4882 * reconfigure we remove it from the datapath. */
4883 struct dp_netdev_port *next_port;
4884 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
4885 int err;
4886
4887 if (!port->need_reconfigure) {
4888 continue;
4889 }
4890
4891 err = port_reconfigure(port);
4892 if (err) {
4893 hmap_remove(&dp->ports, &port->node);
4894 seq_change(dp->port_seq);
4895 port_destroy(port);
4896 } else {
4897 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
4898 }
4899 }
4900
4901 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4902 * for now, we just update the 'pmd' pointer in each rxq to point to the
4903 * wanted thread according to the scheduling policy. */
4904
4905 /* Reset all the pmd threads to non isolated. */
4906 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4907 pmd->isolated = false;
4908 }
4909
4910 /* Reset all the queues to unassigned */
4911 HMAP_FOR_EACH (port, node, &dp->ports) {
4912 for (int i = 0; i < port->n_rxq; i++) {
4913 port->rxqs[i].pmd = NULL;
4914 }
4915 }
4916
4917 /* Add pinned queues and mark pmd threads isolated. */
4918 rxq_scheduling(dp, true);
4919
4920 /* Add non-pinned queues. */
4921 rxq_scheduling(dp, false);
4922
4923 /* Step 5: Remove queues not compliant with new scheduling. */
4924
4925 /* Count all the threads that will have at least one queue to poll. */
4926 HMAP_FOR_EACH (port, node, &dp->ports) {
4927 for (int qid = 0; qid < port->n_rxq; qid++) {
4928 struct dp_netdev_rxq *q = &port->rxqs[qid];
4929
4930 if (q->pmd) {
4931 hmapx_add(&busy_threads, q->pmd);
4932 }
4933 }
4934 }
4935
4936 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4937 struct rxq_poll *poll, *poll_next;
4938
4939 ovs_mutex_lock(&pmd->port_mutex);
4940 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4941 if (poll->rxq->pmd != pmd) {
4942 dp_netdev_del_rxq_from_pmd(pmd, poll);
4943
4944 /* This pmd might sleep after this step if it has no rxq
4945 * remaining. Tell it to busy wait for new assignment if it
4946 * has at least one scheduled queue. */
4947 if (hmap_count(&pmd->poll_list) == 0 &&
4948 hmapx_contains(&busy_threads, pmd)) {
4949 atomic_store_relaxed(&pmd->wait_for_reload, true);
4950 }
4951 }
4952 }
4953 ovs_mutex_unlock(&pmd->port_mutex);
4954 }
4955
4956 hmapx_destroy(&busy_threads);
4957
4958 /* Reload affected pmd threads. We must wait for the pmd threads to remove
4959      * the old queues before re-adding them, otherwise a queue can be polled by
4960 * two threads at the same time. */
4961 reload_affected_pmds(dp);
4962
4963 /* Step 6: Add queues from scheduling, if they're not there already. */
4964 HMAP_FOR_EACH (port, node, &dp->ports) {
4965 if (!netdev_is_pmd(port->netdev)) {
4966 continue;
4967 }
4968
4969 for (int qid = 0; qid < port->n_rxq; qid++) {
4970 struct dp_netdev_rxq *q = &port->rxqs[qid];
4971
4972 if (q->pmd) {
4973 ovs_mutex_lock(&q->pmd->port_mutex);
4974 dp_netdev_add_rxq_to_pmd(q->pmd, q);
4975 ovs_mutex_unlock(&q->pmd->port_mutex);
4976 }
4977 }
4978 }
4979
4980 /* Add every port to the tx cache of every pmd thread, if it's not
4981 * there already and if this pmd has at least one rxq to poll. */
4982 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4983 ovs_mutex_lock(&pmd->port_mutex);
4984 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
4985 HMAP_FOR_EACH (port, node, &dp->ports) {
4986 dp_netdev_add_port_tx_to_pmd(pmd, port);
4987 }
4988 }
4989 ovs_mutex_unlock(&pmd->port_mutex);
4990 }
4991
4992 /* Reload affected pmd threads. */
4993 reload_affected_pmds(dp);
4994
4995 /* Check if PMD Auto LB is to be enabled */
4996 set_pmd_auto_lb(dp);
4997 }
4998
4999 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5000 static bool
5001 ports_require_restart(const struct dp_netdev *dp)
5002 OVS_REQUIRES(dp->port_mutex)
5003 {
5004 struct dp_netdev_port *port;
5005
5006 HMAP_FOR_EACH (port, node, &dp->ports) {
5007 if (netdev_is_reconf_required(port->netdev)) {
5008 return true;
5009 }
5010 }
5011
5012 return false;
5013 }
5014
5015 /* Calculates the variance of the values stored in array 'a'. 'n' is the
5016  * number of elements in the array to be considered when calculating the
5017  * variance. Usage example: data array 'a' contains the processing load of
5018  * each pmd and 'n' is the number of PMDs. It returns the variance in the
5019  * processing load of the PMDs. */
5020 static uint64_t
5021 variance(uint64_t a[], int n)
5022 {
5023 /* Compute mean (average of elements). */
5024 uint64_t sum = 0;
5025 uint64_t mean = 0;
5026 uint64_t sqDiff = 0;
5027
5028 if (!n) {
5029 return 0;
5030 }
5031
5032 for (int i = 0; i < n; i++) {
5033 sum += a[i];
5034 }
5035
5036 if (sum) {
5037 mean = sum / n;
5038
5039 /* Compute sum squared differences with mean. */
5040 for (int i = 0; i < n; i++) {
5041 sqDiff += (a[i] - mean)*(a[i] - mean);
5042 }
5043 }
5044 return (sqDiff ? (sqDiff / n) : 0);
5045 }
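/* A small worked example of the integer arithmetic above, assuming three pmd
 * loads:
 *
 *     uint64_t loads[] = {40, 60, 80};
 *     uint64_t var = variance(loads, 3);
 *
 * mean = 180 / 3 = 60, sqDiff = 400 + 0 + 400 = 800, so 'var' ends up as
 * 800 / 3 = 266. */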
5046
5047
5048 /* Returns the variance in the PMDs' usage as part of a dry run of rxq
5049  * assignment to PMDs. */
5050 static bool
5051 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5052 uint32_t num_pmds, uint64_t *predicted_variance)
5053 OVS_REQUIRES(dp->port_mutex)
5054 {
5055 struct dp_netdev_port *port;
5056 struct dp_netdev_pmd_thread *pmd;
5057 struct dp_netdev_rxq **rxqs = NULL;
5058 struct rr_numa *numa = NULL;
5059 struct rr_numa_list rr;
5060 int n_rxqs = 0;
5061 bool ret = false;
5062 uint64_t *pmd_usage;
5063
5064 if (!predicted_variance) {
5065 return ret;
5066 }
5067
5068 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5069
5070 HMAP_FOR_EACH (port, node, &dp->ports) {
5071 if (!netdev_is_pmd(port->netdev)) {
5072 continue;
5073 }
5074
5075 for (int qid = 0; qid < port->n_rxq; qid++) {
5076 struct dp_netdev_rxq *q = &port->rxqs[qid];
5077 uint64_t cycle_hist = 0;
5078
5079 if (q->pmd->isolated) {
5080 continue;
5081 }
5082
5083 if (n_rxqs == 0) {
5084 rxqs = xmalloc(sizeof *rxqs);
5085 } else {
5086 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5087 }
5088
5089 /* Sum the queue intervals and store the cycle history. */
5090 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5091 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5092 }
5093 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5094 cycle_hist);
5095 /* Store the queue. */
5096 rxqs[n_rxqs++] = q;
5097 }
5098 }
5099 if (n_rxqs > 1) {
5100 /* Sort the queues in order of the processing cycles
5101 * they consumed during their last pmd interval. */
5102 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5103 }
5104 rr_numa_list_populate(dp, &rr);
5105
5106 for (int i = 0; i < n_rxqs; i++) {
5107 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5108 numa = rr_numa_list_lookup(&rr, numa_id);
5109 if (!numa) {
5110 /* Abort if cross NUMA polling. */
5111 VLOG_DBG("PMD auto lb dry run."
5112 " Aborting due to cross-numa polling.");
5113 goto cleanup;
5114 }
5115
5116 pmd = rr_numa_get_pmd(numa, true);
5117 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5118 "to be assigned port \'%s\' rx queue %d "
5119 "(measured processing cycles %"PRIu64").",
5120 pmd->core_id, numa_id,
5121 netdev_rxq_get_name(rxqs[i]->rx),
5122 netdev_rxq_get_queue_id(rxqs[i]->rx),
5123 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5124
5125 for (int id = 0; id < num_pmds; id++) {
5126 if (pmd->core_id == core_list[id]) {
5127 /* Add the processing cycles of rxq to pmd polling it. */
5128 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5129 RXQ_CYCLES_PROC_HIST);
5130 }
5131 }
5132 }
5133
5134 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5135 uint64_t total_cycles = 0;
5136
5137 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5138 continue;
5139 }
5140
5141 /* Get the total pmd cycles for an interval. */
5142 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5143 /* Estimate the cycles to cover all intervals. */
5144 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5145 for (int id = 0; id < num_pmds; id++) {
5146 if (pmd->core_id == core_list[id]) {
5147 if (pmd_usage[id]) {
5148 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5149 }
5150 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5151 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5152 }
5153 }
5154 }
5155 *predicted_variance = variance(pmd_usage, num_pmds);
5156 ret = true;
5157
5158 cleanup:
5159 rr_numa_list_destroy(&rr);
5160 free(rxqs);
5161 free(pmd_usage);
5162 return ret;
5163 }
5164
5165 /* Does a dry run of Rxq assignment to PMDs and returns true if it gives a
5166  * better distribution of load across the PMDs. */
5167 static bool
5168 pmd_rebalance_dry_run(struct dp_netdev *dp)
5169 OVS_REQUIRES(dp->port_mutex)
5170 {
5171 struct dp_netdev_pmd_thread *pmd;
5172 uint64_t *curr_pmd_usage;
5173
5174 uint64_t curr_variance;
5175 uint64_t new_variance;
5176 uint64_t improvement = 0;
5177 uint32_t num_pmds;
5178 uint32_t *pmd_corelist;
5179 struct rxq_poll *poll;
5180 bool ret;
5181
5182 num_pmds = cmap_count(&dp->poll_threads);
5183
5184 if (num_pmds > 1) {
5185 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5186 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5187 } else {
5188 return false;
5189 }
5190
5191 num_pmds = 0;
5192 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5193 uint64_t total_cycles = 0;
5194 uint64_t total_proc = 0;
5195
5196 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5197 continue;
5198 }
5199
5200 /* Get the total pmd cycles for an interval. */
5201 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5202 /* Estimate the cycles to cover all intervals. */
5203 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5204
5205 ovs_mutex_lock(&pmd->port_mutex);
5206 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5207 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5208 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5209 }
5210 }
5211 ovs_mutex_unlock(&pmd->port_mutex);
5212
5213 if (total_proc) {
5214 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5215 }
5216
5217 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5218 pmd->core_id, curr_pmd_usage[num_pmds]);
5219
5220 if (atomic_count_get(&pmd->pmd_overloaded)) {
5221 atomic_count_set(&pmd->pmd_overloaded, 0);
5222 }
5223
5224 pmd_corelist[num_pmds] = pmd->core_id;
5225 num_pmds++;
5226 }
5227
5228 curr_variance = variance(curr_pmd_usage, num_pmds);
5229 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5230
5231 if (ret) {
5232 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5233 " Predicted PMD variance: %"PRIu64"",
5234 curr_variance, new_variance);
5235
5236 if (new_variance < curr_variance) {
5237 improvement =
5238 ((curr_variance - new_variance) * 100) / curr_variance;
5239 }
5240 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5241 ret = false;
5242 }
5243 }
5244
5245 free(curr_pmd_usage);
5246 free(pmd_corelist);
5247 return ret;
5248 }
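/* Worked example of the acceptance check above: with a current variance of
 * 400 and a predicted variance of 250,
 *
 *     improvement = ((400 - 250) * 100) / 400 = 37
 *
 * which meets ALB_ACCEPTABLE_IMPROVEMENT (25), so the dry run reports that a
 * rebalance is worthwhile; a predicted variance of 350 would only give 12 and
 * be rejected. */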
5249
5250
5251 /* Returns true if the datapath flows need to be revalidated. */
5252 static bool
5253 dpif_netdev_run(struct dpif *dpif)
5254 {
5255 struct dp_netdev_port *port;
5256 struct dp_netdev *dp = get_dp_netdev(dpif);
5257 struct dp_netdev_pmd_thread *non_pmd;
5258 uint64_t new_tnl_seq;
5259 bool need_to_flush = true;
5260 bool pmd_rebalance = false;
5261 long long int now = time_msec();
5262 struct dp_netdev_pmd_thread *pmd;
5263
5264 ovs_mutex_lock(&dp->port_mutex);
5265 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5266 if (non_pmd) {
5267 ovs_mutex_lock(&dp->non_pmd_mutex);
5268 HMAP_FOR_EACH (port, node, &dp->ports) {
5269 if (!netdev_is_pmd(port->netdev)) {
5270 int i;
5271
5272 if (port->emc_enabled) {
5273 atomic_read_relaxed(&dp->emc_insert_min,
5274 &non_pmd->ctx.emc_insert_min);
5275 } else {
5276 non_pmd->ctx.emc_insert_min = 0;
5277 }
5278
5279 for (i = 0; i < port->n_rxq; i++) {
5280
5281 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5282 continue;
5283 }
5284
5285 if (dp_netdev_process_rxq_port(non_pmd,
5286 &port->rxqs[i],
5287 port->port_no)) {
5288 need_to_flush = false;
5289 }
5290 }
5291 }
5292 }
5293 if (need_to_flush) {
5294 /* We didn't receive anything in the process loop.
5295 * Check if we need to send something.
5296              * There were no time updates on the current iteration. */
5297 pmd_thread_ctx_time_update(non_pmd);
5298 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5299 }
5300
5301 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5302 ovs_mutex_unlock(&dp->non_pmd_mutex);
5303
5304 dp_netdev_pmd_unref(non_pmd);
5305 }
5306
5307 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5308 if (pmd_alb->is_enabled) {
5309 if (!pmd_alb->rebalance_poll_timer) {
5310 pmd_alb->rebalance_poll_timer = now;
5311 } else if ((pmd_alb->rebalance_poll_timer +
5312 pmd_alb->rebalance_intvl) < now) {
5313 pmd_alb->rebalance_poll_timer = now;
5314 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5315 if (atomic_count_get(&pmd->pmd_overloaded) >=
5316 PMD_RXQ_INTERVAL_MAX) {
5317 pmd_rebalance = true;
5318 break;
5319 }
5320 }
5321
5322 if (pmd_rebalance &&
5323 !dp_netdev_is_reconf_required(dp) &&
5324 !ports_require_restart(dp) &&
5325 pmd_rebalance_dry_run(dp)) {
5326 VLOG_INFO("PMD auto lb dry run."
5327 " requesting datapath reconfigure.");
5328 dp_netdev_request_reconfigure(dp);
5329 }
5330 }
5331 }
5332
5333 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5334 reconfigure_datapath(dp);
5335 }
5336 ovs_mutex_unlock(&dp->port_mutex);
5337
5338 tnl_neigh_cache_run();
5339 tnl_port_map_run();
5340 new_tnl_seq = seq_read(tnl_conf_seq);
5341
5342 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5343 dp->last_tnl_conf_seq = new_tnl_seq;
5344 return true;
5345 }
5346 return false;
5347 }
5348
5349 static void
5350 dpif_netdev_wait(struct dpif *dpif)
5351 {
5352 struct dp_netdev_port *port;
5353 struct dp_netdev *dp = get_dp_netdev(dpif);
5354
5355 ovs_mutex_lock(&dp_netdev_mutex);
5356 ovs_mutex_lock(&dp->port_mutex);
5357 HMAP_FOR_EACH (port, node, &dp->ports) {
5358 netdev_wait_reconf_required(port->netdev);
5359 if (!netdev_is_pmd(port->netdev)) {
5360 int i;
5361
5362 for (i = 0; i < port->n_rxq; i++) {
5363 netdev_rxq_wait(port->rxqs[i].rx);
5364 }
5365 }
5366 }
5367 ovs_mutex_unlock(&dp->port_mutex);
5368 ovs_mutex_unlock(&dp_netdev_mutex);
5369 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5370 }
5371
5372 static void
5373 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5374 {
5375 struct tx_port *tx_port_cached;
5376
5377 /* Flush all the queued packets. */
5378 dp_netdev_pmd_flush_output_packets(pmd, true);
5379 /* Free all used tx queue ids. */
5380 dpif_netdev_xps_revalidate_pmd(pmd, true);
5381
5382 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5383 free(tx_port_cached);
5384 }
5385 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5386 free(tx_port_cached);
5387 }
5388 }
5389
5390 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5391 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5392 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5393 * one txq. */
5394 static void
5395 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5396 OVS_REQUIRES(pmd->port_mutex)
5397 {
5398 struct tx_port *tx_port, *tx_port_cached;
5399
5400 pmd_free_cached_ports(pmd);
5401 hmap_shrink(&pmd->send_port_cache);
5402 hmap_shrink(&pmd->tnl_port_cache);
5403
5404 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5405 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5406 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5407 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5408 hash_port_no(tx_port_cached->port->port_no));
5409 }
5410
5411 if (netdev_n_txq(tx_port->port->netdev)) {
5412 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5413 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5414 hash_port_no(tx_port_cached->port->port_no));
5415 }
5416 }
5417 }
5418
5419 static void
5420 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5421 {
5422 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5423 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5424 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5425 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5426 }
5427 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5428
5429 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5430 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5431 }
5432
5433 static void
5434 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5435 {
5436 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5437 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5438 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5439 }
5440
5441 static int
5442 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5443 struct polled_queue **ppoll_list)
5444 {
5445 struct polled_queue *poll_list = *ppoll_list;
5446 struct rxq_poll *poll;
5447 int i;
5448
5449 ovs_mutex_lock(&pmd->port_mutex);
5450 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5451 * sizeof *poll_list);
5452
5453 i = 0;
5454 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5455 poll_list[i].rxq = poll->rxq;
5456 poll_list[i].port_no = poll->rxq->port->port_no;
5457 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5458 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5459 poll_list[i].change_seq =
5460 netdev_get_change_seq(poll->rxq->port->netdev);
5461 i++;
5462 }
5463
5464 pmd_load_cached_ports(pmd);
5465
5466 ovs_mutex_unlock(&pmd->port_mutex);
5467
5468 *ppoll_list = poll_list;
5469 return i;
5470 }
5471
5472 static void *
5473 pmd_thread_main(void *f_)
5474 {
5475 struct dp_netdev_pmd_thread *pmd = f_;
5476 struct pmd_perf_stats *s = &pmd->perf_stats;
5477 unsigned int lc = 0;
5478 struct polled_queue *poll_list;
5479 bool wait_for_reload = false;
5480 bool reload_tx_qid;
5481 bool exiting;
5482 bool reload;
5483 int poll_cnt;
5484 int i;
5485 int process_packets = 0;
5486
5487 poll_list = NULL;
5488
5489 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5490 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5491 ovs_numa_thread_setaffinity_core(pmd->core_id);
5492 dpdk_set_lcore_id(pmd->core_id);
5493 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5494 dfc_cache_init(&pmd->flow_cache);
5495 pmd_alloc_static_tx_qid(pmd);
5496
5497 reload:
5498 atomic_count_init(&pmd->pmd_overloaded, 0);
5499
5500 /* List port/core affinity */
5501 for (i = 0; i < poll_cnt; i++) {
5502 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5503 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5504 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5505 /* Reset the rxq current cycles counter. */
5506 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5507 }
5508
5509 if (!poll_cnt) {
5510 if (wait_for_reload) {
5511 /* Don't sleep, control thread will ask for a reload shortly. */
5512 do {
5513 atomic_read_explicit(&pmd->reload, &reload,
5514 memory_order_acquire);
5515 } while (!reload);
5516 } else {
5517 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5518 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5519 poll_block();
5520 }
5521 }
5522 }
5523
5524 pmd->intrvl_tsc_prev = 0;
5525 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5526 cycles_counter_update(s);
5527 /* Protect pmd stats from external clearing while polling. */
5528 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5529 for (;;) {
5530 uint64_t rx_packets = 0, tx_packets = 0;
5531
5532 pmd_perf_start_iteration(s);
5533
5534 for (i = 0; i < poll_cnt; i++) {
5535
5536 if (!poll_list[i].rxq_enabled) {
5537 continue;
5538 }
5539
5540 if (poll_list[i].emc_enabled) {
5541 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5542 &pmd->ctx.emc_insert_min);
5543 } else {
5544 pmd->ctx.emc_insert_min = 0;
5545 }
5546
5547 process_packets =
5548 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5549 poll_list[i].port_no);
5550 rx_packets += process_packets;
5551 }
5552
5553 if (!rx_packets) {
5554 /* We didn't receive anything in the process loop.
5555 * Check if we need to send something.
5556 * There were no time updates in the current iteration. */
5557 pmd_thread_ctx_time_update(pmd);
5558 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5559 }
5560
5561 if (lc++ > 1024) {
5562 lc = 0;
5563
5564 coverage_try_clear();
5565 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
5566 if (!ovsrcu_try_quiesce()) {
5567 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
5568 }
5569
5570 for (i = 0; i < poll_cnt; i++) {
5571 uint64_t current_seq =
5572 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5573 if (poll_list[i].change_seq != current_seq) {
5574 poll_list[i].change_seq = current_seq;
5575 poll_list[i].rxq_enabled =
5576 netdev_rxq_enabled(poll_list[i].rxq->rx);
5577 }
5578 }
5579 }
5580
5581 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5582 if (OVS_UNLIKELY(reload)) {
5583 break;
5584 }
5585
5586 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5587 pmd_perf_metrics_enabled(pmd));
5588 }
5589 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
5590
5591 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5592 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
5593 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
5594 atomic_read_relaxed(&pmd->exit, &exiting);
5595 /* Signal here to make sure the pmd finishes
5596 * reloading the updated configuration. */
5597 dp_netdev_pmd_reload_done(pmd);
5598
5599 if (reload_tx_qid) {
5600 pmd_free_static_tx_qid(pmd);
5601 pmd_alloc_static_tx_qid(pmd);
5602 }
5603
5604 if (!exiting) {
5605 goto reload;
5606 }
5607
5608 pmd_free_static_tx_qid(pmd);
5609 dfc_cache_uninit(&pmd->flow_cache);
5610 free(poll_list);
5611 pmd_free_cached_ports(pmd);
5612 return NULL;
5613 }
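/* Editorial sketch (not upstream text) of the reload protocol driving the
 * loop above.  The control path (dp_netdev_reload_pmd__(), defined elsewhere
 * in this file) requests a reload roughly as follows:
 *
 *     atomic_store_relaxed(&pmd->exit, true);       // only when tearing down
 *     seq_change(pmd->reload_seq);                  // wake a PMD sleeping in
 *                                                   // seq_wait()/poll_block()
 *     atomic_store_explicit(&pmd->reload, true,     // pairs with the acquire
 *                           memory_order_release);  // read in the loop above
 *
 * The PMD notices 'reload' with acquire semantics, leaves the poll loop,
 * re-reads its queues and ports, and acknowledges through
 * dp_netdev_pmd_reload_done(), which clears 'reload' with release
 * semantics. */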
5614
5615 static void
5616 dp_netdev_disable_upcall(struct dp_netdev *dp)
5617 OVS_ACQUIRES(dp->upcall_rwlock)
5618 {
5619 fat_rwlock_wrlock(&dp->upcall_rwlock);
5620 }
5621
5622 \f
5623 /* Meters */
5624 static void
5625 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5626 struct ofputil_meter_features *features)
5627 {
5628 features->max_meters = MAX_METERS;
5629 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5630 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5631 features->max_bands = MAX_BANDS;
5632 features->max_color = 0;
5633 }
5634
5635 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5636 * that exceed a band are dropped in-place. */
5637 static void
5638 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5639 uint32_t meter_id, long long int now)
5640 {
5641 struct dp_meter *meter;
5642 struct dp_meter_band *band;
5643 struct dp_packet *packet;
5644 long long int long_delta_t; /* msec */
5645 uint32_t delta_t; /* msec */
5646 const size_t cnt = dp_packet_batch_size(packets_);
5647 uint32_t bytes, volume;
5648 int exceeded_band[NETDEV_MAX_BURST];
5649 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5650 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5651
5652 if (meter_id >= MAX_METERS) {
5653 return;
5654 }
5655
5656 meter_lock(dp, meter_id);
5657 meter = dp->meters[meter_id];
5658 if (!meter) {
5659 goto out;
5660 }
5661
5662 /* Initialize as negative values. */
5663 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5664 /* Initialize as zeroes. */
5665 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5666
5667 /* All packets will hit the meter at the same time. */
5668 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
5669
5670 /* Make sure delta_t will not be too large, so that bucket will not
5671 * wrap around below. */
5672 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5673 ? meter->max_delta_t : (uint32_t)long_delta_t;
5674
5675 /* Update meter stats. */
5676 meter->used = now;
5677 meter->packet_count += cnt;
5678 bytes = 0;
5679 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5680 bytes += dp_packet_size(packet);
5681 }
5682 meter->byte_count += bytes;
5683
5684 /* Meters can operate in terms of packets per second or kilobits per
5685 * second. */
5686 if (meter->flags & OFPMF13_PKTPS) {
5687 /* Rate in packets/second, bucket 1/1000 packets. */
5688 /* msec * packets/sec = 1/1000 packets. */
5689 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5690 } else {
5691 /* Rate in kbps, bucket in bits. */
5692 /* msec * kbps = bits */
5693 volume = bytes * 8;
5694 }
5695
5696 /* Update all bands and find the one hit with the highest rate for each
5697 * packet (if any). */
5698 for (int m = 0; m < meter->n_bands; ++m) {
5699 band = &meter->bands[m];
5700
5701 /* Update band's bucket. */
5702 band->bucket += delta_t * band->up.rate;
5703 if (band->bucket > band->up.burst_size) {
5704 band->bucket = band->up.burst_size;
5705 }
5706
5707 /* Drain the bucket for all the packets, if possible. */
5708 if (band->bucket >= volume) {
5709 band->bucket -= volume;
5710 } else {
5711 int band_exceeded_pkt;
5712
5713 /* Band limit hit, must process packet-by-packet. */
5714 if (meter->flags & OFPMF13_PKTPS) {
5715 band_exceeded_pkt = band->bucket / 1000;
5716 band->bucket %= 1000; /* Remainder stays in bucket. */
5717
5718 /* Update the exceeding band for each exceeding packet.
5719 * (Only one band will be fired by a packet, and that
5720 * can be different for each packet.) */
5721 for (int i = band_exceeded_pkt; i < cnt; i++) {
5722 if (band->up.rate > exceeded_rate[i]) {
5723 exceeded_rate[i] = band->up.rate;
5724 exceeded_band[i] = m;
5725 }
5726 }
5727 } else {
5728 /* Packet sizes differ, must process one-by-one. */
5729 band_exceeded_pkt = cnt;
5730 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5731 uint32_t bits = dp_packet_size(packet) * 8;
5732
5733 if (band->bucket >= bits) {
5734 band->bucket -= bits;
5735 } else {
5736 if (i < band_exceeded_pkt) {
5737 band_exceeded_pkt = i;
5738 }
5739 /* Update the exceeding band for the exceeding packet.
5740 * (Only one band will be fired by a packet, and that
5741 * can be different for each packet.) */
5742 if (band->up.rate > exceeded_rate[i]) {
5743 exceeded_rate[i] = band->up.rate;
5744 exceeded_band[i] = m;
5745 }
5746 }
5747 }
5748 }
5749 /* Remember the first exceeding packet. */
5750 if (exceeded_pkt > band_exceeded_pkt) {
5751 exceeded_pkt = band_exceeded_pkt;
5752 }
5753 }
5754 }
5755
5756 /* Fire the highest rate band exceeded by each packet, and drop
5757 * packets if needed. */
5758 size_t j;
5759 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
5760 if (exceeded_band[j] >= 0) {
5761 /* Meter drop packet. */
5762 band = &meter->bands[exceeded_band[j]];
5763 band->packet_count += 1;
5764 band->byte_count += dp_packet_size(packet);
5765
5766 dp_packet_delete(packet);
5767 } else {
5768 /* Meter accepts packet. */
5769 dp_packet_batch_refill(packets_, packet, j);
5770 }
5771 }
5772 out:
5773 meter_unlock(dp, meter_id);
5774 }
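/* Worked example (editorial, made-up numbers) for the kbps path above:
 * with band->up.rate = 1000 (kbps) and a bucket currently holding
 * 20,000 bits, a delta_t of 10 ms refills it by 10 * 1000 = 10,000 bits
 * (capped at up.burst_size).  A batch of 4 packets of 1500 bytes needs
 * volume = 4 * 1500 * 8 = 48,000 bits, so the whole-batch drain fails and
 * the per-packet loop runs: each packet that does not fit in the remaining
 * bucket records this band in exceeded_band[] (unless a higher-rate band
 * already claimed it) and is deleted in the final refill loop. */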
5775
5776 /* Meter set/get/del processing is still single-threaded. */
5777 static int
5778 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
5779 struct ofputil_meter_config *config)
5780 {
5781 struct dp_netdev *dp = get_dp_netdev(dpif);
5782 uint32_t mid = meter_id.uint32;
5783 struct dp_meter *meter;
5784 int i;
5785
5786 if (mid >= MAX_METERS) {
5787 return EFBIG; /* Meter_id out of range. */
5788 }
5789
5790 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
5791 return EBADF; /* Unsupported flags set */
5792 }
5793
5794 if (config->n_bands > MAX_BANDS) {
5795 return EINVAL;
5796 }
5797
5798 for (i = 0; i < config->n_bands; ++i) {
5799 switch (config->bands[i].type) {
5800 case OFPMBT13_DROP:
5801 break;
5802 default:
5803 return ENODEV; /* Unsupported band type */
5804 }
5805 }
5806
5807 /* Allocate meter */
5808 meter = xzalloc(sizeof *meter
5809 + config->n_bands * sizeof(struct dp_meter_band));
5810
5811 meter->flags = config->flags;
5812 meter->n_bands = config->n_bands;
5813 meter->max_delta_t = 0;
5814 meter->used = time_usec();
5815
5816 /* Set up bands. */
5817 for (i = 0; i < config->n_bands; ++i) {
5818 uint32_t band_max_delta_t;
5819
5820 /* Set burst size to a workable value if none specified. */
5821 if (config->bands[i].burst_size == 0) {
5822 config->bands[i].burst_size = config->bands[i].rate;
5823 }
5824
5825 meter->bands[i].up = config->bands[i];
5826 /* Convert burst size to the bucket units: */
5827 /* pkts => 1/1000 packets, kilobits => bits. */
5828 meter->bands[i].up.burst_size *= 1000;
5829 /* Initialize bucket to empty. */
5830 meter->bands[i].bucket = 0;
5831
5832 /* Figure out max delta_t that is enough to fill any bucket. */
5833 band_max_delta_t
5834 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5835 if (band_max_delta_t > meter->max_delta_t) {
5836 meter->max_delta_t = band_max_delta_t;
5837 }
5838 }
5839
5840 meter_lock(dp, mid);
5841 dp_delete_meter(dp, mid); /* Free existing meter, if any */
5842 dp->meters[mid] = meter;
5843 meter_unlock(dp, mid);
5844
5845 return 0;
5846 }
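/* Editorial example of the unit conversion above, using made-up values:
 * a kbps meter band configured with rate = 500 (kbps) and burst_size =
 * 2000 (kilobits) stores up.burst_size = 2000 * 1000 = 2,000,000 bits.
 * Since the bucket gains 'rate' bits per millisecond, filling it from
 * empty takes 2,000,000 / 500 = 4000 ms, which is this band's
 * contribution to 'max_delta_t'.  A pktps band does the same arithmetic
 * in 1/1000-packet units. */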
5847
5848 static int
5849 dpif_netdev_meter_get(const struct dpif *dpif,
5850 ofproto_meter_id meter_id_,
5851 struct ofputil_meter_stats *stats, uint16_t n_bands)
5852 {
5853 const struct dp_netdev *dp = get_dp_netdev(dpif);
5854 uint32_t meter_id = meter_id_.uint32;
5855 int retval = 0;
5856
5857 if (meter_id >= MAX_METERS) {
5858 return EFBIG;
5859 }
5860
5861 meter_lock(dp, meter_id);
5862 const struct dp_meter *meter = dp->meters[meter_id];
5863 if (!meter) {
5864 retval = ENOENT;
5865 goto done;
5866 }
5867 if (stats) {
5868 int i = 0;
5869
5870 stats->packet_in_count = meter->packet_count;
5871 stats->byte_in_count = meter->byte_count;
5872
5873 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5874 stats->bands[i].packet_count = meter->bands[i].packet_count;
5875 stats->bands[i].byte_count = meter->bands[i].byte_count;
5876 }
5877
5878 stats->n_bands = i;
5879 }
5880
5881 done:
5882 meter_unlock(dp, meter_id);
5883 return retval;
5884 }
5885
5886 static int
5887 dpif_netdev_meter_del(struct dpif *dpif,
5888 ofproto_meter_id meter_id_,
5889 struct ofputil_meter_stats *stats, uint16_t n_bands)
5890 {
5891 struct dp_netdev *dp = get_dp_netdev(dpif);
5892 int error;
5893
5894 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5895 if (!error) {
5896 uint32_t meter_id = meter_id_.uint32;
5897
5898 meter_lock(dp, meter_id);
5899 dp_delete_meter(dp, meter_id);
5900 meter_unlock(dp, meter_id);
5901 }
5902 return error;
5903 }
5904
5905 \f
5906 static void
5907 dpif_netdev_disable_upcall(struct dpif *dpif)
5908 OVS_NO_THREAD_SAFETY_ANALYSIS
5909 {
5910 struct dp_netdev *dp = get_dp_netdev(dpif);
5911 dp_netdev_disable_upcall(dp);
5912 }
5913
5914 static void
5915 dp_netdev_enable_upcall(struct dp_netdev *dp)
5916 OVS_RELEASES(dp->upcall_rwlock)
5917 {
5918 fat_rwlock_unlock(&dp->upcall_rwlock);
5919 }
5920
5921 static void
5922 dpif_netdev_enable_upcall(struct dpif *dpif)
5923 OVS_NO_THREAD_SAFETY_ANALYSIS
5924 {
5925 struct dp_netdev *dp = get_dp_netdev(dpif);
5926 dp_netdev_enable_upcall(dp);
5927 }
5928
5929 static void
5930 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
5931 {
5932 atomic_store_relaxed(&pmd->wait_for_reload, false);
5933 atomic_store_relaxed(&pmd->reload_tx_qid, false);
5934 pmd->last_reload_seq = seq_read(pmd->reload_seq);
5935 atomic_store_explicit(&pmd->reload, false, memory_order_release);
5936 }
5937
5938 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
5939 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
5940 * 'core_id' is NON_PMD_CORE_ID).
5941 *
5942 * Caller must unref the returned reference. */
5943 static struct dp_netdev_pmd_thread *
5944 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
5945 {
5946 struct dp_netdev_pmd_thread *pmd;
5947 const struct cmap_node *pnode;
5948
5949 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
5950 if (!pnode) {
5951 return NULL;
5952 }
5953 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
5954
5955 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
5956 }
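/* Editorial usage sketch for the ref-counted lookup above (only functions
 * defined in this file are used):
 *
 *     struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);
 *
 *     if (pmd) {
 *         ...use 'pmd'; the RCU-protected refcount keeps it alive...
 *         dp_netdev_pmd_unref(pmd);   // drop the reference taken above
 *     }
 */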
5957
5958 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
5959 static void
5960 dp_netdev_set_nonpmd(struct dp_netdev *dp)
5961 OVS_REQUIRES(dp->port_mutex)
5962 {
5963 struct dp_netdev_pmd_thread *non_pmd;
5964
5965 non_pmd = xzalloc(sizeof *non_pmd);
5966 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
5967 }
5968
5969 /* Caller must have valid pointer to 'pmd'. */
5970 static bool
5971 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5972 {
5973 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5974 }
5975
5976 static void
5977 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5978 {
5979 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5980 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5981 }
5982 }
5983
5984 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
5985 * fails, keeps checking for the next node until reaching the end of the cmap.
5986 *
5987 * Caller must unref the returned reference. */
5988 static struct dp_netdev_pmd_thread *
5989 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5990 {
5991 struct dp_netdev_pmd_thread *next;
5992
5993 do {
5994 struct cmap_node *node;
5995
5996 node = cmap_next_position(&dp->poll_threads, pos);
5997 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5998 : NULL;
5999 } while (next && !dp_netdev_pmd_try_ref(next));
6000
6001 return next;
6002 }
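/* Editorial sketch of walking all PMD threads with the helper above.
 * Zero-initializing the cmap position starts the iteration from the
 * beginning:
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ...inspect 'pmd'...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */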
6003
6004 /* Configures the 'pmd' based on the input argument. */
6005 static void
6006 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6007 unsigned core_id, int numa_id)
6008 {
6009 pmd->dp = dp;
6010 pmd->core_id = core_id;
6011 pmd->numa_id = numa_id;
6012 pmd->need_reload = false;
6013 pmd->n_output_batches = 0;
6014
6015 ovs_refcount_init(&pmd->ref_cnt);
6016 atomic_init(&pmd->exit, false);
6017 pmd->reload_seq = seq_create();
6018 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6019 atomic_init(&pmd->reload, false);
6020 ovs_mutex_init(&pmd->flow_mutex);
6021 ovs_mutex_init(&pmd->port_mutex);
6022 cmap_init(&pmd->flow_table);
6023 cmap_init(&pmd->classifiers);
6024 pmd->ctx.last_rxq = NULL;
6025 pmd_thread_ctx_time_update(pmd);
6026 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6027 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6028 hmap_init(&pmd->poll_list);
6029 hmap_init(&pmd->tx_ports);
6030 hmap_init(&pmd->tnl_port_cache);
6031 hmap_init(&pmd->send_port_cache);
6032 /* Initialize the 'flow_cache' since there is no
6033 * actual thread created for NON_PMD_CORE_ID. */
6034 if (core_id == NON_PMD_CORE_ID) {
6035 dfc_cache_init(&pmd->flow_cache);
6036 pmd_alloc_static_tx_qid(pmd);
6037 }
6038 pmd_perf_stats_init(&pmd->perf_stats);
6039 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6040 hash_int(core_id, 0));
6041 }
6042
6043 static void
6044 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6045 {
6046 struct dpcls *cls;
6047
6048 dp_netdev_pmd_flow_flush(pmd);
6049 hmap_destroy(&pmd->send_port_cache);
6050 hmap_destroy(&pmd->tnl_port_cache);
6051 hmap_destroy(&pmd->tx_ports);
6052 hmap_destroy(&pmd->poll_list);
6053 /* All flows (including their dpcls_rules) have been deleted already */
6054 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6055 dpcls_destroy(cls);
6056 ovsrcu_postpone(free, cls);
6057 }
6058 cmap_destroy(&pmd->classifiers);
6059 cmap_destroy(&pmd->flow_table);
6060 ovs_mutex_destroy(&pmd->flow_mutex);
6061 seq_destroy(pmd->reload_seq);
6062 ovs_mutex_destroy(&pmd->port_mutex);
6063 free(pmd);
6064 }
6065
6066 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6067 * and unrefs the struct. */
6068 static void
6069 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6070 {
6071 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6072 * but extra cleanup is necessary */
6073 if (pmd->core_id == NON_PMD_CORE_ID) {
6074 ovs_mutex_lock(&dp->non_pmd_mutex);
6075 dfc_cache_uninit(&pmd->flow_cache);
6076 pmd_free_cached_ports(pmd);
6077 pmd_free_static_tx_qid(pmd);
6078 ovs_mutex_unlock(&dp->non_pmd_mutex);
6079 } else {
6080 atomic_store_relaxed(&pmd->exit, true);
6081 dp_netdev_reload_pmd__(pmd);
6082 xpthread_join(pmd->thread, NULL);
6083 }
6084
6085 dp_netdev_pmd_clear_ports(pmd);
6086
6087 /* Purges the 'pmd''s flows after stopping the thread, but before
6088 * destroying the flows, so that the flow stats can be collected. */
6089 if (dp->dp_purge_cb) {
6090 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6091 }
6092 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6093 dp_netdev_pmd_unref(pmd);
6094 }
6095
6096 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6097 * thread. */
6098 static void
6099 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6100 {
6101 struct dp_netdev_pmd_thread *pmd;
6102 struct dp_netdev_pmd_thread **pmd_list;
6103 size_t k = 0, n_pmds;
6104
6105 n_pmds = cmap_count(&dp->poll_threads);
6106 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6107
6108 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6109 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6110 continue;
6111 }
6112 /* We cannot call dp_netdev_del_pmd(), since it alters
6113 * 'dp->poll_threads' (while we're iterating it) and it
6114 * might quiesce. */
6115 ovs_assert(k < n_pmds);
6116 pmd_list[k++] = pmd;
6117 }
6118
6119 for (size_t i = 0; i < k; i++) {
6120 dp_netdev_del_pmd(dp, pmd_list[i]);
6121 }
6122 free(pmd_list);
6123 }
6124
6125 /* Deletes all rx queues from pmd->poll_list and all the ports from
6126 * pmd->tx_ports. */
6127 static void
6128 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6129 {
6130 struct rxq_poll *poll;
6131 struct tx_port *port;
6132
6133 ovs_mutex_lock(&pmd->port_mutex);
6134 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6135 free(poll);
6136 }
6137 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6138 free(port);
6139 }
6140 ovs_mutex_unlock(&pmd->port_mutex);
6141 }
6142
6143 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6144 static void
6145 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6146 struct dp_netdev_rxq *rxq)
6147 OVS_REQUIRES(pmd->port_mutex)
6148 {
6149 int qid = netdev_rxq_get_queue_id(rxq->rx);
6150 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6151 struct rxq_poll *poll;
6152
6153 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6154 if (poll->rxq == rxq) {
6155 /* 'rxq' is already polled by this thread. Do nothing. */
6156 return;
6157 }
6158 }
6159
6160 poll = xmalloc(sizeof *poll);
6161 poll->rxq = rxq;
6162 hmap_insert(&pmd->poll_list, &poll->node, hash);
6163
6164 pmd->need_reload = true;
6165 }
6166
6167 /* Delete 'poll' from poll_list of PMD thread. */
6168 static void
6169 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6170 struct rxq_poll *poll)
6171 OVS_REQUIRES(pmd->port_mutex)
6172 {
6173 hmap_remove(&pmd->poll_list, &poll->node);
6174 free(poll);
6175
6176 pmd->need_reload = true;
6177 }
6178
6179 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6180 * changes to take effect. */
6181 static void
6182 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6183 struct dp_netdev_port *port)
6184 OVS_REQUIRES(pmd->port_mutex)
6185 {
6186 struct tx_port *tx;
6187
6188 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6189 if (tx) {
6190 /* 'port' is already on this thread tx cache. Do nothing. */
6191 return;
6192 }
6193
6194 tx = xzalloc(sizeof *tx);
6195
6196 tx->port = port;
6197 tx->qid = -1;
6198 tx->flush_time = 0LL;
6199 dp_packet_batch_init(&tx->output_pkts);
6200
6201 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6202 pmd->need_reload = true;
6203 }
6204
6205 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6206 * changes to take effect. */
6207 static void
6208 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6209 struct tx_port *tx)
6210 OVS_REQUIRES(pmd->port_mutex)
6211 {
6212 hmap_remove(&pmd->tx_ports, &tx->node);
6213 free(tx);
6214 pmd->need_reload = true;
6215 }
6216 \f
6217 static char *
6218 dpif_netdev_get_datapath_version(void)
6219 {
6220 return xstrdup("<built-in>");
6221 }
6222
6223 static void
6224 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6225 uint16_t tcp_flags, long long now)
6226 {
6227 uint16_t flags;
6228
6229 atomic_store_relaxed(&netdev_flow->stats.used, now);
6230 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6231 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6232 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6233 flags |= tcp_flags;
6234 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6235 }
6236
6237 static int
6238 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6239 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6240 enum dpif_upcall_type type, const struct nlattr *userdata,
6241 struct ofpbuf *actions, struct ofpbuf *put_actions)
6242 {
6243 struct dp_netdev *dp = pmd->dp;
6244
6245 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6246 return ENODEV;
6247 }
6248
6249 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6250 struct ds ds = DS_EMPTY_INITIALIZER;
6251 char *packet_str;
6252 struct ofpbuf key;
6253 struct odp_flow_key_parms odp_parms = {
6254 .flow = flow,
6255 .mask = wc ? &wc->masks : NULL,
6256 .support = dp_netdev_support,
6257 };
6258
6259 ofpbuf_init(&key, 0);
6260 odp_flow_key_from_flow(&odp_parms, &key);
6261 packet_str = ofp_dp_packet_to_string(packet_);
6262
6263 odp_flow_key_format(key.data, key.size, &ds);
6264
6265 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6266 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6267
6268 ofpbuf_uninit(&key);
6269 free(packet_str);
6270
6271 ds_destroy(&ds);
6272 }
6273
6274 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6275 actions, wc, put_actions, dp->upcall_aux);
6276 }
6277
6278 static inline uint32_t
6279 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6280 const struct miniflow *mf)
6281 {
6282 uint32_t hash;
6283
6284 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6285 hash = dp_packet_get_rss_hash(packet);
6286 } else {
6287 hash = miniflow_hash_5tuple(mf, 0);
6288 dp_packet_set_rss_hash(packet, hash);
6289 }
6290
6291 return hash;
6292 }
6293
6294 static inline uint32_t
6295 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6296 const struct miniflow *mf)
6297 {
6298 uint32_t hash, recirc_depth;
6299
6300 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6301 hash = dp_packet_get_rss_hash(packet);
6302 } else {
6303 hash = miniflow_hash_5tuple(mf, 0);
6304 dp_packet_set_rss_hash(packet, hash);
6305 }
6306
6307 /* The RSS hash must account for the recirculation depth to avoid
6308 * collisions in the exact match cache. */
6309 recirc_depth = *recirc_depth_get_unsafe();
6310 if (OVS_UNLIKELY(recirc_depth)) {
6311 hash = hash_finish(hash, recirc_depth);
6312 dp_packet_set_rss_hash(packet, hash);
6313 }
6314 return hash;
6315 }
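/* Editorial note on the function above: without the extra mixing, a packet
 * recirculated with an unchanged 5-tuple would reuse the same hash at every
 * recirculation depth and alias itself in the EMC.  Because the stored RSS
 * hash is updated each time, successive passes effectively chain:
 *
 *     depth 1:  h1 = hash_finish(h0, 1);
 *     depth 2:  h2 = hash_finish(h1, 2);
 *
 * so each pass gets a distinct cache key while the original (depth 0) hash
 * is left untouched. */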
6316
6317 struct packet_batch_per_flow {
6318 unsigned int byte_count;
6319 uint16_t tcp_flags;
6320 struct dp_netdev_flow *flow;
6321
6322 struct dp_packet_batch array;
6323 };
6324
6325 static inline void
6326 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6327 struct dp_packet *packet,
6328 uint16_t tcp_flags)
6329 {
6330 batch->byte_count += dp_packet_size(packet);
6331 batch->tcp_flags |= tcp_flags;
6332 batch->array.packets[batch->array.count++] = packet;
6333 }
6334
6335 static inline void
6336 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6337 struct dp_netdev_flow *flow)
6338 {
6339 flow->batch = batch;
6340
6341 batch->flow = flow;
6342 dp_packet_batch_init(&batch->array);
6343 batch->byte_count = 0;
6344 batch->tcp_flags = 0;
6345 }
6346
6347 static inline void
6348 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6349 struct dp_netdev_pmd_thread *pmd)
6350 {
6351 struct dp_netdev_actions *actions;
6352 struct dp_netdev_flow *flow = batch->flow;
6353
6354 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
6355 batch->tcp_flags, pmd->ctx.now / 1000);
6356
6357 actions = dp_netdev_flow_get_actions(flow);
6358
6359 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6360 actions->actions, actions->size);
6361 }
6362
6363 static inline void
6364 dp_netdev_queue_batches(struct dp_packet *pkt,
6365 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6366 struct packet_batch_per_flow *batches,
6367 size_t *n_batches)
6368 {
6369 struct packet_batch_per_flow *batch = flow->batch;
6370
6371 if (OVS_UNLIKELY(!batch)) {
6372 batch = &batches[(*n_batches)++];
6373 packet_batch_per_flow_init(batch, flow);
6374 }
6375
6376 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6377 }
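/* Editorial sketch of how the per-flow batches above fill up for packets
 * P0..P3 hitting flows A, B, A, A:
 *
 *     P0 -> A: A->batch == NULL, so batches[0] is initialized for A
 *     P1 -> B: batches[1] is initialized for B
 *     P2, P3 -> A: appended to batches[0]
 *
 * The actions for A then run once on a 3-packet batch instead of three
 * times on single packets.  The 'batch' pointers are reset to NULL before
 * execution; see dp_netdev_input__() below. */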
6378
6379 static inline void
6380 packet_enqueue_to_flow_map(struct dp_packet *packet,
6381 struct dp_netdev_flow *flow,
6382 uint16_t tcp_flags,
6383 struct dp_packet_flow_map *flow_map,
6384 size_t index)
6385 {
6386 struct dp_packet_flow_map *map = &flow_map[index];
6387 map->flow = flow;
6388 map->packet = packet;
6389 map->tcp_flags = tcp_flags;
6390 }
6391
6392 /* SMC lookup function for a batch of packets.
6393 * By batching the SMC lookups, we can use prefetching
6394 * to hide memory access latency.
6395 */
6396 static inline void
6397 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6398 struct netdev_flow_key *keys,
6399 struct netdev_flow_key **missed_keys,
6400 struct dp_packet_batch *packets_,
6401 const int cnt,
6402 struct dp_packet_flow_map *flow_map,
6403 uint8_t *index_map)
6404 {
6405 int i;
6406 struct dp_packet *packet;
6407 size_t n_smc_hit = 0, n_missed = 0;
6408 struct dfc_cache *cache = &pmd->flow_cache;
6409 struct smc_cache *smc_cache = &cache->smc_cache;
6410 const struct cmap_node *flow_node;
6411 int recv_idx;
6412 uint16_t tcp_flags;
6413
6414 /* Prefetch buckets for all packets */
6415 for (i = 0; i < cnt; i++) {
6416 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6417 }
6418
6419 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6420 struct dp_netdev_flow *flow = NULL;
6421 flow_node = smc_entry_get(pmd, keys[i].hash);
6422 bool hit = false;
6423 /* Get the original order of this packet in received batch. */
6424 recv_idx = index_map[i];
6425
6426 if (OVS_LIKELY(flow_node != NULL)) {
6427 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6428 /* Since we don't have a per-port megaflow to check the port
6429 * number, we need to verify that the input ports match. */
6430 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6431 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6432 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6433
6434 /* SMC hit and EMC miss, so insert into the EMC. */
6435 keys[i].len =
6436 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6437 emc_probabilistic_insert(pmd, &keys[i], flow);
6438 /* Add these packets into the flow map in the same order
6439 * as received.
6440 */
6441 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6442 flow_map, recv_idx);
6443 n_smc_hit++;
6444 hit = true;
6445 break;
6446 }
6447 }
6448 if (hit) {
6449 continue;
6450 }
6451 }
6452
6453 /* SMC missed. Group missed packets together at
6454 * the beginning of the 'packets' array. */
6455 dp_packet_batch_refill(packets_, packet, i);
6456
6457 /* Preserve the order of packet for flow batching. */
6458 index_map[n_missed] = recv_idx;
6459
6460 /* Put the missed keys into the pointer array returned to the caller. */
6461 missed_keys[n_missed++] = &keys[i];
6462 }
6463
6464 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6465 }
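/* Editorial note on the batching pattern above: the first loop only issues
 * OVS_PREFETCH() on the SMC bucket of every key, and the second loop does
 * the smc_entry_get() lookups, so by the time a bucket is dereferenced its
 * cache line is likely already in flight.  Hits are promoted into the EMC
 * and queued by their original receive index; misses are compacted to the
 * front of the batch for the dpcls lookup in fast_path_processing(). */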
6466
6467 /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
6468 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
6469 * miniflow is copied into 'keys' and the packet pointer is moved to the
6470 * beginning of the 'packets' array. The pointers of missed keys are put in the
6471 * missed_keys pointer array for future processing.
6472 *
6473 * The function returns the number of packets that need to be processed in the
6474 * 'packets' array (they have been moved to the beginning of the vector).
6475 *
6476 * For performance reasons a caller may choose not to initialize the metadata
6477 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6478 * is not valid and must be initialized by this function using 'port_no'.
6479 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6480 * will be ignored.
6481 */
6482 static inline size_t
6483 dfc_processing(struct dp_netdev_pmd_thread *pmd,
6484 struct dp_packet_batch *packets_,
6485 struct netdev_flow_key *keys,
6486 struct netdev_flow_key **missed_keys,
6487 struct packet_batch_per_flow batches[], size_t *n_batches,
6488 struct dp_packet_flow_map *flow_map,
6489 size_t *n_flows, uint8_t *index_map,
6490 bool md_is_valid, odp_port_t port_no)
6491 {
6492 struct netdev_flow_key *key = &keys[0];
6493 size_t n_missed = 0, n_emc_hit = 0;
6494 struct dfc_cache *cache = &pmd->flow_cache;
6495 struct dp_packet *packet;
6496 const size_t cnt = dp_packet_batch_size(packets_);
6497 uint32_t cur_min = pmd->ctx.emc_insert_min;
6498 int i;
6499 uint16_t tcp_flags;
6500 bool smc_enable_db;
6501 size_t map_cnt = 0;
6502 bool batch_enable = true;
6503
6504 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
6505 pmd_perf_update_counter(&pmd->perf_stats,
6506 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6507 cnt);
6508
6509 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6510 struct dp_netdev_flow *flow;
6511 uint32_t mark;
6512
6513 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6514 dp_packet_delete(packet);
6515 continue;
6516 }
6517
6518 if (i != cnt - 1) {
6519 struct dp_packet **packets = packets_->packets;
6520 /* Prefetch next packet data and metadata. */
6521 OVS_PREFETCH(dp_packet_data(packets[i+1]));
6522 pkt_metadata_prefetch_init(&packets[i+1]->md);
6523 }
6524
6525 if (!md_is_valid) {
6526 pkt_metadata_init(&packet->md, port_no);
6527 }
6528
6529 if ((*recirc_depth_get() == 0) &&
6530 dp_packet_has_flow_mark(packet, &mark)) {
6531 flow = mark_to_flow_find(pmd, mark);
6532 if (OVS_LIKELY(flow)) {
6533 tcp_flags = parse_tcp_flags(packet);
6534 if (OVS_LIKELY(batch_enable)) {
6535 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6536 n_batches);
6537 } else {
6538 /* Flow batching should be performed only after fast-path
6539 * processing is also completed for packets with emc miss
6540 * or else it will result in reordering of packets with
6541 * same datapath flows. */
6542 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6543 flow_map, map_cnt++);
6544 }
6545 continue;
6546 }
6547 }
6548
6549 miniflow_extract(packet, &key->mf);
6550 key->len = 0; /* Not computed yet. */
6551 key->hash =
6552 (md_is_valid == false)
6553 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6554 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6555
6556 /* If EMC is disabled, skip emc_lookup. */
6557 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
6558 if (OVS_LIKELY(flow)) {
6559 tcp_flags = miniflow_get_tcp_flags(&key->mf);
6560 n_emc_hit++;
6561 if (OVS_LIKELY(batch_enable)) {
6562 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6563 n_batches);
6564 } else {
6565 /* Flow batching should be performed only after fast-path
6566 * processing is also completed for packets with emc miss
6567 * or else it will result in reordering of packets with
6568 * same datapath flows. */
6569 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6570 flow_map, map_cnt++);
6571 }
6572 } else {
6573 /* Exact match cache missed. Group missed packets together at
6574 * the beginning of the 'packets' array. */
6575 dp_packet_batch_refill(packets_, packet, i);
6576
6577 /* Preserve the order of packet for flow batching. */
6578 index_map[n_missed] = map_cnt;
6579 flow_map[map_cnt++].flow = NULL;
6580
6581 /* 'key[n_missed]' contains the key of the current packet and it
6582 * will be passed to SMC lookup. The next key should be extracted
6583 * to 'keys[n_missed + 1]'.
6584 * We also maintain a pointer array to the keys that missed both SMC and
6585 * EMC, which will be returned to the caller for further processing. */
6586 missed_keys[n_missed] = key;
6587 key = &keys[++n_missed];
6588
6589 /* Skip batching for subsequent packets to avoid reordering. */
6590 batch_enable = false;
6591 }
6592 }
6593 /* Count of packets which are not flow batched. */
6594 *n_flows = map_cnt;
6595
6596 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6597
6598 if (!smc_enable_db) {
6599 return dp_packet_batch_size(packets_);
6600 }
6601
6602 /* Packets that miss the EMC do a batch lookup in the SMC, if enabled. */
6603 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6604 n_missed, flow_map, index_map);
6605
6606 return dp_packet_batch_size(packets_);
6607 }
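/* Editorial summary of the lookup tiers driven by the function above, per
 * received packet:
 *
 *     flow mark (if present, checked only at recirculation depth 0)
 *       -> EMC  (exact match on the miniflow, if 'cur_min' != 0)
 *         -> SMC (signature match on the key hash, if 'smc_enable_db')
 *           -> dpcls and possible upcall in fast_path_processing()
 *
 * Packets that miss are compacted to the front of 'packets_' and their keys
 * collected in 'missed_keys' so the later stages only touch the misses. */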
6608
6609 static inline int
6610 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6611 struct dp_packet *packet,
6612 const struct netdev_flow_key *key,
6613 struct ofpbuf *actions, struct ofpbuf *put_actions)
6614 {
6615 struct ofpbuf *add_actions;
6616 struct dp_packet_batch b;
6617 struct match match;
6618 ovs_u128 ufid;
6619 int error;
6620 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
6621
6622 match.tun_md.valid = false;
6623 miniflow_expand(&key->mf, &match.flow);
6624
6625 ofpbuf_clear(actions);
6626 ofpbuf_clear(put_actions);
6627
6628 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
6629 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6630 &ufid, DPIF_UC_MISS, NULL, actions,
6631 put_actions);
6632 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6633 dp_packet_delete(packet);
6634 return error;
6635 }
6636
6637 /* The Netlink encoding of datapath flow keys cannot express
6638 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6639 * tag is interpreted as exact match on the fact that there is no
6640 * VLAN. Unless we refactor a lot of code that translates between
6641 * Netlink and struct flow representations, we have to do the same
6642 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
6643 if (!match.wc.masks.vlans[0].tci) {
6644 match.wc.masks.vlans[0].tci = htons(0xffff);
6645 }
6646
6647 /* We can't allow the packet batching in the next loop to execute
6648 * the actions. Otherwise, if there are any slow path actions,
6649 * we'll send the packet up twice. */
6650 dp_packet_batch_init_packet(&b, packet);
6651 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
6652 actions->data, actions->size);
6653
6654 add_actions = put_actions->size ? put_actions : actions;
6655 if (OVS_LIKELY(error != ENOSPC)) {
6656 struct dp_netdev_flow *netdev_flow;
6657
6658 /* XXX: There's a race window where a flow covering this packet
6659 * could have already been installed since we last did the flow
6660 * lookup before upcall. This could be solved by moving the
6661 * mutex lock outside the loop, but that's an awful long time
6662 * to be locking revalidators out of making flow modifications. */
6663 ovs_mutex_lock(&pmd->flow_mutex);
6664 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
6665 if (OVS_LIKELY(!netdev_flow)) {
6666 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6667 add_actions->data,
6668 add_actions->size);
6669 }
6670 ovs_mutex_unlock(&pmd->flow_mutex);
6671 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6672 smc_insert(pmd, key, hash);
6673 emc_probabilistic_insert(pmd, key, netdev_flow);
6674 }
6675 if (pmd_perf_metrics_enabled(pmd)) {
6676 /* Update upcall stats. */
6677 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6678 struct pmd_perf_stats *s = &pmd->perf_stats;
6679 s->current.upcalls++;
6680 s->current.upcall_cycles += cycles;
6681 histogram_add_sample(&s->cycles_per_upcall, cycles);
6682 }
6683 return error;
6684 }
6685
6686 static inline void
6687 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
6688 struct dp_packet_batch *packets_,
6689 struct netdev_flow_key **keys,
6690 struct dp_packet_flow_map *flow_map,
6691 uint8_t *index_map,
6692 odp_port_t in_port)
6693 {
6694 const size_t cnt = dp_packet_batch_size(packets_);
6695 #if !defined(__CHECKER__) && !defined(_WIN32)
6696 const size_t PKT_ARRAY_SIZE = cnt;
6697 #else
6698 /* Sparse or MSVC doesn't like variable length array. */
6699 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6700 #endif
6701 struct dp_packet *packet;
6702 struct dpcls *cls;
6703 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
6704 struct dp_netdev *dp = pmd->dp;
6705 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
6706 int lookup_cnt = 0, add_lookup_cnt;
6707 bool any_miss;
6708
6709 for (size_t i = 0; i < cnt; i++) {
6710 /* Key length is needed in all cases; the hash is computed on demand. */
6711 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
6712 }
6713 /* Get the classifier for the in_port */
6714 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6715 if (OVS_LIKELY(cls)) {
6716 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6717 rules, cnt, &lookup_cnt);
6718 } else {
6719 any_miss = true;
6720 memset(rules, 0, sizeof(rules));
6721 }
6722 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6723 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6724 struct ofpbuf actions, put_actions;
6725
6726 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6727 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6728
6729 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6730 struct dp_netdev_flow *netdev_flow;
6731
6732 if (OVS_LIKELY(rules[i])) {
6733 continue;
6734 }
6735
6736 /* It's possible that an earlier slow path execution installed
6737 * a rule covering this flow. In this case, it's a lot cheaper
6738 * to catch it here than execute a miss. */
6739 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
6740 &add_lookup_cnt);
6741 if (netdev_flow) {
6742 lookup_cnt += add_lookup_cnt;
6743 rules[i] = &netdev_flow->cr;
6744 continue;
6745 }
6746
6747 int error = handle_packet_upcall(pmd, packet, keys[i],
6748 &actions, &put_actions);
6749
6750 if (OVS_UNLIKELY(error)) {
6751 upcall_fail_cnt++;
6752 } else {
6753 upcall_ok_cnt++;
6754 }
6755 }
6756
6757 ofpbuf_uninit(&actions);
6758 ofpbuf_uninit(&put_actions);
6759 fat_rwlock_unlock(&dp->upcall_rwlock);
6760 } else if (OVS_UNLIKELY(any_miss)) {
6761 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6762 if (OVS_UNLIKELY(!rules[i])) {
6763 dp_packet_delete(packet);
6764 upcall_fail_cnt++;
6765 }
6766 }
6767 }
6768
6769 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6770 struct dp_netdev_flow *flow;
6771 /* Get the original order of this packet in received batch. */
6772 int recv_idx = index_map[i];
6773 uint16_t tcp_flags;
6774
6775 if (OVS_UNLIKELY(!rules[i])) {
6776 continue;
6777 }
6778
6779 flow = dp_netdev_flow_cast(rules[i]);
6780 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
6781 smc_insert(pmd, keys[i], hash);
6782
6783 emc_probabilistic_insert(pmd, keys[i], flow);
6784 /* Add these packets into the flow map in the same order
6785 * as received.
6786 */
6787 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6788 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6789 flow_map, recv_idx);
6790 }
6791
6792 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6793 cnt - upcall_ok_cnt - upcall_fail_cnt);
6794 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6795 lookup_cnt);
6796 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6797 upcall_ok_cnt);
6798 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6799 upcall_fail_cnt);
6800 }
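/* Editorial note on the counters above, with made-up numbers: for a
 * 32-packet miss batch in which 28 packets match a dpcls rule, 3 upcalls
 * succeed and 1 fails, the function records:
 *
 *     MASKED_HIT    = 32 - 3 - 1 = 28
 *     MASKED_LOOKUP = whatever 'lookup_cnt' accumulated from the dpcls and
 *                     flow-table lookups
 *     MISS          = 3   (successful upcalls)
 *     LOST          = 1   (failed upcalls; those packets were deleted)
 */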
6801
6802 /* Packets enter the datapath from a port (or from recirculation) here.
6803 *
6804 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
6805 * When false the metadata in 'packets' need to be initialized. */
6806 static void
6807 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
6808 struct dp_packet_batch *packets,
6809 bool md_is_valid, odp_port_t port_no)
6810 {
6811 #if !defined(__CHECKER__) && !defined(_WIN32)
6812 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
6813 #else
6814 /* Sparse or MSVC doesn't like variable length array. */
6815 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6816 #endif
6817 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6818 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
6819 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
6820 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
6821 size_t n_batches;
6822 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6823 uint8_t index_map[PKT_ARRAY_SIZE];
6824 size_t n_flows, i;
6825
6826 odp_port_t in_port;
6827
6828 n_batches = 0;
6829 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
6830 flow_map, &n_flows, index_map, md_is_valid, port_no);
6831
6832 if (!dp_packet_batch_is_empty(packets)) {
6833 /* Get ingress port from first packet's metadata. */
6834 in_port = packets->packets[0]->md.in_port.odp_port;
6835 fast_path_processing(pmd, packets, missed_keys,
6836 flow_map, index_map, in_port);
6837 }
6838
6839 /* Batch rest of packets which are in flow map. */
6840 for (i = 0; i < n_flows; i++) {
6841 struct dp_packet_flow_map *map = &flow_map[i];
6842
6843 if (OVS_UNLIKELY(!map->flow)) {
6844 continue;
6845 }
6846 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6847 batches, &n_batches);
6848 }
6849
6850 /* All the flow batches need to be reset before any call to
6851 * packet_batch_per_flow_execute() as it could potentially trigger
6852 * recirculation. When a packet matching flow ‘j’ happens to be
6853 * recirculated, the nested call to dp_netdev_input__() could potentially
6854 * classify the packet as matching another flow - say 'k'. It could happen
6855 * that in the previous call to dp_netdev_input__() that same flow 'k' had
6856 * already its own batches[k] still waiting to be served. So if its
6857 * ‘batch’ member is not reset, the recirculated packet would be wrongly
6858 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
6859 for (i = 0; i < n_batches; i++) {
6860 batches[i].flow->batch = NULL;
6861 }
6862
6863 for (i = 0; i < n_batches; i++) {
6864 packet_batch_per_flow_execute(&batches[i], pmd);
6865 }
6866 }
6867
6868 static void
6869 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
6870 struct dp_packet_batch *packets,
6871 odp_port_t port_no)
6872 {
6873 dp_netdev_input__(pmd, packets, false, port_no);
6874 }
6875
6876 static void
6877 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
6878 struct dp_packet_batch *packets)
6879 {
6880 dp_netdev_input__(pmd, packets, true, 0);
6881 }
6882
6883 struct dp_netdev_execute_aux {
6884 struct dp_netdev_pmd_thread *pmd;
6885 const struct flow *flow;
6886 };
6887
6888 static void
6889 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6890 void *aux)
6891 {
6892 struct dp_netdev *dp = get_dp_netdev(dpif);
6893 dp->dp_purge_aux = aux;
6894 dp->dp_purge_cb = cb;
6895 }
6896
6897 static void
6898 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6899 void *aux)
6900 {
6901 struct dp_netdev *dp = get_dp_netdev(dpif);
6902 dp->upcall_aux = aux;
6903 dp->upcall_cb = cb;
6904 }
6905
6906 static void
6907 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
6908 bool purge)
6909 {
6910 struct tx_port *tx;
6911 struct dp_netdev_port *port;
6912 long long interval;
6913
6914 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
6915 if (!tx->port->dynamic_txqs) {
6916 continue;
6917 }
6918 interval = pmd->ctx.now - tx->last_used;
6919 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
6920 port = tx->port;
6921 ovs_mutex_lock(&port->txq_used_mutex);
6922 port->txq_used[tx->qid]--;
6923 ovs_mutex_unlock(&port->txq_used_mutex);
6924 tx->qid = -1;
6925 }
6926 }
6927 }
6928
6929 static int
6930 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
6931 struct tx_port *tx)
6932 {
6933 struct dp_netdev_port *port;
6934 long long interval;
6935 int i, min_cnt, min_qid;
6936
6937 interval = pmd->ctx.now - tx->last_used;
6938 tx->last_used = pmd->ctx.now;
6939
6940 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
6941 return tx->qid;
6942 }
6943
6944 port = tx->port;
6945
6946 ovs_mutex_lock(&port->txq_used_mutex);
6947 if (tx->qid >= 0) {
6948 port->txq_used[tx->qid]--;
6949 tx->qid = -1;
6950 }
6951
6952 min_cnt = -1;
6953 min_qid = 0;
6954 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
6955 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
6956 min_cnt = port->txq_used[i];
6957 min_qid = i;
6958 }
6959 }
6960
6961 port->txq_used[min_qid]++;
6962 tx->qid = min_qid;
6963
6964 ovs_mutex_unlock(&port->txq_used_mutex);
6965
6966 dpif_netdev_xps_revalidate_pmd(pmd, false);
6967
6968 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
6969 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
6970 return min_qid;
6971 }
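/* Editorial example of the XPS selection above, with made-up numbers: for a
 * port with 3 txqs and txq_used[] = {2, 0, 1}, a PMD whose cached qid has
 * expired (or was never assigned, qid == -1) picks qid 1 and bumps
 * txq_used[1] to 1.  It keeps reusing that qid until this PMD leaves the
 * port unused for XPS_TIMEOUT, after which dpif_netdev_xps_revalidate_pmd()
 * returns it to the pool. */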
6972
6973 static struct tx_port *
6974 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6975 odp_port_t port_no)
6976 {
6977 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
6978 }
6979
6980 static struct tx_port *
6981 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6982 odp_port_t port_no)
6983 {
6984 return tx_port_lookup(&pmd->send_port_cache, port_no);
6985 }
6986
6987 static int
6988 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
6989 const struct nlattr *attr,
6990 struct dp_packet_batch *batch)
6991 {
6992 struct tx_port *tun_port;
6993 const struct ovs_action_push_tnl *data;
6994 int err;
6995
6996 data = nl_attr_get(attr);
6997
6998 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
6999 if (!tun_port) {
7000 err = -EINVAL;
7001 goto error;
7002 }
7003 err = netdev_push_header(tun_port->port->netdev, batch, data);
7004 if (!err) {
7005 return 0;
7006 }
7007 error:
7008 dp_packet_delete_batch(batch, true);
7009 return err;
7010 }
7011
7012 static void
7013 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7014 struct dp_packet *packet, bool should_steal,
7015 struct flow *flow, ovs_u128 *ufid,
7016 struct ofpbuf *actions,
7017 const struct nlattr *userdata)
7018 {
7019 struct dp_packet_batch b;
7020 int error;
7021
7022 ofpbuf_clear(actions);
7023
7024 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7025 DPIF_UC_ACTION, userdata, actions,
7026 NULL);
7027 if (!error || error == ENOSPC) {
7028 dp_packet_batch_init_packet(&b, packet);
7029 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7030 actions->data, actions->size);
7031 } else if (should_steal) {
7032 dp_packet_delete(packet);
7033 }
7034 }
7035
7036 static void
7037 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7038 const struct nlattr *a, bool should_steal)
7039 OVS_NO_THREAD_SAFETY_ANALYSIS
7040 {
7041 struct dp_netdev_execute_aux *aux = aux_;
7042 uint32_t *depth = recirc_depth_get();
7043 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7044 struct dp_netdev *dp = pmd->dp;
7045 int type = nl_attr_type(a);
7046 struct tx_port *p;
7047
7048 switch ((enum ovs_action_attr)type) {
7049 case OVS_ACTION_ATTR_OUTPUT:
7050 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
7051 if (OVS_LIKELY(p)) {
7052 struct dp_packet *packet;
7053 struct dp_packet_batch out;
7054
7055 if (!should_steal) {
7056 dp_packet_batch_clone(&out, packets_);
7057 dp_packet_batch_reset_cutlen(packets_);
7058 packets_ = &out;
7059 }
7060 dp_packet_batch_apply_cutlen(packets_);
7061
7062 #ifdef DPDK_NETDEV
7063 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7064 && packets_->packets[0]->source
7065 != p->output_pkts.packets[0]->source)) {
7066 /* XXX: netdev-dpdk assumes that all packets in a single
7067 * output batch have the same source. Flush here to
7068 * avoid memory access issues. */
7069 dp_netdev_pmd_flush_output_on_port(pmd, p);
7070 }
7071 #endif
7072 if (dp_packet_batch_size(&p->output_pkts)
7073 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7074 /* Flush here to avoid overflow. */
7075 dp_netdev_pmd_flush_output_on_port(pmd, p);
7076 }
7077
7078 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7079 pmd->n_output_batches++;
7080 }
7081
7082 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7083 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7084 pmd->ctx.last_rxq;
7085 dp_packet_batch_add(&p->output_pkts, packet);
7086 }
7087 return;
7088 }
7089 break;
7090
7091 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7092 if (should_steal) {
7093 /* We're requested to push tunnel header, but also we need to take
7094 * the ownership of these packets. Thus, we can avoid performing
7095 * the action, because the caller will not use the result anyway.
7096 * Just break to free the batch. */
7097 break;
7098 }
7099 dp_packet_batch_apply_cutlen(packets_);
7100 push_tnl_action(pmd, a, packets_);
7101 return;
7102
7103 case OVS_ACTION_ATTR_TUNNEL_POP:
7104 if (*depth < MAX_RECIRC_DEPTH) {
7105 struct dp_packet_batch *orig_packets_ = packets_;
7106 odp_port_t portno = nl_attr_get_odp_port(a);
7107
7108 p = pmd_tnl_port_cache_lookup(pmd, portno);
7109 if (p) {
7110 struct dp_packet_batch tnl_pkt;
7111
7112 if (!should_steal) {
7113 dp_packet_batch_clone(&tnl_pkt, packets_);
7114 packets_ = &tnl_pkt;
7115 dp_packet_batch_reset_cutlen(orig_packets_);
7116 }
7117
7118 dp_packet_batch_apply_cutlen(packets_);
7119
7120 netdev_pop_header(p->port->netdev, packets_);
7121 if (dp_packet_batch_is_empty(packets_)) {
7122 return;
7123 }
7124
7125 struct dp_packet *packet;
7126 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7127 packet->md.in_port.odp_port = portno;
7128 }
7129
7130 (*depth)++;
7131 dp_netdev_recirculate(pmd, packets_);
7132 (*depth)--;
7133 return;
7134 }
7135 }
7136 break;
7137
7138 case OVS_ACTION_ATTR_USERSPACE:
7139 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7140 struct dp_packet_batch *orig_packets_ = packets_;
7141 const struct nlattr *userdata;
7142 struct dp_packet_batch usr_pkt;
7143 struct ofpbuf actions;
7144 struct flow flow;
7145 ovs_u128 ufid;
7146 bool clone = false;
7147
7148 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7149 ofpbuf_init(&actions, 0);
7150
7151 if (packets_->trunc) {
7152 if (!should_steal) {
7153 dp_packet_batch_clone(&usr_pkt, packets_);
7154 packets_ = &usr_pkt;
7155 clone = true;
7156 dp_packet_batch_reset_cutlen(orig_packets_);
7157 }
7158
7159 dp_packet_batch_apply_cutlen(packets_);
7160 }
7161
7162 struct dp_packet *packet;
7163 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7164 flow_extract(packet, &flow);
7165 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
7166 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7167 &ufid, &actions, userdata);
7168 }
7169
7170 if (clone) {
7171 dp_packet_delete_batch(packets_, true);
7172 }
7173
7174 ofpbuf_uninit(&actions);
7175 fat_rwlock_unlock(&dp->upcall_rwlock);
7176
7177 return;
7178 }
7179 break;
7180
7181 case OVS_ACTION_ATTR_RECIRC:
7182 if (*depth < MAX_RECIRC_DEPTH) {
7183 struct dp_packet_batch recirc_pkts;
7184
7185 if (!should_steal) {
7186 dp_packet_batch_clone(&recirc_pkts, packets_);
7187 packets_ = &recirc_pkts;
7188 }
7189
7190 struct dp_packet *packet;
7191 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7192 packet->md.recirc_id = nl_attr_get_u32(a);
7193 }
7194
7195 (*depth)++;
7196 dp_netdev_recirculate(pmd, packets_);
7197 (*depth)--;
7198
7199 return;
7200 }
7201
7202 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7203 break;
7204
7205 case OVS_ACTION_ATTR_CT: {
7206 const struct nlattr *b;
7207 bool force = false;
7208 bool commit = false;
7209 unsigned int left;
7210 uint16_t zone = 0;
7211 const char *helper = NULL;
7212 const uint32_t *setmark = NULL;
7213 const struct ovs_key_ct_labels *setlabel = NULL;
7214 struct nat_action_info_t nat_action_info;
7215 struct nat_action_info_t *nat_action_info_ref = NULL;
7216 bool nat_config = false;
7217
7218 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7219 nl_attr_get_size(a)) {
7220 enum ovs_ct_attr sub_type = nl_attr_type(b);
7221
7222 switch(sub_type) {
7223 case OVS_CT_ATTR_FORCE_COMMIT:
7224 force = true;
7225 /* fall through. */
7226 case OVS_CT_ATTR_COMMIT:
7227 commit = true;
7228 break;
7229 case OVS_CT_ATTR_ZONE:
7230 zone = nl_attr_get_u16(b);
7231 break;
7232 case OVS_CT_ATTR_HELPER:
7233 helper = nl_attr_get_string(b);
7234 break;
7235 case OVS_CT_ATTR_MARK:
7236 setmark = nl_attr_get(b);
7237 break;
7238 case OVS_CT_ATTR_LABELS:
7239 setlabel = nl_attr_get(b);
7240 break;
7241 case OVS_CT_ATTR_EVENTMASK:
7242 /* Silently ignored, as userspace datapath does not generate
7243 * netlink events. */
7244 break;
7245 case OVS_CT_ATTR_NAT: {
7246 const struct nlattr *b_nest;
7247 unsigned int left_nest;
7248 bool ip_min_specified = false;
7249 bool proto_num_min_specified = false;
7250 bool ip_max_specified = false;
7251 bool proto_num_max_specified = false;
7252 memset(&nat_action_info, 0, sizeof nat_action_info);
7253 nat_action_info_ref = &nat_action_info;
7254
7255 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7256 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7257
7258 switch (sub_type_nest) {
7259 case OVS_NAT_ATTR_SRC:
7260 case OVS_NAT_ATTR_DST:
7261 nat_config = true;
7262 nat_action_info.nat_action |=
7263 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7264 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7265 break;
7266 case OVS_NAT_ATTR_IP_MIN:
7267 memcpy(&nat_action_info.min_addr,
7268 nl_attr_get(b_nest),
7269 nl_attr_get_size(b_nest));
7270 ip_min_specified = true;
7271 break;
7272 case OVS_NAT_ATTR_IP_MAX:
7273 memcpy(&nat_action_info.max_addr,
7274 nl_attr_get(b_nest),
7275 nl_attr_get_size(b_nest));
7276 ip_max_specified = true;
7277 break;
7278 case OVS_NAT_ATTR_PROTO_MIN:
7279 nat_action_info.min_port =
7280 nl_attr_get_u16(b_nest);
7281 proto_num_min_specified = true;
7282 break;
7283 case OVS_NAT_ATTR_PROTO_MAX:
7284 nat_action_info.max_port =
7285 nl_attr_get_u16(b_nest);
7286 proto_num_max_specified = true;
7287 break;
7288 case OVS_NAT_ATTR_PERSISTENT:
7289 case OVS_NAT_ATTR_PROTO_HASH:
7290 case OVS_NAT_ATTR_PROTO_RANDOM:
7291 break;
7292 case OVS_NAT_ATTR_UNSPEC:
7293 case __OVS_NAT_ATTR_MAX:
7294 OVS_NOT_REACHED();
7295 }
7296 }
7297
7298 if (ip_min_specified && !ip_max_specified) {
7299 nat_action_info.max_addr = nat_action_info.min_addr;
7300 }
7301 if (proto_num_min_specified && !proto_num_max_specified) {
7302 nat_action_info.max_port = nat_action_info.min_port;
7303 }
7304 if (proto_num_min_specified || proto_num_max_specified) {
7305 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7306 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7307 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7308 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7309 }
7310 }
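/* Editor's note (illustrative example, not from the original source): a
 * datapath action such as ct(commit,nat(src=10.1.1.1)) carries only
 * OVS_NAT_ATTR_SRC and OVS_NAT_ATTR_IP_MIN, so the fixups above copy
 * min_addr into max_addr and leave the port range and port flags unset. */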
7311 break;
7312 }
7313 case OVS_CT_ATTR_UNSPEC:
7314 case __OVS_CT_ATTR_MAX:
7315 OVS_NOT_REACHED();
7316 }
7317 }
7318
7319 /* NAT without a commit will not work properly, hence complain
7320 * loudly. */
7321 if (nat_config && !commit) {
7322 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7323 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7324 }
7325
7326 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7327 commit, zone, setmark, setlabel, aux->flow->tp_src,
7328 aux->flow->tp_dst, helper, nat_action_info_ref,
7329 pmd->ctx.now / 1000);
7330 break;
7331 }
7332
7333 case OVS_ACTION_ATTR_METER:
7334 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7335 pmd->ctx.now);
7336 break;
7337
7338 case OVS_ACTION_ATTR_PUSH_VLAN:
7339 case OVS_ACTION_ATTR_POP_VLAN:
7340 case OVS_ACTION_ATTR_PUSH_MPLS:
7341 case OVS_ACTION_ATTR_POP_MPLS:
7342 case OVS_ACTION_ATTR_SET:
7343 case OVS_ACTION_ATTR_SET_MASKED:
7344 case OVS_ACTION_ATTR_SAMPLE:
7345 case OVS_ACTION_ATTR_HASH:
7346 case OVS_ACTION_ATTR_UNSPEC:
7347 case OVS_ACTION_ATTR_TRUNC:
7348 case OVS_ACTION_ATTR_PUSH_ETH:
7349 case OVS_ACTION_ATTR_POP_ETH:
7350 case OVS_ACTION_ATTR_CLONE:
7351 case OVS_ACTION_ATTR_PUSH_NSH:
7352 case OVS_ACTION_ATTR_POP_NSH:
7353 case OVS_ACTION_ATTR_CT_CLEAR:
7354 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7355 case __OVS_ACTION_ATTR_MAX:
7356 OVS_NOT_REACHED();
7357 }
7358
7359 dp_packet_delete_batch(packets_, should_steal);
7360 }
7361
7362 static void
7363 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7364 struct dp_packet_batch *packets,
7365 bool should_steal, const struct flow *flow,
7366 const struct nlattr *actions, size_t actions_len)
7367 {
7368 struct dp_netdev_execute_aux aux = { pmd, flow };
7369
7370 odp_execute_actions(&aux, packets, should_steal, actions,
7371 actions_len, dp_execute_cb);
7372 }
7373
7374 struct dp_netdev_ct_dump {
7375 struct ct_dpif_dump_state up;
7376 struct conntrack_dump dump;
7377 struct conntrack *ct;
7378 struct dp_netdev *dp;
7379 };
7380
7381 static int
7382 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
7383 const uint16_t *pzone, int *ptot_bkts)
7384 {
7385 struct dp_netdev *dp = get_dp_netdev(dpif);
7386 struct dp_netdev_ct_dump *dump;
7387
7388 dump = xzalloc(sizeof *dump);
7389 dump->dp = dp;
7390 dump->ct = dp->conntrack;
7391
7392 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
7393
7394 *dump_ = &dump->up;
7395
7396 return 0;
7397 }
7398
7399 static int
7400 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7401 struct ct_dpif_dump_state *dump_,
7402 struct ct_dpif_entry *entry)
7403 {
7404 struct dp_netdev_ct_dump *dump;
7405
7406 INIT_CONTAINER(dump, dump_, up);
7407
7408 return conntrack_dump_next(&dump->dump, entry);
7409 }
7410
7411 static int
7412 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7413 struct ct_dpif_dump_state *dump_)
7414 {
7415 struct dp_netdev_ct_dump *dump;
7416 int err;
7417
7418 INIT_CONTAINER(dump, dump_, up);
7419
7420 err = conntrack_dump_done(&dump->dump);
7421
7422 free(dump);
7423
7424 return err;
7425 }
7426
7427 static int
7428 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7429 const struct ct_dpif_tuple *tuple)
7430 {
7431 struct dp_netdev *dp = get_dp_netdev(dpif);
7432
7433 if (tuple) {
7434 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
7435 }
7436 return conntrack_flush(dp->conntrack, zone);
7437 }
7438
7439 static int
7440 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7441 {
7442 struct dp_netdev *dp = get_dp_netdev(dpif);
7443
7444 return conntrack_set_maxconns(dp->conntrack, maxconns);
7445 }
7446
7447 static int
7448 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7449 {
7450 struct dp_netdev *dp = get_dp_netdev(dpif);
7451
7452 return conntrack_get_maxconns(dp->conntrack, maxconns);
7453 }
7454
7455 static int
7456 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7457 {
7458 struct dp_netdev *dp = get_dp_netdev(dpif);
7459
7460 return conntrack_get_nconns(dp->conntrack, nconns);
7461 }
7462
7463 static int
7464 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
7465 {
7466 struct dp_netdev *dp = get_dp_netdev(dpif);
7467 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
7468 }
7469
7470 static int
7471 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
7472 {
7473 struct dp_netdev *dp = get_dp_netdev(dpif);
7474 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
7475 }
7476
7477 static int
7478 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
7479 {
7480 struct dp_netdev *dp = get_dp_netdev(dpif);
7481 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
7482 }
7483
7484 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
7485 * diverge. */
7486 static int
7487 dpif_netdev_ipf_get_status(struct dpif *dpif,
7488 struct dpif_ipf_status *dpif_ipf_status)
7489 {
7490 struct dp_netdev *dp = get_dp_netdev(dpif);
7491 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
7492 (struct ipf_status *) dpif_ipf_status);
7493 return 0;
7494 }
7495
7496 static int
7497 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
7498 struct ipf_dump_ctx **ipf_dump_ctx)
7499 {
7500 return ipf_dump_start(ipf_dump_ctx);
7501 }
7502
7503 static int
7504 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
7505 {
7506 struct dp_netdev *dp = get_dp_netdev(dpif);
7507 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
7508 dump);
7509 }
7510
7511 static int
7512 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
7513 {
7514 return ipf_dump_done(ipf_dump_ctx);
7515
7516 }
7517
7518 const struct dpif_class dpif_netdev_class = {
7519 "netdev",
7520 true, /* cleanup_required */
7521 dpif_netdev_init,
7522 dpif_netdev_enumerate,
7523 dpif_netdev_port_open_type,
7524 dpif_netdev_open,
7525 dpif_netdev_close,
7526 dpif_netdev_destroy,
7527 dpif_netdev_run,
7528 dpif_netdev_wait,
7529 dpif_netdev_get_stats,
7530 dpif_netdev_port_add,
7531 dpif_netdev_port_del,
7532 dpif_netdev_port_set_config,
7533 dpif_netdev_port_query_by_number,
7534 dpif_netdev_port_query_by_name,
7535 NULL, /* port_get_pid */
7536 dpif_netdev_port_dump_start,
7537 dpif_netdev_port_dump_next,
7538 dpif_netdev_port_dump_done,
7539 dpif_netdev_port_poll,
7540 dpif_netdev_port_poll_wait,
7541 dpif_netdev_flow_flush,
7542 dpif_netdev_flow_dump_create,
7543 dpif_netdev_flow_dump_destroy,
7544 dpif_netdev_flow_dump_thread_create,
7545 dpif_netdev_flow_dump_thread_destroy,
7546 dpif_netdev_flow_dump_next,
7547 dpif_netdev_operate,
7548 NULL, /* recv_set */
7549 NULL, /* handlers_set */
7550 dpif_netdev_set_config,
7551 dpif_netdev_queue_to_priority,
7552 NULL, /* recv */
7553 NULL, /* recv_wait */
7554 NULL, /* recv_purge */
7555 dpif_netdev_register_dp_purge_cb,
7556 dpif_netdev_register_upcall_cb,
7557 dpif_netdev_enable_upcall,
7558 dpif_netdev_disable_upcall,
7559 dpif_netdev_get_datapath_version,
7560 dpif_netdev_ct_dump_start,
7561 dpif_netdev_ct_dump_next,
7562 dpif_netdev_ct_dump_done,
7563 dpif_netdev_ct_flush,
7564 dpif_netdev_ct_set_maxconns,
7565 dpif_netdev_ct_get_maxconns,
7566 dpif_netdev_ct_get_nconns,
7567 NULL, /* ct_set_limits */
7568 NULL, /* ct_get_limits */
7569 NULL, /* ct_del_limits */
7570 dpif_netdev_ipf_set_enabled,
7571 dpif_netdev_ipf_set_min_frag,
7572 dpif_netdev_ipf_set_max_nfrags,
7573 dpif_netdev_ipf_get_status,
7574 dpif_netdev_ipf_dump_start,
7575 dpif_netdev_ipf_dump_next,
7576 dpif_netdev_ipf_dump_done,
7577 dpif_netdev_meter_get_features,
7578 dpif_netdev_meter_set,
7579 dpif_netdev_meter_get,
7580 dpif_netdev_meter_del,
7581 };
7582
7583 static void
7584 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
7585 const char *argv[], void *aux OVS_UNUSED)
7586 {
7587 struct dp_netdev_port *port;
7588 struct dp_netdev *dp;
7589 odp_port_t port_no;
7590
7591 ovs_mutex_lock(&dp_netdev_mutex);
7592 dp = shash_find_data(&dp_netdevs, argv[1]);
7593 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
7594 ovs_mutex_unlock(&dp_netdev_mutex);
7595 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
7596 return;
7597 }
7598 ovs_refcount_ref(&dp->ref_cnt);
7599 ovs_mutex_unlock(&dp_netdev_mutex);
7600
7601 ovs_mutex_lock(&dp->port_mutex);
7602 if (get_port_by_name(dp, argv[2], &port)) {
7603 unixctl_command_reply_error(conn, "unknown port");
7604 goto exit;
7605 }
7606
7607 port_no = u32_to_odp(atoi(argv[3]));
7608 if (!port_no || port_no == ODPP_NONE) {
7609 unixctl_command_reply_error(conn, "bad port number");
7610 goto exit;
7611 }
7612 if (dp_netdev_lookup_port(dp, port_no)) {
7613 unixctl_command_reply_error(conn, "port number already in use");
7614 goto exit;
7615 }
7616
7617 /* Remove port. */
7618 hmap_remove(&dp->ports, &port->node);
7619 reconfigure_datapath(dp);
7620
7621 /* Reinsert with new port number. */
7622 port->port_no = port_no;
7623 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
7624 reconfigure_datapath(dp);
7625
7626 seq_change(dp->port_seq);
7627 unixctl_command_reply(conn, NULL);
7628
7629 exit:
7630 ovs_mutex_unlock(&dp->port_mutex);
7631 dp_netdev_unref(dp);
7632 }
7633
7634 static void
7635 dpif_dummy_register__(const char *type)
7636 {
7637 struct dpif_class *class;
7638
7639 class = xmalloc(sizeof *class);
7640 *class = dpif_netdev_class;
7641 class->type = xstrdup(type);
7642 dp_register_provider(class);
7643 }
7644
7645 static void
7646 dpif_dummy_override(const char *type)
7647 {
7648 int error;
7649
7650 /*
7651 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
7652 * a userland-only build. This is useful for the testsuite.
7653 */
7654 error = dp_unregister_provider(type);
7655 if (error == 0 || error == EAFNOSUPPORT) {
7656 dpif_dummy_register__(type);
7657 }
7658 }
7659
7660 void
7661 dpif_dummy_register(enum dummy_level level)
7662 {
7663 if (level == DUMMY_OVERRIDE_ALL) {
7664 struct sset types;
7665 const char *type;
7666
7667 sset_init(&types);
7668 dp_enumerate_types(&types);
7669 SSET_FOR_EACH (type, &types) {
7670 dpif_dummy_override(type);
7671 }
7672 sset_destroy(&types);
7673 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7674 dpif_dummy_override("system");
7675 }
7676
7677 dpif_dummy_register__("dummy");
7678
7679 unixctl_command_register("dpif-dummy/change-port-number",
7680 "dp port new-number",
7681 3, 3, dpif_dummy_change_port_number, NULL);
7682 }
7683 \f
7684 /* Datapath Classifier. */
7685
7686 /* Forward declaration for lookup_func typedef. */
7687 struct dpcls_subtable;
7688
7689 /* Lookup function for a subtable in the dpcls. This function is called
7690 * for each subtable with an array of flow keys and a bitmask of the keys
7691 * to perform the lookup on. Using a function pointer gives flexibility to
7692 * optimize the lookup function based on subtable properties and the
7693 * CPU instruction set available at runtime.
7694 */
7695 typedef
7696 uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable *subtable,
7697 uint32_t keys_map,
7698 const struct netdev_flow_key *keys[],
7699 struct dpcls_rule **rules);
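/* Editor's sketch (hypothetical, not part of this file): an alternative
 * implementation only has to match the typedef above; it receives the
 * subtable, a bitmask of the keys to look up and the key array, fills
 * 'rules' for each match, and returns the bitmask of keys for which a rule
 * was found. A trivial conforming variant could simply delegate to the
 * generic code:
 *
 *   static uint32_t
 *   dpcls_subtable_lookup_wrapper(struct dpcls_subtable *subtable,
 *                                 uint32_t keys_map,
 *                                 const struct netdev_flow_key *keys[],
 *                                 struct dpcls_rule **rules)
 *   {
 *       return dpcls_subtable_lookup_generic(subtable, keys_map,
 *                                            keys, rules);
 *   }
 *
 * A real specialization would instead exploit a known subtable property or
 * a CPU feature detected at runtime. */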
7700
7701 /* Prototype for the generic lookup function, using the same code path as before. */
7702 uint32_t
7703 dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
7704 uint32_t keys_map,
7705 const struct netdev_flow_key *keys[],
7706 struct dpcls_rule **rules);
7707
7708 /* A set of rules that all have the same fields wildcarded. */
7709 struct dpcls_subtable {
7710 /* These fields are only used by writers. */
7711 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
7712
7713 /* These fields are accessed by readers. */
7714 struct cmap rules; /* Contains "struct dpcls_rule"s. */
7715 uint32_t hit_cnt; /* Number of match hits in subtable in current
7716 optimization interval. */
7717
7718 /* The lookup function to use for this subtable. If there is a known
7719 * property of the subtable (e.g., only 3 bits of miniflow metadata are
7720 * used for the lookup), then this can point at an optimized version of
7721 * the lookup function for this particular subtable. */
7722 dpcls_subtable_lookup_func lookup_func;
7723
7724 struct netdev_flow_key mask; /* Wildcards for fields (const). */
7725 /* 'mask' must be the last field, additional space is allocated here. */
7726 };
7727
7728 static void
7729 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
7730 {
7731 cmap_destroy(&subtable->rules);
7732 ovsrcu_postpone(free, subtable);
7733 }
7734
7735 /* Initializes 'cls' as a classifier that initially contains no classification
7736 * rules. */
7737 static void
7738 dpcls_init(struct dpcls *cls)
7739 {
7740 cmap_init(&cls->subtables_map);
7741 pvector_init(&cls->subtables);
7742 }
7743
7744 static void
7745 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7746 {
7747 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
7748 pvector_remove(&cls->subtables, subtable);
7749 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7750 subtable->mask.hash);
7751 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
7752 }
7753
7754 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
7755 * caller's responsibility.
7756 * May only be called after all the readers have been terminated. */
7757 static void
7758 dpcls_destroy(struct dpcls *cls)
7759 {
7760 if (cls) {
7761 struct dpcls_subtable *subtable;
7762
7763 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
7764 ovs_assert(cmap_count(&subtable->rules) == 0);
7765 dpcls_destroy_subtable(cls, subtable);
7766 }
7767 cmap_destroy(&cls->subtables_map);
7768 pvector_destroy(&cls->subtables);
7769 }
7770 }
7771
7772 static struct dpcls_subtable *
7773 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7774 {
7775 struct dpcls_subtable *subtable;
7776
7777 /* Need to add one. */
7778 subtable = xmalloc(sizeof *subtable
7779 - sizeof subtable->mask.mf + mask->len);
7780 cmap_init(&subtable->rules);
7781 subtable->hit_cnt = 0;
7782 netdev_flow_key_clone(&subtable->mask, mask);
7783
7784 /* Decide which hash/lookup/verify function to use. */
7785 subtable->lookup_func = dpcls_subtable_lookup_generic;
7786
7787 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
7788 /* Add the new subtable at the end of the pvector (with no hits yet). */
7789 pvector_insert(&cls->subtables, subtable, 0);
7790 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
7791 cmap_count(&cls->subtables_map), subtable, cls->in_port);
7792 pvector_publish(&cls->subtables);
7793
7794 return subtable;
7795 }
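/* Editor's sketch (hypothetical helper, not part of this file): the single
 * assignment of 'lookup_func' above is the natural place to pick a
 * specialized implementation, e.g. based on how many 64-bit units the
 * subtable mask uses:
 *
 *   static dpcls_subtable_lookup_func
 *   dpcls_subtable_choose_lookup(const struct netdev_flow_key *mask)
 *   {
 *       size_t n_units = miniflow_n_values(&mask->mf);
 *
 *       if (n_units <= 2) {
 *           return dpcls_subtable_lookup_small;
 *       }
 *       return dpcls_subtable_lookup_generic;
 *   }
 *
 * Here dpcls_subtable_lookup_small is a made-up name for a specialized
 * variant; dpcls_create_subtable() would call such a helper instead of
 * assigning dpcls_subtable_lookup_generic unconditionally. */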
7796
7797 static inline struct dpcls_subtable *
7798 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7799 {
7800 struct dpcls_subtable *subtable;
7801
7802 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
7803 &cls->subtables_map) {
7804 if (netdev_flow_key_equal(&subtable->mask, mask)) {
7805 return subtable;
7806 }
7807 }
7808 return dpcls_create_subtable(cls, mask);
7809 }
7810
7811
7812 /* Periodically sort the dpcls subtable vectors according to hit counts. */
7813 static void
7814 dpcls_sort_subtable_vector(struct dpcls *cls)
7815 {
7816 struct pvector *pvec = &cls->subtables;
7817 struct dpcls_subtable *subtable;
7818
7819 PVECTOR_FOR_EACH (subtable, pvec) {
7820 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
7821 subtable->hit_cnt = 0;
7822 }
7823 pvector_publish(pvec);
7824 }
7825
7826 static inline void
7827 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
7828 struct polled_queue *poll_list, int poll_cnt)
7829 {
7830 struct dpcls *cls;
7831 uint64_t tot_idle = 0, tot_proc = 0;
7832 unsigned int pmd_load = 0;
7833
7834 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
7835 uint64_t curr_tsc;
7836 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
7837 if (pmd_alb->is_enabled && !pmd->isolated
7838 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
7839 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
7840 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
7841 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
7842 {
7843 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
7844 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
7845 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
7846 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
7847
7848 if (tot_proc) {
7849 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
7850 }
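/* Editor's note: e.g. with tot_proc = 95000 and tot_idle = 5000 cycles,
 * pmd_load = (95000 * 100) / 100000 = 95, which meets the
 * ALB_PMD_LOAD_THRESHOLD of 95 and counts this interval as overloaded. */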
7851
7852 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
7853 atomic_count_inc(&pmd->pmd_overloaded);
7854 } else {
7855 atomic_count_set(&pmd->pmd_overloaded, 0);
7856 }
7857 }
7858
7859 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
7860 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
7861 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
7862 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
7863
7864 /* Get the cycles that were used to process each queue and store them. */
7865 for (unsigned i = 0; i < poll_cnt; i++) {
7866 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
7867 RXQ_CYCLES_PROC_CURR);
7868 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
7869 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
7870 0);
7871 }
7872 curr_tsc = cycles_counter_update(&pmd->perf_stats);
7873 if (pmd->intrvl_tsc_prev) {
7874 /* There is a previous timestamp; store a new interval cycle count. */
7875 atomic_store_relaxed(&pmd->intrvl_cycles,
7876 curr_tsc - pmd->intrvl_tsc_prev);
7877 }
7878 pmd->intrvl_tsc_prev = curr_tsc;
7879 /* Start a new measuring interval. */
7880 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
7881 }
7882
7883 if (pmd->ctx.now > pmd->next_optimization) {
7884 /* Try to obtain the flow lock to block out revalidator threads.
7885 * If not possible, just try next time. */
7886 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
7887 /* Optimize each classifier. */
7888 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7889 dpcls_sort_subtable_vector(cls);
7890 }
7891 ovs_mutex_unlock(&pmd->flow_mutex);
7892 /* Start a new measuring interval. */
7893 pmd->next_optimization = pmd->ctx.now
7894 + DPCLS_OPTIMIZATION_INTERVAL;
7895 }
7896 }
7897 }
7898
7899 /* Insert 'rule' into 'cls'. */
7900 static void
7901 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
7902 const struct netdev_flow_key *mask)
7903 {
7904 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
7905
7906 /* Refer to subtable's mask, also for later removal. */
7907 rule->mask = &subtable->mask;
7908 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
7909 }
7910
7911 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
7912 static void
7913 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
7914 {
7915 struct dpcls_subtable *subtable;
7916
7917 ovs_assert(rule->mask);
7918
7919 /* Get subtable from reference in rule->mask. */
7920 INIT_CONTAINER(subtable, rule->mask, mask);
7921 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
7922 == 0) {
7923 /* Delete empty subtable. */
7924 dpcls_destroy_subtable(cls, subtable);
7925 pvector_publish(&cls->subtables);
7926 }
7927 }
7928
7929 /* Returns true if 'target' satisfies 'rule': for each 1-bit in the rule's
7930 * mask, the masked bits of 'target' equal the rule's (pre-masked) key. */
7931 static bool
7932 dpcls_rule_matches_key(const struct dpcls_rule *rule,
7933 const struct netdev_flow_key *target)
7934 {
7935 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
7936 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
7937 uint64_t value;
7938
7939 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
7940 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
7941 return false;
7942 }
7943 }
7944 return true;
7945 }
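/* Editor's illustration (hypothetical helper, not used anywhere): the same
 * masked comparison on plain uint64_t arrays, without the miniflow packing:
 *
 *   static bool
 *   masked_values_match(const uint64_t *key, const uint64_t *mask,
 *                       const uint64_t *target, size_t n)
 *   {
 *       for (size_t i = 0; i < n; i++) {
 *           if ((target[i] & mask[i]) != key[i]) {
 *               return false;
 *           }
 *       }
 *       return true;
 *   }
 *
 * dpcls_rule_matches_key() is this loop with the key and mask stored as
 * miniflows; the rule's key values are already masked, which is why the
 * masked target value can be compared against '*keyp' directly. */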
7946
7947 uint32_t
7948 dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
7949 uint32_t keys_map,
7950 const struct netdev_flow_key *keys[],
7951 struct dpcls_rule **rules)
7952 {
7953 int i;
7954 uint32_t found_map;
7955
7956 /* Compute hashes for the remaining keys. Each search-key is
7957 * masked with the subtable's mask to avoid hashing the wildcarded
7958 * bits. */
7959 uint32_t hashes[NETDEV_MAX_BURST];
7960 ULLONG_FOR_EACH_1 (i, keys_map) {
7961 hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
7962 &subtable->mask);
7963 }
7964
7965 /* Lookup. */
7966 const struct cmap_node *nodes[NETDEV_MAX_BURST];
7967 found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
7968
7969 /* Check results. When the i-th bit of found_map is set, it means
7970 * that a set of nodes with a matching hash value was found for the
7971 * i-th search-key. Due to possible hash collisions we need to check
7972 * which of the found rules, if any, really matches our masked
7973 * search-key. */
7974 ULLONG_FOR_EACH_1 (i, found_map) {
7975 struct dpcls_rule *rule;
7976
7977 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
7978 if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
7979 rules[i] = rule;
7980 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
7981 * within a one-second optimization interval. */
7982 subtable->hit_cnt++;
7983 goto next;
7984 }
7985 }
7986 /* None of the found rules was a match. Reset the i-th bit to
7987 * keep searching this key in the next subtable. */
7988 ULLONG_SET0(found_map, i); /* Did not match. */
7989 next:
7990 ; /* Keep Sparse happy. */
7991 }
7992
7993 return found_map;
7994 }
7995
7996 /* For each miniflow in 'keys', performs a classifier lookup, writing the
7997 * result into the corresponding slot in 'rules'. If a particular entry in
7998 * 'keys' is NULL, it is skipped.
7999 *
8000 * This function is optimized for use in the userspace datapath and therefore
8001 * does not implement a lot of features available in the standard
8002 * classifier_lookup() function. Specifically, it does not implement
8003 * priorities, instead returning any rule which matches the flow.
8004 *
8005 * Returns true if all miniflows found a corresponding rule. */
8006 static bool
8007 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8008 struct dpcls_rule **rules, const size_t cnt,
8009 int *num_lookups_p)
8010 {
8011 /* The received 'cnt' miniflows are the search keys that will be processed
8012 * to find a matching entry in the available subtables.
8013 * The 32-bit maps used below must hold at least NETDEV_MAX_BURST bits. */
8014 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8015 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8016
8017 struct dpcls_subtable *subtable;
8018
8019 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8020
8021 if (cnt != MAP_BITS) {
8022 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8023 }
8024 memset(rules, 0, cnt * sizeof *rules);
8025
8026 int lookups_match = 0, subtable_pos = 1;
8027 uint32_t found_map;
8028
8029 /* The datapath classifier (dpcls) is composed of subtables. Subtables
8030 * are dynamically created as needed when new rules are inserted. Each
8031 * subtable collects rules that match on a specific subset of packet
8032 * fields, as defined by the subtable's mask. Every search key is
8033 * processed against each subtable, but once a match is found for a
8034 * search key, the search for that key can stop because the rules are
8035 * non-overlapping. */
8036 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8037 /* Call the subtable specific lookup function. */
8038 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8039
8040 /* Count the number of subtables searched for this packet match. This
8041 * estimates the "spread" of subtables looked at per matched packet. */
8042 uint32_t pkts_matched = count_1bits(found_map);
8043 lookups_match += pkts_matched * subtable_pos;
8044
8045 /* Clear the keys that were found and return early if all packets matched. */
8046 keys_map &= ~found_map;
8047 if (!keys_map) {
8048 if (num_lookups_p) {
8049 *num_lookups_p = lookups_match;
8050 }
8051 return true;
8052 }
8053 subtable_pos++;
8054 }
8055
8056 if (num_lookups_p) {
8057 *num_lookups_p = lookups_match;
8058 }
8059 return false;
8060 }
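/* Editor's worked example (values are illustrative): for a batch of cnt = 5
 * keys, keys_map starts as 0x1f (five bits set). If the first subtable's
 * lookup_func returns found_map = 0x0a (keys 1 and 3 matched), then
 * keys_map &= ~found_map leaves 0x15, so only keys 0, 2 and 4 are searched
 * in the next subtable, and lookups_match grows by 2 * subtable_pos. */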