]> git.proxmox.com Git - mirror_ovs.git/blame - lib/dpif-netdev.c
ofproto-dpif-upcall: Echo HASH attribute back to datapath.
[mirror_ovs.git] / lib / dpif-netdev.c
CommitLineData
72865317 1/*
4ea96698 2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
72865317
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
db73f716 18#include "dpif-netdev.h"
f5ace7cd 19#include "dpif-netdev-private.h"
72865317 20
72865317
BP
21#include <ctype.h>
22#include <errno.h>
23#include <fcntl.h>
24#include <inttypes.h>
7f3adc00 25#include <net/if.h>
b2befd5b 26#include <sys/types.h>
7daedce4 27#include <netinet/in.h>
cdee00fd 28#include <stdint.h>
72865317
BP
29#include <stdlib.h>
30#include <string.h>
31#include <sys/ioctl.h>
7daedce4 32#include <sys/socket.h>
72865317 33#include <sys/stat.h>
72865317
BP
34#include <unistd.h>
35
9f861c91 36#include "bitmap.h"
59e6d833 37#include "cmap.h"
5cf3edb3 38#include "conntrack.h"
7daedce4 39#include "coverage.h"
4d4e68ed 40#include "ct-dpif.h"
72865317 41#include "csum.h"
e14deea0 42#include "dp-packet.h"
614c4892 43#include "dpif.h"
82a48ead 44#include "dpif-netdev-perf.h"
72865317 45#include "dpif-provider.h"
614c4892 46#include "dummy.h"
afae68b1 47#include "fat-rwlock.h"
72865317 48#include "flow.h"
762d146a 49#include "hmapx.h"
140dd699 50#include "id-pool.h"
4ea96698 51#include "ipf.h"
72865317 52#include "netdev.h"
b6cabb8f 53#include "netdev-offload.h"
79f36875 54#include "netdev-provider.h"
de281153 55#include "netdev-vport.h"
cdee00fd 56#include "netlink.h"
f094af7b 57#include "odp-execute.h"
72865317 58#include "odp-util.h"
25d436fb
BW
59#include "openvswitch/dynamic-string.h"
60#include "openvswitch/list.h"
61#include "openvswitch/match.h"
0d71302e 62#include "openvswitch/ofp-parse.h"
25d436fb 63#include "openvswitch/ofp-print.h"
64c96779 64#include "openvswitch/ofpbuf.h"
3eb67853 65#include "openvswitch/shash.h"
25d436fb 66#include "openvswitch/vlog.h"
5a034064 67#include "ovs-numa.h"
61e7deb1 68#include "ovs-rcu.h"
72865317 69#include "packets.h"
fd016ae3 70#include "openvswitch/poll-loop.h"
0de8783a 71#include "pvector.h"
26c6b6cd 72#include "random.h"
d33ed218 73#include "seq.h"
3eb67853 74#include "smap.h"
0cbfe35d 75#include "sset.h"
72865317 76#include "timeval.h"
53902038 77#include "tnl-neigh-cache.h"
7f9b8504 78#include "tnl-ports.h"
74cc3969 79#include "unixctl.h"
72865317 80#include "util.h"
241bad15 81#include "uuid.h"
7daedce4 82
d98e6007 83VLOG_DEFINE_THIS_MODULE(dpif_netdev);
72865317 84
5bf84282
NK
85/* Auto Load Balancing Defaults */
86#define ALB_ACCEPTABLE_IMPROVEMENT 25
87#define ALB_PMD_LOAD_THRESHOLD 95
88#define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
89#define MIN_TO_MSEC 60000
90
8bb113da 91#define FLOW_DUMP_MAX_BATCH 50
adcf00ba 92/* Use per thread recirc_depth to prevent recirculation loop. */
3f9d3836 93#define MAX_RECIRC_DEPTH 6
adcf00ba 94DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
e4cfed38 95
c71ea3c4
IM
96/* Use instant packet send by default. */
97#define DEFAULT_TX_FLUSH_INTERVAL 0
98
72865317 99/* Configuration parameters. */
72865317 100enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
4b27db64
JR
101enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
102enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
103enum { N_METER_LOCKS = 64 }; /* Maximum number of meters. */
72865317 104
8a4e3a85
BP
105/* Protects against changes to 'dp_netdevs'. */
106static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
107
108/* Contains all 'struct dp_netdev's. */
109static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
110 = SHASH_INITIALIZER(&dp_netdevs);
111
623540e4 112static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
6b31e073 113
5cf3edb3 114#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
4cddb1f0
DB
115 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
116 | CS_SRC_NAT | CS_DST_NAT)
5cf3edb3
DDP
117#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
118
2494ccd7 119static struct odp_support dp_netdev_support = {
f0fb825a 120 .max_vlan_headers = SIZE_MAX,
2494ccd7
JS
121 .max_mpls_depth = SIZE_MAX,
122 .recirc = true,
5cf3edb3
DDP
123 .ct_state = true,
124 .ct_zone = true,
125 .ct_mark = true,
126 .ct_label = true,
2575df07
JP
127 .ct_state_nat = true,
128 .ct_orig_tuple = true,
129 .ct_orig_tuple6 = true,
2494ccd7
JS
130};
131
60d8ccae
YW
132/* EMC cache and SMC cache compose the datapath flow cache (DFC)
133 *
134 * Exact match cache for frequently used flows
9bbf1c3d
DDP
135 *
136 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
137 * search its entries for a miniflow that matches exactly the miniflow of the
0de8783a 138 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
9bbf1c3d
DDP
139 *
140 * A cache entry holds a reference to its 'dp_netdev_flow'.
141 *
142 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
143 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
144 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
145 * value is the index of a cache entry where the miniflow could be.
146 *
147 *
60d8ccae
YW
148 * Signature match cache (SMC)
149 *
150 * This cache stores a 16-bit signature for each flow without storing keys, and
151 * stores the corresponding 16-bit flow_table index to the 'dp_netdev_flow'.
152 * Each flow thus occupies 32bit which is much more memory efficient than EMC.
153 * SMC uses a set-associative design that each bucket contains
154 * SMC_ENTRY_PER_BUCKET number of entries.
155 * Since 16-bit flow_table index is used, if there are more than 2^16
156 * dp_netdev_flow, SMC will miss them that cannot be indexed by a 16-bit value.
157 *
158 *
9bbf1c3d
DDP
159 * Thread-safety
160 * =============
161 *
162 * Each pmd_thread has its own private exact match cache.
163 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
164 */
165
fc82e877 166#define EM_FLOW_HASH_SHIFT 13
9bbf1c3d
DDP
167#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
168#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
169#define EM_FLOW_HASH_SEGS 2
170
60d8ccae
YW
171/* SMC uses a set-associative design. A bucket contains a set of entries that
172 * a flow item can occupy. For now, it uses one hash function rather than two
173 * as for the EMC design. */
174#define SMC_ENTRY_PER_BUCKET 4
175#define SMC_ENTRIES (1u << 20)
176#define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
177#define SMC_MASK (SMC_BUCKET_CNT - 1)
178
4c30b246
CL
179/* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
180#define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
181#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
182 DEFAULT_EM_FLOW_INSERT_INV_PROB)
183
9bbf1c3d 184struct emc_entry {
9bbf1c3d 185 struct dp_netdev_flow *flow;
0de8783a 186 struct netdev_flow_key key; /* key.hash used for emc hash value. */
9bbf1c3d
DDP
187};
188
189struct emc_cache {
190 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
67ad54cb 191 int sweep_idx; /* For emc_cache_slow_sweep(). */
9bbf1c3d
DDP
192};
193
60d8ccae
YW
194struct smc_bucket {
195 uint16_t sig[SMC_ENTRY_PER_BUCKET];
196 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
197};
198
199/* Signature match cache, differentiate from EMC cache */
200struct smc_cache {
201 struct smc_bucket buckets[SMC_BUCKET_CNT];
202};
203
204struct dfc_cache {
205 struct emc_cache emc_cache;
206 struct smc_cache smc_cache;
207};
208
9bbf1c3d
DDP
209/* Iterate in the exact match cache through every entry that might contain a
210 * miniflow with hash 'HASH'. */
211#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
212 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
213 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
214 i__ < EM_FLOW_HASH_SEGS; \
215 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
0de8783a
JR
216\f
217/* Simple non-wildcarding single-priority classifier. */
218
05f9e707
IM
219/* Time in microseconds between successive optimizations of the dpcls
220 * subtable vector */
221#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
3453b4d6 222
05f9e707
IM
223/* Time in microseconds of the interval in which rxq processing cycles used
224 * in rxq to pmd assignments is measured and stored. */
225#define PMD_RXQ_INTERVAL_LEN 10000000LL
4809891b 226
c59e759f
KT
227/* Number of intervals for which cycles are stored
228 * and used during rxq to pmd assignment. */
229#define PMD_RXQ_INTERVAL_MAX 6
230
0de8783a 231struct dpcls {
3453b4d6
JS
232 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
233 odp_port_t in_port;
0de8783a 234 struct cmap subtables_map;
da9cfca6 235 struct pvector subtables;
0de8783a 236};
9bbf1c3d 237
9b4f08cd
VDA
238/* Data structure to keep packet order till fastpath processing. */
239struct dp_packet_flow_map {
240 struct dp_packet *packet;
241 struct dp_netdev_flow *flow;
242 uint16_t tcp_flags;
243};
244
0de8783a
JR
245static void dpcls_init(struct dpcls *);
246static void dpcls_destroy(struct dpcls *);
3453b4d6 247static void dpcls_sort_subtable_vector(struct dpcls *);
0de8783a
JR
248static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
249 const struct netdev_flow_key *mask);
250static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
3453b4d6 251static bool dpcls_lookup(struct dpcls *cls,
60d8ccae 252 const struct netdev_flow_key *keys[],
3453b4d6
JS
253 struct dpcls_rule **rules, size_t cnt,
254 int *num_lookups_p);
92c7c870 255
4b27db64
JR
256/* Set of supported meter flags */
257#define DP_SUPPORTED_METER_FLAGS_MASK \
258 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
259
260/* Set of supported meter band types */
261#define DP_SUPPORTED_METER_BAND_TYPES \
262 ( 1 << OFPMBT13_DROP )
263
264struct dp_meter_band {
265 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
266 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
267 uint64_t packet_count;
268 uint64_t byte_count;
269};
270
271struct dp_meter {
272 uint16_t flags;
273 uint16_t n_bands;
274 uint32_t max_delta_t;
275 uint64_t used;
276 uint64_t packet_count;
277 uint64_t byte_count;
278 struct dp_meter_band bands[];
279};
280
5bf84282
NK
281struct pmd_auto_lb {
282 bool auto_lb_requested; /* Auto load balancing requested by user. */
283 bool is_enabled; /* Current status of Auto load balancing. */
284 uint64_t rebalance_intvl;
285 uint64_t rebalance_poll_timer;
286};
287
8a4e3a85
BP
288/* Datapath based on the network device interface from netdev.h.
289 *
290 *
291 * Thread-safety
292 * =============
293 *
294 * Some members, marked 'const', are immutable. Accessing other members
295 * requires synchronization, as noted in more detail below.
296 *
297 * Acquisition order is, from outermost to innermost:
298 *
299 * dp_netdev_mutex (global)
59e6d833 300 * port_mutex
d0cca6c3 301 * non_pmd_mutex
8a4e3a85 302 */
72865317 303struct dp_netdev {
8a4e3a85
BP
304 const struct dpif_class *const class;
305 const char *const name;
6b31e073 306 struct dpif *dpif;
6a8267c5
BP
307 struct ovs_refcount ref_cnt;
308 atomic_flag destroyed;
72865317 309
8a4e3a85
BP
310 /* Ports.
311 *
e9985d6a
DDP
312 * Any lookup into 'ports' or any access to the dp_netdev_ports found
313 * through 'ports' requires taking 'port_mutex'. */
59e6d833 314 struct ovs_mutex port_mutex;
e9985d6a 315 struct hmap ports;
d33ed218 316 struct seq *port_seq; /* Incremented whenever a port changes. */
6c3eee82 317
c71ea3c4
IM
318 /* The time that a packet can wait in output batch for sending. */
319 atomic_uint32_t tx_flush_interval;
320
4b27db64
JR
321 /* Meters. */
322 struct ovs_mutex meter_locks[N_METER_LOCKS];
323 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
4b27db64 324
65dcf3da
BB
325 /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
326 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
79f36875
JS
327 /* Enable collection of PMD performance metrics. */
328 atomic_bool pmd_perf_metrics;
60d8ccae
YW
329 /* Enable the SMC cache from ovsdb config */
330 atomic_bool smc_enable_db;
65dcf3da 331
6b31e073
RW
332 /* Protects access to ofproto-dpif-upcall interface during revalidator
333 * thread synchronization. */
334 struct fat_rwlock upcall_rwlock;
623540e4
EJ
335 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
336 void *upcall_aux;
6b31e073 337
e4e74c3a
AW
338 /* Callback function for notifying the purging of dp flows (during
339 * reseting pmd deletion). */
340 dp_purge_callback *dp_purge_cb;
341 void *dp_purge_aux;
342
65f13b50
AW
343 /* Stores all 'struct dp_netdev_pmd_thread's. */
344 struct cmap poll_threads;
140dd699
IM
345 /* id pool for per thread static_tx_qid. */
346 struct id_pool *tx_qid_pool;
347 struct ovs_mutex tx_qid_pool_mutex;
e77c97b9
KT
348 /* Use measured cycles for rxq to pmd assignment. */
349 bool pmd_rxq_assign_cyc;
65f13b50
AW
350
351 /* Protects the access of the 'struct dp_netdev_pmd_thread'
352 * instance for non-pmd thread. */
353 struct ovs_mutex non_pmd_mutex;
354
355 /* Each pmd thread will store its pointer to
356 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
357 ovsthread_key_t per_pmd_key;
f2eee189 358
a6a426d6
IM
359 struct seq *reconfigure_seq;
360 uint64_t last_reconfigure_seq;
361
a14b8947 362 /* Cpu mask for pin of pmd threads. */
f2eee189 363 char *pmd_cmask;
6e3c6fa4 364
a36de779 365 uint64_t last_tnl_conf_seq;
5cf3edb3 366
57593fd2 367 struct conntrack *conntrack;
5bf84282 368 struct pmd_auto_lb pmd_alb;
72865317
BP
369};
370
4b27db64
JR
371static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
372 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
373{
374 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
375}
376
377static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
378 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
379{
380 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
381}
382
383
8a4e3a85 384static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
e9985d6a
DDP
385 odp_port_t)
386 OVS_REQUIRES(dp->port_mutex);
ff073a71 387
c59e759f
KT
388enum rxq_cycles_counter_type {
389 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
390 processing packets during the current
391 interval. */
392 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
393 during rxq to pmd assignment. */
394 RXQ_N_CYCLES
395};
396
02bb2824
YL
397enum {
398 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
399 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
400 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
401};
402
403struct dp_flow_offload_item {
404 struct dp_netdev_pmd_thread *pmd;
405 struct dp_netdev_flow *flow;
406 int op;
407 struct match match;
408 struct nlattr *actions;
409 size_t actions_len;
410
411 struct ovs_list node;
412};
413
414struct dp_flow_offload {
415 struct ovs_mutex mutex;
416 struct ovs_list list;
417 pthread_cond_t cond;
418};
419
420static struct dp_flow_offload dp_flow_offload = {
421 .mutex = OVS_MUTEX_INITIALIZER,
422 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
423};
424
425static struct ovsthread_once offload_thread_once
426 = OVSTHREAD_ONCE_INITIALIZER;
427
05f9e707 428#define XPS_TIMEOUT 500000LL /* In microseconds. */
324c8374 429
3eb67853
IM
430/* Contained by struct dp_netdev_port's 'rxqs' member. */
431struct dp_netdev_rxq {
947dc567
DDP
432 struct dp_netdev_port *port;
433 struct netdev_rxq *rx;
434 unsigned core_id; /* Core to which this queue should be
435 pinned. OVS_CORE_UNSPEC if the
436 queue doesn't need to be pinned to a
437 particular core. */
ee42dd70 438 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
47a45d86 439 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
79f36875 440 bool is_vhost; /* Is rxq of a vhost port. */
c59e759f
KT
441
442 /* Counters of cycles spent successfully polling and processing pkts. */
443 atomic_ullong cycles[RXQ_N_CYCLES];
444 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
445 sum them to yield the cycles used for an rxq. */
446 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
3eb67853
IM
447};
448
72865317
BP
449/* A port in a netdev-based datapath. */
450struct dp_netdev_port {
35303d71 451 odp_port_t port_no;
ca62bb16
BB
452 bool dynamic_txqs; /* If true XPS will be used. */
453 bool need_reconfigure; /* True if we should reconfigure netdev. */
72865317 454 struct netdev *netdev;
e9985d6a 455 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
4b609110 456 struct netdev_saved_flags *sf;
3eb67853 457 struct dp_netdev_rxq *rxqs;
85a4f238 458 unsigned n_rxq; /* Number of elements in 'rxqs' */
47a45d86 459 unsigned *txq_used; /* Number of threads that use each tx queue. */
324c8374 460 struct ovs_mutex txq_used_mutex;
2fbadeb6 461 bool emc_enabled; /* If true EMC will be used. */
0cbfe35d 462 char *type; /* Port type as requested by user. */
3eb67853 463 char *rxq_affinity_list; /* Requested affinity of rx queues. */
72865317
BP
464};
465
1c1e46ed
AW
466/* Contained by struct dp_netdev_flow's 'stats' member. */
467struct dp_netdev_flow_stats {
eb94da30
DDP
468 atomic_llong used; /* Last used time, in monotonic msecs. */
469 atomic_ullong packet_count; /* Number of packets matched. */
470 atomic_ullong byte_count; /* Number of bytes matched. */
471 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
1c1e46ed
AW
472};
473
474/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
8a4e3a85
BP
475 *
476 *
477 * Thread-safety
478 * =============
479 *
480 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
1c1e46ed 481 * its pmd thread's classifier. The text below calls this classifier 'cls'.
8a4e3a85
BP
482 *
483 * Motivation
484 * ----------
485 *
486 * The thread safety rules described here for "struct dp_netdev_flow" are
487 * motivated by two goals:
488 *
489 * - Prevent threads that read members of "struct dp_netdev_flow" from
490 * reading bad data due to changes by some thread concurrently modifying
491 * those members.
492 *
493 * - Prevent two threads making changes to members of a given "struct
494 * dp_netdev_flow" from interfering with each other.
495 *
496 *
497 * Rules
498 * -----
499 *
ed79f89a
DDP
500 * A flow 'flow' may be accessed without a risk of being freed during an RCU
501 * grace period. Code that needs to hold onto a flow for a while
502 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
8a4e3a85
BP
503 *
504 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
ed79f89a
DDP
505 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
506 * from modification.
8a4e3a85
BP
507 *
508 * Some members, marked 'const', are immutable. Accessing other members
509 * requires synchronization, as noted in more detail below.
510 */
72865317 511struct dp_netdev_flow {
11e5cf1f 512 const struct flow flow; /* Unmasked flow that created this entry. */
8a4e3a85 513 /* Hash table index by unmasked flow. */
1c1e46ed
AW
514 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
515 /* 'flow_table'. */
241bad15 516 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
70e5ed6f 517 const ovs_u128 ufid; /* Unique flow identifier. */
241bad15 518 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
bd5131ba 519 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
1c1e46ed 520 /* flow. */
72865317 521
ed79f89a
DDP
522 /* Number of references.
523 * The classifier owns one reference.
524 * Any thread trying to keep a rule from being freed should hold its own
525 * reference. */
526 struct ovs_refcount ref_cnt;
527
11e5cf1f 528 bool dead;
241bad15 529 uint32_t mark; /* Unique flow mark assigned to a flow */
11e5cf1f 530
1c1e46ed
AW
531 /* Statistics. */
532 struct dp_netdev_flow_stats stats;
8a4e3a85 533
45c626a3 534 /* Actions. */
61e7deb1 535 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
0de8783a 536
11e5cf1f
DDP
537 /* While processing a group of input packets, the datapath uses the next
538 * member to store a pointer to the output batch for the flow. It is
539 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
f7ce4811
PS
540 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
541 struct packet_batch_per_flow *batch;
11e5cf1f 542
0de8783a
JR
543 /* Packet classification. */
544 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
545 /* 'cr' must be the last member. */
72865317
BP
546};
547
ed79f89a 548static void dp_netdev_flow_unref(struct dp_netdev_flow *);
9bbf1c3d 549static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
70e5ed6f 550static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
f0fb825a 551 struct flow *, bool);
8a4e3a85 552
a84cb64a
BP
553/* A set of datapath actions within a "struct dp_netdev_flow".
554 *
555 *
556 * Thread-safety
557 * =============
558 *
45c626a3 559 * A struct dp_netdev_actions 'actions' is protected with RCU. */
a84cb64a 560struct dp_netdev_actions {
a84cb64a
BP
561 /* These members are immutable: they do not change during the struct's
562 * lifetime. */
a84cb64a 563 unsigned int size; /* Size of 'actions', in bytes. */
9ff55ae2 564 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
a84cb64a
BP
565};
566
567struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
568 size_t);
61e7deb1
BP
569struct dp_netdev_actions *dp_netdev_flow_get_actions(
570 const struct dp_netdev_flow *);
571static void dp_netdev_actions_free(struct dp_netdev_actions *);
a84cb64a 572
947dc567 573struct polled_queue {
922b28d4 574 struct dp_netdev_rxq *rxq;
947dc567 575 odp_port_t port_no;
2fbadeb6 576 bool emc_enabled;
35c91567
DM
577 bool rxq_enabled;
578 uint64_t change_seq;
947dc567
DDP
579};
580
ae7ad0a1
IM
581/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
582struct rxq_poll {
947dc567
DDP
583 struct dp_netdev_rxq *rxq;
584 struct hmap_node node;
ae7ad0a1
IM
585};
586
57eebbb4
DDP
587/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
588 * 'tnl_port_cache' or 'tx_ports'. */
d0cca6c3 589struct tx_port {
324c8374
IM
590 struct dp_netdev_port *port;
591 int qid;
592 long long last_used;
d0cca6c3 593 struct hmap_node node;
c71ea3c4 594 long long flush_time;
009e0033 595 struct dp_packet_batch output_pkts;
58ed6df0 596 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
d0cca6c3
DDP
597};
598
b010be17
IM
599/* A set of properties for the current processing loop that is not directly
600 * associated with the pmd thread itself, but with the packets being
601 * processed or the short-term system configuration (for example, time).
602 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
603struct dp_netdev_pmd_thread_ctx {
604 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
605 long long now;
58ed6df0
IM
606 /* RX queue from which last packet was received. */
607 struct dp_netdev_rxq *last_rxq;
2fbadeb6
IM
608 /* EMC insertion probability context for the current processing cycle. */
609 uint32_t emc_insert_min;
d0cca6c3
DDP
610};
611
e4cfed38
PS
612/* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
613 * the performance overhead of interrupt processing. Therefore netdev can
614 * not implement rx-wait for these devices. dpif-netdev needs to poll
615 * these device to check for recv buffer. pmd-thread does polling for
1c1e46ed 616 * devices assigned to itself.
e4cfed38
PS
617 *
618 * DPDK used PMD for accessing NIC.
619 *
65f13b50
AW
620 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
621 * I/O of all non-pmd threads. There will be no actual thread created
622 * for the instance.
1c1e46ed 623 *
1859876c
BB
624 * Each struct has its own flow cache and classifier per managed ingress port.
625 * For packets received on ingress port, a look up is done on corresponding PMD
626 * thread's flow cache and in case of a miss, lookup is performed in the
627 * corresponding classifier of port. Packets are executed with the found
628 * actions in either case.
1c1e46ed 629 * */
65f13b50 630struct dp_netdev_pmd_thread {
d9d73f84
IM
631 struct dp_netdev *dp;
632 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
633 struct cmap_node node; /* In 'dp->poll_threads'. */
634
65f13b50
AW
635 /* Per thread exact-match cache. Note, the instance for cpu core
636 * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
d0cca6c3
DDP
637 * need to be protected by 'non_pmd_mutex'. Every other instance
638 * will only be accessed by its own pmd thread. */
60d8ccae 639 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
1c1e46ed 640
3453b4d6 641 /* Flow-Table and classifiers
1c1e46ed
AW
642 *
643 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
3453b4d6
JS
644 * changes to 'classifiers' must be made while still holding the
645 * 'flow_mutex'.
1c1e46ed
AW
646 */
647 struct ovs_mutex flow_mutex;
d9d73f84
IM
648 struct cmap flow_table OVS_GUARDED; /* Flow table. */
649
650 /* One classifier per in_port polled by the pmd */
651 struct cmap classifiers;
652 /* Periodically sort subtable vectors according to hit frequencies */
653 long long int next_optimization;
654 /* End of the next time interval for which processing cycles
655 are stored for each polled rxq. */
656 long long int rxq_next_cycle_store;
657
2a2c67b4
KT
658 /* Last interval timestamp. */
659 uint64_t intrvl_tsc_prev;
660 /* Last interval cycles. */
661 atomic_ullong intrvl_cycles;
662
b010be17
IM
663 /* Current context of the PMD thread. */
664 struct dp_netdev_pmd_thread_ctx ctx;
d9d73f84 665
d9d73f84
IM
666 struct seq *reload_seq;
667 uint64_t last_reload_seq;
ec61d470
IM
668
669 /* These are atomic variables used as a synchronization and configuration
670 * points for thread reload/exit.
671 *
672 * 'reload' atomic is the main one and it's used as a memory
673 * synchronization point for all other knobs and data.
674 *
675 * For a thread that requests PMD reload:
676 *
677 * * All changes that should be visible to the PMD thread must be made
678 * before setting the 'reload'. These changes could use any memory
679 * ordering model including 'relaxed'.
680 * * Setting the 'reload' atomic should occur in the same thread where
681 * all other PMD configuration options updated.
682 * * Setting the 'reload' atomic should be done with 'release' memory
683 * ordering model or stricter. This will guarantee that all previous
684 * changes (including non-atomic and 'relaxed') will be visible to
685 * the PMD thread.
686 * * To check that reload is done, thread should poll the 'reload' atomic
687 * to become 'false'. Polling should be done with 'acquire' memory
688 * ordering model or stricter. This ensures that PMD thread completed
689 * the reload process.
690 *
691 * For the PMD thread:
692 *
693 * * PMD thread should read 'reload' atomic with 'acquire' memory
694 * ordering model or stricter. This will guarantee that all changes
695 * made before setting the 'reload' in the requesting thread will be
696 * visible to the PMD thread.
697 * * All other configuration data could be read with any memory
698 * ordering model (including non-atomic and 'relaxed') but *only after*
699 * reading the 'reload' atomic set to 'true'.
700 * * When the PMD reload done, PMD should (optionally) set all the below
701 * knobs except the 'reload' to their default ('false') values and
702 * (mandatory), as the last step, set the 'reload' to 'false' using
703 * 'release' memory ordering model or stricter. This will inform the
704 * requesting thread that PMD has completed a reload cycle.
705 */
d9d73f84 706 atomic_bool reload; /* Do we need to reload ports? */
6d9fead1 707 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
e2cafa86 708 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
299c8d61 709 atomic_bool exit; /* For terminating the pmd thread. */
ec61d470 710
d9d73f84
IM
711 pthread_t thread;
712 unsigned core_id; /* CPU core id of this pmd thread. */
713 int numa_id; /* numa node id of this pmd thread. */
714 bool isolated;
715
716 /* Queue id used by this pmd thread to send packets on all netdevs if
717 * XPS disabled for this netdev. All static_tx_qid's are unique and less
718 * than 'cmap_count(dp->poll_threads)'. */
719 uint32_t static_tx_qid;
720
c71ea3c4
IM
721 /* Number of filled output batches. */
722 int n_output_batches;
723
d9d73f84
IM
724 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
725 /* List of rx queues to poll. */
726 struct hmap poll_list OVS_GUARDED;
727 /* Map of 'tx_port's used for transmission. Written by the main thread,
728 * read by the pmd thread. */
729 struct hmap tx_ports OVS_GUARDED;
730
731 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
732 * ports (that support push_tunnel/pop_tunnel), the other contains ports
733 * with at least one txq (that support send). A port can be in both.
734 *
735 * There are two separate maps to make sure that we don't try to execute
736 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
737 *
738 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
739 * threads, and thusly need to be protected by 'non_pmd_mutex'. Every
740 * other instance will only be accessed by its own pmd thread. */
741 struct hmap tnl_port_cache;
742 struct hmap send_port_cache;
743
82a48ead
JS
744 /* Keep track of detailed PMD performance statistics. */
745 struct pmd_perf_stats perf_stats;
d9d73f84 746
5bf84282
NK
747 /* Stats from previous iteration used by automatic pmd
748 * load balance logic. */
749 uint64_t prev_stats[PMD_N_STATS];
750 atomic_count pmd_overloaded;
751
d9d73f84
IM
752 /* Set to true if the pmd thread needs to be reloaded. */
753 bool need_reload;
6c3eee82
BP
754};
755
72865317
BP
756/* Interface to netdev-based datapath. */
757struct dpif_netdev {
758 struct dpif dpif;
759 struct dp_netdev *dp;
d33ed218 760 uint64_t last_port_seq;
72865317
BP
761};
762
8a4e3a85 763static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
e9985d6a
DDP
764 struct dp_netdev_port **portp)
765 OVS_REQUIRES(dp->port_mutex);
8a4e3a85 766static int get_port_by_name(struct dp_netdev *dp, const char *devname,
e9985d6a
DDP
767 struct dp_netdev_port **portp)
768 OVS_REQUIRES(dp->port_mutex);
8a4e3a85
BP
769static void dp_netdev_free(struct dp_netdev *)
770 OVS_REQUIRES(dp_netdev_mutex);
8a4e3a85
BP
771static int do_add_port(struct dp_netdev *dp, const char *devname,
772 const char *type, odp_port_t port_no)
59e6d833 773 OVS_REQUIRES(dp->port_mutex);
c40b890f 774static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
59e6d833 775 OVS_REQUIRES(dp->port_mutex);
614c4892
BP
776static int dpif_netdev_open(const struct dpif_class *, const char *name,
777 bool create, struct dpif **);
65f13b50 778static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 779 struct dp_packet_batch *,
7d7ded7a
DB
780 bool should_steal,
781 const struct flow *flow,
4edb9ae9 782 const struct nlattr *actions,
b010be17 783 size_t actions_len);
65f13b50 784static void dp_netdev_input(struct dp_netdev_pmd_thread *,
1895cc8d 785 struct dp_packet_batch *, odp_port_t port_no);
a90ed026 786static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
1895cc8d 787 struct dp_packet_batch *);
41ccaa24 788
6b31e073 789static void dp_netdev_disable_upcall(struct dp_netdev *);
ae7ad0a1 790static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
65f13b50 791static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
00873463
DDP
792 struct dp_netdev *dp, unsigned core_id,
793 int numa_id);
1c1e46ed 794static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
e9985d6a
DDP
795static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
796 OVS_REQUIRES(dp->port_mutex);
797
e32971b8 798static void *pmd_thread_main(void *);
b19befae 799static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
bd5131ba 800 unsigned core_id);
1c1e46ed
AW
801static struct dp_netdev_pmd_thread *
802dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
140dd699
IM
803static void dp_netdev_del_pmd(struct dp_netdev *dp,
804 struct dp_netdev_pmd_thread *pmd);
e32971b8 805static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
d0cca6c3 806static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
d0cca6c3 807static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
e32971b8
DDP
808 struct dp_netdev_port *port)
809 OVS_REQUIRES(pmd->port_mutex);
810static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
811 struct tx_port *tx)
812 OVS_REQUIRES(pmd->port_mutex);
d0cca6c3 813static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
947dc567
DDP
814 struct dp_netdev_rxq *rxq)
815 OVS_REQUIRES(pmd->port_mutex);
e32971b8
DDP
816static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
817 struct rxq_poll *poll)
818 OVS_REQUIRES(pmd->port_mutex);
c71ea3c4
IM
819static int
820dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
821 bool force);
009e0033 822
e32971b8 823static void reconfigure_datapath(struct dp_netdev *dp)
3eb67853 824 OVS_REQUIRES(dp->port_mutex);
1c1e46ed
AW
825static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
826static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
827static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
d0cca6c3
DDP
828static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
829 OVS_REQUIRES(pmd->port_mutex);
3453b4d6 830static inline void
4809891b
KT
831dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
832 struct polled_queue *poll_list, int poll_cnt);
833static void
834dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
835 enum rxq_cycles_counter_type type,
836 unsigned long long cycles);
837static uint64_t
838dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
839 enum rxq_cycles_counter_type type);
840static void
841dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
842 unsigned long long cycles);
655856ef
KT
843static uint64_t
844dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
324c8374
IM
845static void
846dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
b010be17 847 bool purge);
324c8374 848static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
b010be17 849 struct tx_port *tx);
324c8374 850
67ad54cb 851static inline bool emc_entry_alive(struct emc_entry *ce);
9bbf1c3d 852static void emc_clear_entry(struct emc_entry *ce);
60d8ccae 853static void smc_clear_entry(struct smc_bucket *b, int idx);
9bbf1c3d 854
cd995c73 855static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
79f36875
JS
856static inline bool
857pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
02bb2824
YL
858static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
859 struct dp_netdev_flow *flow);
cd995c73 860
9bbf1c3d
DDP
861static void
862emc_cache_init(struct emc_cache *flow_cache)
863{
864 int i;
865
67ad54cb 866 flow_cache->sweep_idx = 0;
9bbf1c3d
DDP
867 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
868 flow_cache->entries[i].flow = NULL;
0de8783a 869 flow_cache->entries[i].key.hash = 0;
09b0fa9c 870 flow_cache->entries[i].key.len = sizeof(struct miniflow);
5fcff47b 871 flowmap_init(&flow_cache->entries[i].key.mf.map);
9bbf1c3d
DDP
872 }
873}
874
60d8ccae
YW
875static void
876smc_cache_init(struct smc_cache *smc_cache)
877{
878 int i, j;
879 for (i = 0; i < SMC_BUCKET_CNT; i++) {
880 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
881 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
882 }
883 }
884}
885
886static void
887dfc_cache_init(struct dfc_cache *flow_cache)
888{
889 emc_cache_init(&flow_cache->emc_cache);
890 smc_cache_init(&flow_cache->smc_cache);
891}
892
9bbf1c3d
DDP
893static void
894emc_cache_uninit(struct emc_cache *flow_cache)
895{
896 int i;
897
898 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
899 emc_clear_entry(&flow_cache->entries[i]);
900 }
901}
902
60d8ccae
YW
903static void
904smc_cache_uninit(struct smc_cache *smc)
905{
906 int i, j;
907
908 for (i = 0; i < SMC_BUCKET_CNT; i++) {
909 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
910 smc_clear_entry(&(smc->buckets[i]), j);
911 }
912 }
913}
914
915static void
916dfc_cache_uninit(struct dfc_cache *flow_cache)
917{
918 smc_cache_uninit(&flow_cache->smc_cache);
919 emc_cache_uninit(&flow_cache->emc_cache);
920}
921
67ad54cb
AW
922/* Check and clear dead flow references slowly (one entry at each
923 * invocation). */
924static void
925emc_cache_slow_sweep(struct emc_cache *flow_cache)
926{
927 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
928
929 if (!emc_entry_alive(entry)) {
930 emc_clear_entry(entry);
931 }
932 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
933}
934
b010be17
IM
935/* Updates the time in PMD threads context and should be called in three cases:
936 *
937 * 1. PMD structure initialization:
938 * - dp_netdev_configure_pmd()
939 *
940 * 2. Before processing of the new packet batch:
941 * - dpif_netdev_execute()
009e0033 942 * - dp_netdev_process_rxq_port()
b010be17
IM
943 *
944 * 3. At least once per polling iteration in main polling threads if no
945 * packets received on current iteration:
946 * - dpif_netdev_run()
947 * - pmd_thread_main()
948 *
949 * 'pmd->ctx.now' should be used without update in all other cases if possible.
950 */
951static inline void
952pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
953{
05f9e707 954 pmd->ctx.now = time_usec();
b010be17
IM
955}
956
c4ea7529
BP
957/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
958bool
959dpif_is_netdev(const struct dpif *dpif)
960{
961 return dpif->dpif_class->open == dpif_netdev_open;
962}
963
72865317
BP
964static struct dpif_netdev *
965dpif_netdev_cast(const struct dpif *dpif)
966{
c4ea7529 967 ovs_assert(dpif_is_netdev(dpif));
72865317
BP
968 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
969}
970
971static struct dp_netdev *
972get_dp_netdev(const struct dpif *dpif)
973{
974 return dpif_netdev_cast(dpif)->dp;
975}
6553d06b
DDP
976\f
977enum pmd_info_type {
ce179f11
IM
978 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
979 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
79f36875
JS
980 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
981 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
6553d06b
DDP
982};
983
984static void
82a48ead 985format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
6553d06b 986{
6553d06b
DDP
987 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
988 ? "main thread" : "pmd thread");
6553d06b
DDP
989 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
990 ds_put_format(reply, " numa_id %d", pmd->numa_id);
991 }
d5c199ea 992 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
bd5131ba 993 ds_put_format(reply, " core_id %u", pmd->core_id);
6553d06b
DDP
994 }
995 ds_put_cstr(reply, ":\n");
82a48ead
JS
996}
997
998static void
999pmd_info_show_stats(struct ds *reply,
1000 struct dp_netdev_pmd_thread *pmd)
1001{
1002 uint64_t stats[PMD_N_STATS];
1003 uint64_t total_cycles, total_packets;
1004 double passes_per_pkt = 0;
1005 double lookups_per_hit = 0;
1006 double packets_per_batch = 0;
1007
1008 pmd_perf_read_counters(&pmd->perf_stats, stats);
1009 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1010 + stats[PMD_CYCLES_ITER_BUSY];
1011 total_packets = stats[PMD_STAT_RECV];
1012
1013 format_pmd_thread(reply, pmd);
6553d06b 1014
82a48ead
JS
1015 if (total_packets > 0) {
1016 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1017 / (double) total_packets;
cc4891f3 1018 }
82a48ead
JS
1019 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1020 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1021 / (double) stats[PMD_STAT_MASKED_HIT];
1022 }
1023 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1024 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1025 / (double) stats[PMD_STAT_SENT_BATCHES];
cc4891f3
IM
1026 }
1027
6553d06b 1028 ds_put_format(reply,
5a0e4aec
BP
1029 " packets received: %"PRIu64"\n"
1030 " packet recirculations: %"PRIu64"\n"
1031 " avg. datapath passes per packet: %.02f\n"
1032 " emc hits: %"PRIu64"\n"
60d8ccae 1033 " smc hits: %"PRIu64"\n"
5a0e4aec
BP
1034 " megaflow hits: %"PRIu64"\n"
1035 " avg. subtable lookups per megaflow hit: %.02f\n"
1036 " miss with success upcall: %"PRIu64"\n"
1037 " miss with failed upcall: %"PRIu64"\n"
1038 " avg. packets per output batch: %.02f\n",
82a48ead
JS
1039 total_packets, stats[PMD_STAT_RECIRC],
1040 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
60d8ccae 1041 stats[PMD_STAT_SMC_HIT],
82a48ead
JS
1042 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1043 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
cc4891f3 1044 packets_per_batch);
6553d06b
DDP
1045
1046 if (total_cycles == 0) {
1047 return;
1048 }
1049
1050 ds_put_format(reply,
5a0e4aec
BP
1051 " idle cycles: %"PRIu64" (%.02f%%)\n"
1052 " processing cycles: %"PRIu64" (%.02f%%)\n",
82a48ead
JS
1053 stats[PMD_CYCLES_ITER_IDLE],
1054 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1055 stats[PMD_CYCLES_ITER_BUSY],
1056 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
6553d06b
DDP
1057
1058 if (total_packets == 0) {
1059 return;
1060 }
1061
1062 ds_put_format(reply,
5a0e4aec 1063 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
82a48ead 1064 total_cycles / (double) total_packets,
6553d06b
DDP
1065 total_cycles, total_packets);
1066
1067 ds_put_format(reply,
5a0e4aec 1068 " avg processing cycles per packet: "
82a48ead
JS
1069 "%.02f (%"PRIu64"/%"PRIu64")\n",
1070 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1071 stats[PMD_CYCLES_ITER_BUSY], total_packets);
6553d06b
DDP
1072}
1073
79f36875
JS
1074static void
1075pmd_info_show_perf(struct ds *reply,
1076 struct dp_netdev_pmd_thread *pmd,
1077 struct pmd_perf_params *par)
1078{
1079 if (pmd->core_id != NON_PMD_CORE_ID) {
1080 char *time_str =
1081 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1082 long long now = time_msec();
1083 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1084
1085 ds_put_cstr(reply, "\n");
1086 ds_put_format(reply, "Time: %s\n", time_str);
1087 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1088 ds_put_cstr(reply, "\n");
1089 format_pmd_thread(reply, pmd);
1090 ds_put_cstr(reply, "\n");
1091 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1092 if (pmd_perf_metrics_enabled(pmd)) {
1093 /* Prevent parallel clearing of perf metrics. */
1094 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1095 if (par->histograms) {
1096 ds_put_cstr(reply, "\n");
1097 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1098 }
1099 if (par->iter_hist_len > 0) {
1100 ds_put_cstr(reply, "\n");
1101 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1102 par->iter_hist_len);
1103 }
1104 if (par->ms_hist_len > 0) {
1105 ds_put_cstr(reply, "\n");
1106 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1107 par->ms_hist_len);
1108 }
1109 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1110 }
1111 free(time_str);
1112 }
1113}
1114
947dc567
DDP
1115static int
1116compare_poll_list(const void *a_, const void *b_)
1117{
1118 const struct rxq_poll *a = a_;
1119 const struct rxq_poll *b = b_;
1120
1121 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1122 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1123
1124 int cmp = strcmp(namea, nameb);
1125 if (!cmp) {
1126 return netdev_rxq_get_queue_id(a->rxq->rx)
1127 - netdev_rxq_get_queue_id(b->rxq->rx);
1128 } else {
1129 return cmp;
1130 }
1131}
1132
1133static void
1134sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1135 size_t *n)
216abd28 1136 OVS_REQUIRES(pmd->port_mutex)
947dc567
DDP
1137{
1138 struct rxq_poll *ret, *poll;
1139 size_t i;
1140
1141 *n = hmap_count(&pmd->poll_list);
1142 if (!*n) {
1143 ret = NULL;
1144 } else {
1145 ret = xcalloc(*n, sizeof *ret);
1146 i = 0;
1147 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1148 ret[i] = *poll;
1149 i++;
1150 }
1151 ovs_assert(i == *n);
1cc1b5f6 1152 qsort(ret, *n, sizeof *ret, compare_poll_list);
947dc567
DDP
1153 }
1154
947dc567
DDP
1155 *list = ret;
1156}
1157
ce179f11
IM
1158static void
1159pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1160{
1161 if (pmd->core_id != NON_PMD_CORE_ID) {
947dc567 1162 struct rxq_poll *list;
2a2c67b4
KT
1163 size_t n_rxq;
1164 uint64_t total_cycles = 0;
ce179f11 1165
3eb67853 1166 ds_put_format(reply,
5a0e4aec 1167 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
3eb67853
IM
1168 pmd->numa_id, pmd->core_id, (pmd->isolated)
1169 ? "true" : "false");
ce179f11 1170
d0cca6c3 1171 ovs_mutex_lock(&pmd->port_mutex);
2a2c67b4 1172 sorted_poll_list(pmd, &list, &n_rxq);
ce179f11 1173
2a2c67b4
KT
1174 /* Get the total pmd cycles for an interval. */
1175 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1176 /* Estimate the cycles to cover all intervals. */
1177 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1178
1179 for (int i = 0; i < n_rxq; i++) {
1180 struct dp_netdev_rxq *rxq = list[i].rxq;
1181 const char *name = netdev_rxq_get_name(rxq->rx);
1182 uint64_t proc_cycles = 0;
1183
1184 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1185 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
ce179f11 1186 }
5a0e4aec 1187 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
947dc567 1188 netdev_rxq_get_queue_id(list[i].rxq->rx));
35c91567
DM
1189 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1190 ? "(enabled) " : "(disabled)");
5a0e4aec 1191 ds_put_format(reply, " pmd usage: ");
2a2c67b4
KT
1192 if (total_cycles) {
1193 ds_put_format(reply, "%2"PRIu64"",
1194 proc_cycles * 100 / total_cycles);
1195 ds_put_cstr(reply, " %");
1196 } else {
1197 ds_put_format(reply, "%s", "NOT AVAIL");
1198 }
1199 ds_put_cstr(reply, "\n");
ce179f11 1200 }
d0cca6c3 1201 ovs_mutex_unlock(&pmd->port_mutex);
947dc567 1202 free(list);
ce179f11
IM
1203 }
1204}
1205
34d8e04b
EC
1206static int
1207compare_poll_thread_list(const void *a_, const void *b_)
1208{
1209 const struct dp_netdev_pmd_thread *a, *b;
1210
1211 a = *(struct dp_netdev_pmd_thread **)a_;
1212 b = *(struct dp_netdev_pmd_thread **)b_;
1213
1214 if (a->core_id < b->core_id) {
1215 return -1;
1216 }
1217 if (a->core_id > b->core_id) {
1218 return 1;
1219 }
1220 return 0;
1221}
1222
1223/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1224 * this list, as long as we do not go to quiescent state. */
1225static void
1226sorted_poll_thread_list(struct dp_netdev *dp,
1227 struct dp_netdev_pmd_thread ***list,
1228 size_t *n)
1229{
1230 struct dp_netdev_pmd_thread *pmd;
1231 struct dp_netdev_pmd_thread **pmd_list;
1232 size_t k = 0, n_pmds;
1233
1234 n_pmds = cmap_count(&dp->poll_threads);
1235 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1236
1237 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1238 if (k >= n_pmds) {
1239 break;
1240 }
1241 pmd_list[k++] = pmd;
1242 }
1243
1244 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1245
1246 *list = pmd_list;
1247 *n = k;
1248}
1249
cd995c73
KT
1250static void
1251dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1252 const char *argv[], void *aux OVS_UNUSED)
1253{
1254 struct ds reply = DS_EMPTY_INITIALIZER;
1255 struct dp_netdev *dp = NULL;
1256
1257 ovs_mutex_lock(&dp_netdev_mutex);
1258
1259 if (argc == 2) {
1260 dp = shash_find_data(&dp_netdevs, argv[1]);
1261 } else if (shash_count(&dp_netdevs) == 1) {
1262 /* There's only one datapath */
1263 dp = shash_first(&dp_netdevs)->data;
1264 }
1265
1266 if (!dp) {
1267 ovs_mutex_unlock(&dp_netdev_mutex);
1268 unixctl_command_reply_error(conn,
1269 "please specify an existing datapath");
1270 return;
1271 }
1272
1273 dp_netdev_request_reconfigure(dp);
1274 ovs_mutex_unlock(&dp_netdev_mutex);
1275 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1276 unixctl_command_reply(conn, ds_cstr(&reply));
1277 ds_destroy(&reply);
1278}
1279
6553d06b
DDP
1280static void
1281dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1282 void *aux)
1283{
1284 struct ds reply = DS_EMPTY_INITIALIZER;
34d8e04b 1285 struct dp_netdev_pmd_thread **pmd_list;
6553d06b
DDP
1286 struct dp_netdev *dp = NULL;
1287 enum pmd_info_type type = *(enum pmd_info_type *) aux;
82a48ead
JS
1288 unsigned int core_id;
1289 bool filter_on_pmd = false;
1290 size_t n;
6553d06b
DDP
1291
1292 ovs_mutex_lock(&dp_netdev_mutex);
1293
82a48ead 1294 while (argc > 1) {
79f36875 1295 if (!strcmp(argv[1], "-pmd") && argc > 2) {
82a48ead
JS
1296 if (str_to_uint(argv[2], 10, &core_id)) {
1297 filter_on_pmd = true;
1298 }
1299 argc -= 2;
1300 argv += 2;
1301 } else {
1302 dp = shash_find_data(&dp_netdevs, argv[1]);
1303 argc -= 1;
1304 argv += 1;
1305 }
6553d06b
DDP
1306 }
1307
1308 if (!dp) {
82a48ead
JS
1309 if (shash_count(&dp_netdevs) == 1) {
1310 /* There's only one datapath */
1311 dp = shash_first(&dp_netdevs)->data;
1312 } else {
1313 ovs_mutex_unlock(&dp_netdev_mutex);
1314 unixctl_command_reply_error(conn,
1315 "please specify an existing datapath");
1316 return;
1317 }
6553d06b
DDP
1318 }
1319
34d8e04b
EC
1320 sorted_poll_thread_list(dp, &pmd_list, &n);
1321 for (size_t i = 0; i < n; i++) {
1322 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1323 if (!pmd) {
1324 break;
1325 }
82a48ead
JS
1326 if (filter_on_pmd && pmd->core_id != core_id) {
1327 continue;
1328 }
ce179f11
IM
1329 if (type == PMD_INFO_SHOW_RXQ) {
1330 pmd_info_show_rxq(&reply, pmd);
82a48ead
JS
1331 } else if (type == PMD_INFO_CLEAR_STATS) {
1332 pmd_perf_stats_clear(&pmd->perf_stats);
1333 } else if (type == PMD_INFO_SHOW_STATS) {
1334 pmd_info_show_stats(&reply, pmd);
79f36875
JS
1335 } else if (type == PMD_INFO_PERF_SHOW) {
1336 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
6553d06b
DDP
1337 }
1338 }
34d8e04b 1339 free(pmd_list);
6553d06b
DDP
1340
1341 ovs_mutex_unlock(&dp_netdev_mutex);
1342
1343 unixctl_command_reply(conn, ds_cstr(&reply));
1344 ds_destroy(&reply);
1345}
79f36875
JS
1346
1347static void
1348pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1349 const char *argv[],
1350 void *aux OVS_UNUSED)
1351{
1352 struct pmd_perf_params par;
1353 long int it_hist = 0, ms_hist = 0;
1354 par.histograms = true;
1355
1356 while (argc > 1) {
1357 if (!strcmp(argv[1], "-nh")) {
1358 par.histograms = false;
1359 argc -= 1;
1360 argv += 1;
1361 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1362 it_hist = strtol(argv[2], NULL, 10);
1363 if (it_hist < 0) {
1364 it_hist = 0;
1365 } else if (it_hist > HISTORY_LEN) {
1366 it_hist = HISTORY_LEN;
1367 }
1368 argc -= 2;
1369 argv += 2;
1370 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1371 ms_hist = strtol(argv[2], NULL, 10);
1372 if (ms_hist < 0) {
1373 ms_hist = 0;
1374 } else if (ms_hist > HISTORY_LEN) {
1375 ms_hist = HISTORY_LEN;
1376 }
1377 argc -= 2;
1378 argv += 2;
1379 } else {
1380 break;
1381 }
1382 }
1383 par.iter_hist_len = it_hist;
1384 par.ms_hist_len = ms_hist;
1385 par.command_type = PMD_INFO_PERF_SHOW;
1386 dpif_netdev_pmd_info(conn, argc, argv, &par);
1387}
6553d06b
DDP
1388\f
1389static int
1390dpif_netdev_init(void)
1391{
1392 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
ce179f11
IM
1393 clear_aux = PMD_INFO_CLEAR_STATS,
1394 poll_aux = PMD_INFO_SHOW_RXQ;
6553d06b 1395
82a48ead
JS
1396 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1397 0, 3, dpif_netdev_pmd_info,
6553d06b 1398 (void *)&show_aux);
82a48ead
JS
1399 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1400 0, 3, dpif_netdev_pmd_info,
6553d06b 1401 (void *)&clear_aux);
82a48ead
JS
1402 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1403 0, 3, dpif_netdev_pmd_info,
ce179f11 1404 (void *)&poll_aux);
79f36875
JS
1405 unixctl_command_register("dpif-netdev/pmd-perf-show",
1406 "[-nh] [-it iter-history-len]"
1407 " [-ms ms-history-len]"
1408 " [-pmd core] [dp]",
1409 0, 8, pmd_perf_show_cmd,
1410 NULL);
cd995c73
KT
1411 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1412 0, 1, dpif_netdev_pmd_rebalance,
1413 NULL);
7178fefb
JS
1414 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1415 "on|off [-b before] [-a after] [-e|-ne] "
1416 "[-us usec] [-q qlen]",
1417 0, 10, pmd_perf_log_set_cmd,
1418 NULL);
6553d06b
DDP
1419 return 0;
1420}
72865317 1421
2197d7ab 1422static int
2240af25
DDP
1423dpif_netdev_enumerate(struct sset *all_dps,
1424 const struct dpif_class *dpif_class)
2197d7ab
GL
1425{
1426 struct shash_node *node;
1427
97be1538 1428 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 1429 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
1430 struct dp_netdev *dp = node->data;
1431 if (dpif_class != dp->class) {
1432 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1433 * If the class doesn't match, skip this dpif. */
1434 continue;
1435 }
2197d7ab
GL
1436 sset_add(all_dps, node->name);
1437 }
97be1538 1438 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 1439
2197d7ab
GL
1440 return 0;
1441}
1442
add90f6f
EJ
1443static bool
1444dpif_netdev_class_is_dummy(const struct dpif_class *class)
1445{
1446 return class != &dpif_netdev_class;
1447}
1448
0aeaabc8
JP
1449static const char *
1450dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1451{
1452 return strcmp(type, "internal") ? type
e98d0cb3 1453 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
0aeaabc8
JP
1454 : "tap";
1455}
1456
72865317
BP
1457static struct dpif *
1458create_dpif_netdev(struct dp_netdev *dp)
1459{
462278db 1460 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 1461 struct dpif_netdev *dpif;
72865317 1462
6a8267c5 1463 ovs_refcount_ref(&dp->ref_cnt);
72865317 1464
72865317 1465 dpif = xmalloc(sizeof *dpif);
614c4892 1466 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 1467 dpif->dp = dp;
d33ed218 1468 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
1469
1470 return &dpif->dpif;
1471}
1472
4e022ec0
AW
1473/* Choose an unused, non-zero port number and return it on success.
1474 * Return ODPP_NONE on failure. */
1475static odp_port_t
e44768b7 1476choose_port(struct dp_netdev *dp, const char *name)
59e6d833 1477 OVS_REQUIRES(dp->port_mutex)
e44768b7 1478{
4e022ec0 1479 uint32_t port_no;
e44768b7
JP
1480
1481 if (dp->class != &dpif_netdev_class) {
1482 const char *p;
1483 int start_no = 0;
1484
1485 /* If the port name begins with "br", start the number search at
1486 * 100 to make writing tests easier. */
1487 if (!strncmp(name, "br", 2)) {
1488 start_no = 100;
1489 }
1490
1491 /* If the port name contains a number, try to assign that port number.
1492 * This can make writing unit tests easier because port numbers are
1493 * predictable. */
1494 for (p = name; *p != '\0'; p++) {
1495 if (isdigit((unsigned char) *p)) {
1496 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
1497 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1498 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 1499 return u32_to_odp(port_no);
e44768b7
JP
1500 }
1501 break;
1502 }
1503 }
1504 }
1505
ff073a71
BP
1506 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1507 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 1508 return u32_to_odp(port_no);
e44768b7
JP
1509 }
1510 }
1511
4e022ec0 1512 return ODPP_NONE;
e44768b7
JP
1513}
1514
72865317 1515static int
614c4892
BP
1516create_dp_netdev(const char *name, const struct dpif_class *class,
1517 struct dp_netdev **dpp)
8a4e3a85 1518 OVS_REQUIRES(dp_netdev_mutex)
72865317 1519{
1276e3db 1520 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
72865317
BP
1521 struct dp_netdev *dp;
1522 int error;
72865317 1523
1276e3db
IM
1524 /* Avoid estimating TSC frequency for dummy datapath to not slow down
1525 * unit tests. */
1526 if (!dpif_netdev_class_is_dummy(class)
1527 && ovsthread_once_start(&tsc_freq_check)) {
1528 pmd_perf_estimate_tsc_frequency();
1529 ovsthread_once_done(&tsc_freq_check);
1530 }
1531
462278db 1532 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
1533 shash_add(&dp_netdevs, name, dp);
1534
1535 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1536 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 1537 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 1538 atomic_flag_clear(&dp->destroyed);
8a4e3a85 1539
59e6d833 1540 ovs_mutex_init(&dp->port_mutex);
e9985d6a 1541 hmap_init(&dp->ports);
d33ed218 1542 dp->port_seq = seq_create();
6b31e073
RW
1543 fat_rwlock_init(&dp->upcall_rwlock);
1544
a6a426d6
IM
1545 dp->reconfigure_seq = seq_create();
1546 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1547
4b27db64
JR
1548 for (int i = 0; i < N_METER_LOCKS; ++i) {
1549 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1550 }
1551
6b31e073
RW
1552 /* Disable upcalls by default. */
1553 dp_netdev_disable_upcall(dp);
623540e4 1554 dp->upcall_aux = NULL;
6b31e073 1555 dp->upcall_cb = NULL;
e44768b7 1556
57593fd2 1557 dp->conntrack = conntrack_init();
5cf3edb3 1558
4c30b246 1559 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
c71ea3c4 1560 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
4c30b246 1561
65f13b50 1562 cmap_init(&dp->poll_threads);
e77c97b9 1563 dp->pmd_rxq_assign_cyc = true;
140dd699
IM
1564
1565 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1566 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1567 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1568
65f13b50
AW
1569 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1570 ovsthread_key_create(&dp->per_pmd_key, NULL);
1571
e9985d6a 1572 ovs_mutex_lock(&dp->port_mutex);
140dd699
IM
1573 /* non-PMD will be created before all other threads and will
1574 * allocate static_tx_qid = 0. */
f2eee189 1575 dp_netdev_set_nonpmd(dp);
65f13b50 1576
a3e8437a
TLSC
1577 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1578 "internal"),
1579 ODPP_LOCAL);
59e6d833 1580 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
1581 if (error) {
1582 dp_netdev_free(dp);
462278db 1583 return error;
72865317
BP
1584 }
1585
a36de779 1586 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 1587 *dpp = dp;
72865317
BP
1588 return 0;
1589}
1590
a6a426d6
IM
1591static void
1592dp_netdev_request_reconfigure(struct dp_netdev *dp)
1593{
1594 seq_change(dp->reconfigure_seq);
1595}
1596
1597static bool
1598dp_netdev_is_reconf_required(struct dp_netdev *dp)
1599{
1600 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1601}
1602
72865317 1603static int
614c4892 1604dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 1605 bool create, struct dpif **dpifp)
72865317 1606{
462278db 1607 struct dp_netdev *dp;
5279f8fd 1608 int error;
462278db 1609
97be1538 1610 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
1611 dp = shash_find_data(&dp_netdevs, name);
1612 if (!dp) {
5279f8fd 1613 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 1614 } else {
5279f8fd
BP
1615 error = (dp->class != class ? EINVAL
1616 : create ? EEXIST
1617 : 0);
1618 }
1619 if (!error) {
1620 *dpifp = create_dpif_netdev(dp);
6b31e073 1621 dp->dpif = *dpifp;
72865317 1622 }
97be1538 1623 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 1624
5279f8fd 1625 return error;
72865317
BP
1626}
1627
88ace79b
DDP
1628static void
1629dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1630 OVS_NO_THREAD_SAFETY_ANALYSIS
1631{
1632 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1633 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1634
1635 /* Before freeing a lock we should release it */
1636 fat_rwlock_unlock(&dp->upcall_rwlock);
1637 fat_rwlock_destroy(&dp->upcall_rwlock);
1638}
1639
4b27db64
JR
1640static void
1641dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1642 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1643{
1644 if (dp->meters[meter_id]) {
1645 free(dp->meters[meter_id]);
1646 dp->meters[meter_id] = NULL;
1647 }
1648}
1649
8a4e3a85
BP
1650/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1651 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
1652static void
1653dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 1654 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 1655{
e9985d6a 1656 struct dp_netdev_port *port, *next;
4ad28026 1657
8a4e3a85
BP
1658 shash_find_and_delete(&dp_netdevs, dp->name);
1659
59e6d833 1660 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 1661 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
c40b890f 1662 do_del_port(dp, port);
1ba530f4 1663 }
59e6d833 1664 ovs_mutex_unlock(&dp->port_mutex);
4b27db64 1665
e32971b8 1666 dp_netdev_destroy_all_pmds(dp, true);
d916785c 1667 cmap_destroy(&dp->poll_threads);
51852a57 1668
140dd699
IM
1669 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1670 id_pool_destroy(dp->tx_qid_pool);
1671
b9584f21
DDP
1672 ovs_mutex_destroy(&dp->non_pmd_mutex);
1673 ovsthread_key_delete(dp->per_pmd_key);
1674
57593fd2 1675 conntrack_destroy(dp->conntrack);
b9584f21
DDP
1676
1677
a6a426d6
IM
1678 seq_destroy(dp->reconfigure_seq);
1679
d33ed218 1680 seq_destroy(dp->port_seq);
e9985d6a 1681 hmap_destroy(&dp->ports);
3186ea46 1682 ovs_mutex_destroy(&dp->port_mutex);
88ace79b
DDP
1683
1684 /* Upcalls must be disabled at this point */
1685 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 1686
4b27db64
JR
1687 int i;
1688
1689 for (i = 0; i < MAX_METERS; ++i) {
1690 meter_lock(dp, i);
1691 dp_delete_meter(dp, i);
1692 meter_unlock(dp, i);
1693 }
1694 for (i = 0; i < N_METER_LOCKS; ++i) {
1695 ovs_mutex_destroy(&dp->meter_locks[i]);
1696 }
1697
f2eee189 1698 free(dp->pmd_cmask);
8a4e3a85 1699 free(CONST_CAST(char *, dp->name));
72865317
BP
1700 free(dp);
1701}
1702
8a4e3a85
BP
1703static void
1704dp_netdev_unref(struct dp_netdev *dp)
1705{
1706 if (dp) {
1707 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1708 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1709 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 1710 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
1711 dp_netdev_free(dp);
1712 }
1713 ovs_mutex_unlock(&dp_netdev_mutex);
1714 }
1715}
1716
72865317
BP
1717static void
1718dpif_netdev_close(struct dpif *dpif)
1719{
1720 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1721
8a4e3a85 1722 dp_netdev_unref(dp);
72865317
BP
1723 free(dpif);
1724}
1725
1726static int
7dab847a 1727dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
1728{
1729 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1730
6a8267c5 1731 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 1732 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
1733 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1734 OVS_NOT_REACHED();
1735 }
1736 }
5279f8fd 1737
72865317
BP
1738 return 0;
1739}
1740
eb94da30
DDP
1741/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1742 * load/store semantics. While the increment is not atomic, the load and
1743 * store operations are, making it impossible to read inconsistent values.
1744 *
1745 * This is used to update thread local stats counters. */
1746static void
1747non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1748{
1749 unsigned long long tmp;
1750
1751 atomic_read_relaxed(var, &tmp);
1752 tmp += n;
1753 atomic_store_relaxed(var, tmp);
1754}
1755
72865317 1756static int
a8d9304d 1757dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
1758{
1759 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 1760 struct dp_netdev_pmd_thread *pmd;
82a48ead 1761 uint64_t pmd_stats[PMD_N_STATS];
8a4e3a85 1762
1c1e46ed
AW
1763 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1764 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1765 stats->n_flows += cmap_count(&pmd->flow_table);
82a48ead
JS
1766 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1767 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
60d8ccae 1768 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
82a48ead
JS
1769 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1770 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1771 stats->n_lost += pmd_stats[PMD_STAT_LOST];
51852a57 1772 }
1ce3fa06 1773 stats->n_masks = UINT32_MAX;
847108dc 1774 stats->n_mask_hit = UINT64_MAX;
5279f8fd 1775
72865317
BP
1776 return 0;
1777}
1778
e4cfed38 1779static void
65f13b50 1780dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1781{
accf8626 1782 if (pmd->core_id == NON_PMD_CORE_ID) {
d0cca6c3
DDP
1783 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1784 ovs_mutex_lock(&pmd->port_mutex);
1785 pmd_load_cached_ports(pmd);
1786 ovs_mutex_unlock(&pmd->port_mutex);
1787 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
accf8626
AW
1788 return;
1789 }
1790
2788a1b1 1791 seq_change(pmd->reload_seq);
299c8d61 1792 atomic_store_explicit(&pmd->reload, true, memory_order_release);
65f13b50 1793}
e4cfed38 1794
59e6d833
BP
1795static uint32_t
1796hash_port_no(odp_port_t port_no)
1797{
1798 return hash_int(odp_to_u32(port_no), 0);
1799}
1800
72865317 1801static int
a3e8437a 1802port_create(const char *devname, const char *type,
b8d29252 1803 odp_port_t port_no, struct dp_netdev_port **portp)
72865317 1804{
4b609110 1805 struct netdev_saved_flags *sf;
72865317 1806 struct dp_netdev_port *port;
2499a8ce 1807 enum netdev_flags flags;
b8d29252 1808 struct netdev *netdev;
e32971b8 1809 int error;
72865317 1810
b8d29252 1811 *portp = NULL;
72865317
BP
1812
1813 /* Open and validate network device. */
a3e8437a 1814 error = netdev_open(devname, type, &netdev);
72865317 1815 if (error) {
b8d29252 1816 return error;
72865317 1817 }
72865317
BP
1818 /* XXX reject non-Ethernet devices */
1819
2499a8ce
AC
1820 netdev_get_flags(netdev, &flags);
1821 if (flags & NETDEV_LOOPBACK) {
1822 VLOG_ERR("%s: cannot add a loopback device", devname);
d17f4f08 1823 error = EINVAL;
b8d29252 1824 goto out;
2499a8ce
AC
1825 }
1826
e32971b8
DDP
1827 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1828 if (error) {
1829 VLOG_ERR("%s: cannot set promisc flag", devname);
1830 goto out;
324c8374
IM
1831 }
1832
e4cfed38 1833 port = xzalloc(sizeof *port);
35303d71 1834 port->port_no = port_no;
e4cfed38
PS
1835 port->netdev = netdev;
1836 port->type = xstrdup(type);
4b609110 1837 port->sf = sf;
2fbadeb6 1838 port->emc_enabled = true;
e32971b8
DDP
1839 port->need_reconfigure = true;
1840 ovs_mutex_init(&port->txq_used_mutex);
e4cfed38 1841
b8d29252 1842 *portp = port;
72865317
BP
1843
1844 return 0;
d17f4f08 1845
d17f4f08 1846out:
b8d29252 1847 netdev_close(netdev);
d17f4f08 1848 return error;
72865317
BP
1849}
1850
b8d29252
DDP
1851static int
1852do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1853 odp_port_t port_no)
1854 OVS_REQUIRES(dp->port_mutex)
1855{
1856 struct dp_netdev_port *port;
1857 int error;
1858
1859 /* Reject devices already in 'dp'. */
1860 if (!get_port_by_name(dp, devname, &port)) {
1861 return EEXIST;
1862 }
1863
a3e8437a 1864 error = port_create(devname, type, port_no, &port);
b8d29252
DDP
1865 if (error) {
1866 return error;
1867 }
1868
e9985d6a 1869 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
b8d29252
DDP
1870 seq_change(dp->port_seq);
1871
e32971b8
DDP
1872 reconfigure_datapath(dp);
1873
3f51ea18
IM
1874 /* Check that port was successfully configured. */
1875 return dp_netdev_lookup_port(dp, port_no) ? 0 : EINVAL;
b8d29252
DDP
1876}
1877
247527db
BP
1878static int
1879dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1880 odp_port_t *port_nop)
247527db
BP
1881{
1882 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
1883 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1884 const char *dpif_port;
4e022ec0 1885 odp_port_t port_no;
5279f8fd 1886 int error;
247527db 1887
59e6d833 1888 ovs_mutex_lock(&dp->port_mutex);
3aa30359 1889 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 1890 if (*port_nop != ODPP_NONE) {
ff073a71
BP
1891 port_no = *port_nop;
1892 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 1893 } else {
3aa30359 1894 port_no = choose_port(dp, dpif_port);
5279f8fd 1895 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 1896 }
5279f8fd 1897 if (!error) {
247527db 1898 *port_nop = port_no;
5279f8fd 1899 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 1900 }
59e6d833 1901 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1902
1903 return error;
72865317
BP
1904}
1905
1906static int
4e022ec0 1907dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
1908{
1909 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
1910 int error;
1911
59e6d833 1912 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
1913 if (port_no == ODPP_LOCAL) {
1914 error = EINVAL;
1915 } else {
1916 struct dp_netdev_port *port;
1917
1918 error = get_port_by_number(dp, port_no, &port);
1919 if (!error) {
1920 do_del_port(dp, port);
1921 }
1922 }
59e6d833 1923 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1924
1925 return error;
72865317
BP
1926}
1927
1928static bool
4e022ec0 1929is_valid_port_number(odp_port_t port_no)
72865317 1930{
ff073a71
BP
1931 return port_no != ODPP_NONE;
1932}
1933
1934static struct dp_netdev_port *
1935dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
e9985d6a 1936 OVS_REQUIRES(dp->port_mutex)
ff073a71
BP
1937{
1938 struct dp_netdev_port *port;
1939
e9985d6a 1940 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
35303d71 1941 if (port->port_no == port_no) {
ff073a71
BP
1942 return port;
1943 }
1944 }
1945 return NULL;
72865317
BP
1946}
1947
1948static int
1949get_port_by_number(struct dp_netdev *dp,
4e022ec0 1950 odp_port_t port_no, struct dp_netdev_port **portp)
e9985d6a 1951 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1952{
1953 if (!is_valid_port_number(port_no)) {
1954 *portp = NULL;
1955 return EINVAL;
1956 } else {
ff073a71 1957 *portp = dp_netdev_lookup_port(dp, port_no);
0f6a066f 1958 return *portp ? 0 : ENODEV;
72865317
BP
1959 }
1960}
1961
b284085e 1962static void
62453dad 1963port_destroy(struct dp_netdev_port *port)
b284085e 1964{
62453dad
DDP
1965 if (!port) {
1966 return;
b284085e 1967 }
b284085e 1968
62453dad
DDP
1969 netdev_close(port->netdev);
1970 netdev_restore_flags(port->sf);
accf8626 1971
62453dad 1972 for (unsigned i = 0; i < port->n_rxq; i++) {
947dc567 1973 netdev_rxq_close(port->rxqs[i].rx);
b284085e 1974 }
324c8374 1975 ovs_mutex_destroy(&port->txq_used_mutex);
3eb67853 1976 free(port->rxq_affinity_list);
324c8374 1977 free(port->txq_used);
3eb67853 1978 free(port->rxqs);
62453dad
DDP
1979 free(port->type);
1980 free(port);
b284085e
PS
1981}
1982
72865317
BP
1983static int
1984get_port_by_name(struct dp_netdev *dp,
1985 const char *devname, struct dp_netdev_port **portp)
59e6d833 1986 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1987{
1988 struct dp_netdev_port *port;
1989
e9985d6a 1990 HMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 1991 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
1992 *portp = port;
1993 return 0;
1994 }
1995 }
0f6a066f
DDP
1996
1997 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
1998 * existing port. */
1999 return ENODEV;
72865317
BP
2000}
2001
b9584f21 2002/* Returns 'true' if there is a port with pmd netdev. */
65f13b50 2003static bool
b9584f21 2004has_pmd_port(struct dp_netdev *dp)
e9985d6a 2005 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
2006{
2007 struct dp_netdev_port *port;
2008
e9985d6a 2009 HMAP_FOR_EACH (port, node, &dp->ports) {
5dd57e80 2010 if (netdev_is_pmd(port->netdev)) {
b9584f21 2011 return true;
65f13b50
AW
2012 }
2013 }
2014
2015 return false;
2016}
2017
c40b890f
BP
2018static void
2019do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 2020 OVS_REQUIRES(dp->port_mutex)
72865317 2021{
e9985d6a 2022 hmap_remove(&dp->ports, &port->node);
d33ed218 2023 seq_change(dp->port_seq);
d0cca6c3 2024
e32971b8 2025 reconfigure_datapath(dp);
72865317 2026
62453dad 2027 port_destroy(port);
72865317
BP
2028}
2029
2030static void
4c738a8d
BP
2031answer_port_query(const struct dp_netdev_port *port,
2032 struct dpif_port *dpif_port)
72865317 2033{
3efb6063 2034 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 2035 dpif_port->type = xstrdup(port->type);
35303d71 2036 dpif_port->port_no = port->port_no;
72865317
BP
2037}
2038
2039static int
4e022ec0 2040dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 2041 struct dpif_port *dpif_port)
72865317
BP
2042{
2043 struct dp_netdev *dp = get_dp_netdev(dpif);
2044 struct dp_netdev_port *port;
2045 int error;
2046
e9985d6a 2047 ovs_mutex_lock(&dp->port_mutex);
72865317 2048 error = get_port_by_number(dp, port_no, &port);
4afba28d 2049 if (!error && dpif_port) {
4c738a8d 2050 answer_port_query(port, dpif_port);
72865317 2051 }
e9985d6a 2052 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2053
72865317
BP
2054 return error;
2055}
2056
2057static int
2058dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 2059 struct dpif_port *dpif_port)
72865317
BP
2060{
2061 struct dp_netdev *dp = get_dp_netdev(dpif);
2062 struct dp_netdev_port *port;
2063 int error;
2064
59e6d833 2065 ovs_mutex_lock(&dp->port_mutex);
72865317 2066 error = get_port_by_name(dp, devname, &port);
4afba28d 2067 if (!error && dpif_port) {
4c738a8d 2068 answer_port_query(port, dpif_port);
72865317 2069 }
59e6d833 2070 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2071
72865317
BP
2072 return error;
2073}
2074
61e7deb1
BP
2075static void
2076dp_netdev_flow_free(struct dp_netdev_flow *flow)
2077{
61e7deb1 2078 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
2079 free(flow);
2080}
2081
ed79f89a
DDP
2082static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2083{
2084 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2085 ovsrcu_postpone(dp_netdev_flow_free, flow);
2086 }
2087}
2088
70e5ed6f
JS
2089static uint32_t
2090dp_netdev_flow_hash(const ovs_u128 *ufid)
2091{
2092 return ufid->u32[0];
2093}
2094
3453b4d6
JS
2095static inline struct dpcls *
2096dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2097 odp_port_t in_port)
2098{
2099 struct dpcls *cls;
2100 uint32_t hash = hash_port_no(in_port);
2101 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2102 if (cls->in_port == in_port) {
2103 /* Port classifier exists already */
2104 return cls;
2105 }
2106 }
2107 return NULL;
2108}
2109
2110static inline struct dpcls *
2111dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2112 odp_port_t in_port)
2113 OVS_REQUIRES(pmd->flow_mutex)
2114{
2115 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2116 uint32_t hash = hash_port_no(in_port);
2117
2118 if (!cls) {
2119 /* Create new classifier for in_port */
2120 cls = xmalloc(sizeof(*cls));
2121 dpcls_init(cls);
2122 cls->in_port = in_port;
2123 cmap_insert(&pmd->classifiers, &cls->node, hash);
2124 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2125 }
2126 return cls;
2127}
2128
241bad15
YL
2129#define MAX_FLOW_MARK (UINT32_MAX - 1)
2130#define INVALID_FLOW_MARK (UINT32_MAX)
2131
2132struct megaflow_to_mark_data {
2133 const struct cmap_node node;
2134 ovs_u128 mega_ufid;
2135 uint32_t mark;
2136};
2137
2138struct flow_mark {
2139 struct cmap megaflow_to_mark;
2140 struct cmap mark_to_flow;
2141 struct id_pool *pool;
241bad15
YL
2142};
2143
2144static struct flow_mark flow_mark = {
2145 .megaflow_to_mark = CMAP_INITIALIZER,
2146 .mark_to_flow = CMAP_INITIALIZER,
241bad15
YL
2147};
2148
2149static uint32_t
2150flow_mark_alloc(void)
2151{
2152 uint32_t mark;
2153
2154 if (!flow_mark.pool) {
2155 /* Haven't initiated yet, do it here */
2156 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2157 }
2158
2159 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2160 return mark;
2161 }
2162
2163 return INVALID_FLOW_MARK;
2164}
2165
2166static void
2167flow_mark_free(uint32_t mark)
2168{
2169 id_pool_free_id(flow_mark.pool, mark);
2170}
2171
2172/* associate megaflow with a mark, which is a 1:1 mapping */
2173static void
2174megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2175{
2176 size_t hash = dp_netdev_flow_hash(mega_ufid);
2177 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2178
2179 data->mega_ufid = *mega_ufid;
2180 data->mark = mark;
2181
2182 cmap_insert(&flow_mark.megaflow_to_mark,
2183 CONST_CAST(struct cmap_node *, &data->node), hash);
2184}
2185
2186/* disassociate meagaflow with a mark */
2187static void
2188megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2189{
2190 size_t hash = dp_netdev_flow_hash(mega_ufid);
2191 struct megaflow_to_mark_data *data;
2192
2193 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2194 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2195 cmap_remove(&flow_mark.megaflow_to_mark,
2196 CONST_CAST(struct cmap_node *, &data->node), hash);
5752eae4 2197 ovsrcu_postpone(free, data);
241bad15
YL
2198 return;
2199 }
2200 }
2201
2202 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2203 UUID_ARGS((struct uuid *)mega_ufid));
2204}
2205
2206static inline uint32_t
2207megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2208{
2209 size_t hash = dp_netdev_flow_hash(mega_ufid);
2210 struct megaflow_to_mark_data *data;
2211
2212 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2213 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2214 return data->mark;
2215 }
2216 }
2217
5d1765d3
IM
2218 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2219 UUID_ARGS((struct uuid *)mega_ufid));
241bad15
YL
2220 return INVALID_FLOW_MARK;
2221}
2222
2223/* associate mark with a flow, which is 1:N mapping */
2224static void
2225mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2226{
2227 dp_netdev_flow_ref(flow);
2228
2229 cmap_insert(&flow_mark.mark_to_flow,
2230 CONST_CAST(struct cmap_node *, &flow->mark_node),
2231 hash_int(mark, 0));
2232 flow->mark = mark;
2233
2234 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2235}
2236
2237static bool
2238flow_mark_has_no_ref(uint32_t mark)
2239{
2240 struct dp_netdev_flow *flow;
2241
2242 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2243 &flow_mark.mark_to_flow) {
2244 if (flow->mark == mark) {
2245 return false;
2246 }
2247 }
2248
2249 return true;
2250}
2251
2252static int
2253mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2254 struct dp_netdev_flow *flow)
2255{
2256 int ret = 0;
2257 uint32_t mark = flow->mark;
2258 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2259 &flow->mark_node);
2260
2261 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2262 flow->mark = INVALID_FLOW_MARK;
2263
2264 /*
2265 * no flow is referencing the mark any more? If so, let's
2266 * remove the flow from hardware and free the mark.
2267 */
2268 if (flow_mark_has_no_ref(mark)) {
2269 struct dp_netdev_port *port;
2270 odp_port_t in_port = flow->flow.in_port.odp_port;
2271
2272 ovs_mutex_lock(&pmd->dp->port_mutex);
2273 port = dp_netdev_lookup_port(pmd->dp, in_port);
2274 if (port) {
2275 ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2276 }
2277 ovs_mutex_unlock(&pmd->dp->port_mutex);
2278
2279 flow_mark_free(mark);
2280 VLOG_DBG("Freed flow mark %u\n", mark);
2281
2282 megaflow_to_mark_disassociate(&flow->mega_ufid);
2283 }
2284 dp_netdev_flow_unref(flow);
2285
2286 return ret;
2287}
2288
2289static void
2290flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2291{
2292 struct dp_netdev_flow *flow;
2293
2294 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2295 if (flow->pmd_id == pmd->core_id) {
02bb2824 2296 queue_netdev_flow_del(pmd, flow);
241bad15
YL
2297 }
2298 }
2299}
2300
aab96ec4
YL
2301static struct dp_netdev_flow *
2302mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2303 const uint32_t mark)
2304{
2305 struct dp_netdev_flow *flow;
2306
2307 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2308 &flow_mark.mark_to_flow) {
2309 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2310 flow->dead == false) {
2311 return flow;
2312 }
2313 }
2314
2315 return NULL;
2316}
2317
02bb2824
YL
2318static struct dp_flow_offload_item *
2319dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2320 struct dp_netdev_flow *flow,
2321 int op)
2322{
2323 struct dp_flow_offload_item *offload;
2324
2325 offload = xzalloc(sizeof(*offload));
2326 offload->pmd = pmd;
2327 offload->flow = flow;
2328 offload->op = op;
2329
2330 dp_netdev_flow_ref(flow);
2331 dp_netdev_pmd_try_ref(pmd);
2332
2333 return offload;
2334}
2335
2336static void
2337dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2338{
2339 dp_netdev_pmd_unref(offload->pmd);
2340 dp_netdev_flow_unref(offload->flow);
2341
2342 free(offload->actions);
2343 free(offload);
2344}
2345
2346static void
2347dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2348{
2349 ovs_mutex_lock(&dp_flow_offload.mutex);
2350 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2351 xpthread_cond_signal(&dp_flow_offload.cond);
2352 ovs_mutex_unlock(&dp_flow_offload.mutex);
2353}
2354
2355static int
2356dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2357{
2358 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2359}
2360
2361/*
2362 * There are two flow offload operations here: addition and modification.
2363 *
2364 * For flow addition, this function does:
2365 * - allocate a new flow mark id
2366 * - perform hardware flow offload
2367 * - associate the flow mark with flow and mega flow
2368 *
2369 * For flow modification, both flow mark and the associations are still
2370 * valid, thus only item 2 needed.
2371 */
2372static int
2373dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2374{
2375 struct dp_netdev_port *port;
2376 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2377 struct dp_netdev_flow *flow = offload->flow;
2378 odp_port_t in_port = flow->flow.in_port.odp_port;
2379 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2380 struct offload_info info;
2381 uint32_t mark;
2382 int ret;
2383
2384 if (flow->dead) {
2385 return -1;
2386 }
2387
2388 if (modification) {
2389 mark = flow->mark;
2390 ovs_assert(mark != INVALID_FLOW_MARK);
2391 } else {
2392 /*
2393 * If a mega flow has already been offloaded (from other PMD
2394 * instances), do not offload it again.
2395 */
2396 mark = megaflow_to_mark_find(&flow->mega_ufid);
2397 if (mark != INVALID_FLOW_MARK) {
2398 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2399 if (flow->mark != INVALID_FLOW_MARK) {
2400 ovs_assert(flow->mark == mark);
2401 } else {
2402 mark_to_flow_associate(mark, flow);
2403 }
2404 return 0;
2405 }
2406
2407 mark = flow_mark_alloc();
2408 if (mark == INVALID_FLOW_MARK) {
2409 VLOG_ERR("Failed to allocate flow mark!\n");
2410 }
2411 }
2412 info.flow_mark = mark;
2413
2414 ovs_mutex_lock(&pmd->dp->port_mutex);
2415 port = dp_netdev_lookup_port(pmd->dp, in_port);
0da667e3 2416 if (!port || netdev_vport_is_vport_class(port->netdev->netdev_class)) {
02bb2824 2417 ovs_mutex_unlock(&pmd->dp->port_mutex);
0a5cba65 2418 goto err_free;
02bb2824
YL
2419 }
2420 ret = netdev_flow_put(port->netdev, &offload->match,
2421 CONST_CAST(struct nlattr *, offload->actions),
2422 offload->actions_len, &flow->mega_ufid, &info,
2423 NULL);
2424 ovs_mutex_unlock(&pmd->dp->port_mutex);
2425
2426 if (ret) {
0a5cba65 2427 goto err_free;
02bb2824
YL
2428 }
2429
2430 if (!modification) {
2431 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2432 mark_to_flow_associate(mark, flow);
2433 }
02bb2824 2434 return 0;
0a5cba65
IM
2435
2436err_free:
2437 if (!modification) {
2438 flow_mark_free(mark);
2439 } else {
2440 mark_to_flow_disassociate(pmd, flow);
2441 }
2442 return -1;
02bb2824
YL
2443}
2444
2445static void *
2446dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2447{
2448 struct dp_flow_offload_item *offload;
2449 struct ovs_list *list;
2450 const char *op;
2451 int ret;
2452
2453 for (;;) {
2454 ovs_mutex_lock(&dp_flow_offload.mutex);
2455 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2456 ovsrcu_quiesce_start();
2457 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2458 &dp_flow_offload.mutex);
6c95dbf9 2459 ovsrcu_quiesce_end();
02bb2824
YL
2460 }
2461 list = ovs_list_pop_front(&dp_flow_offload.list);
2462 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2463 ovs_mutex_unlock(&dp_flow_offload.mutex);
2464
2465 switch (offload->op) {
2466 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2467 op = "add";
2468 ret = dp_netdev_flow_offload_put(offload);
2469 break;
2470 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2471 op = "modify";
2472 ret = dp_netdev_flow_offload_put(offload);
2473 break;
2474 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2475 op = "delete";
2476 ret = dp_netdev_flow_offload_del(offload);
2477 break;
2478 default:
2479 OVS_NOT_REACHED();
2480 }
2481
2482 VLOG_DBG("%s to %s netdev flow\n",
2483 ret == 0 ? "succeed" : "failed", op);
2484 dp_netdev_free_flow_offload(offload);
2485 }
2486
2487 return NULL;
2488}
2489
2490static void
2491queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2492 struct dp_netdev_flow *flow)
2493{
2494 struct dp_flow_offload_item *offload;
2495
2496 if (ovsthread_once_start(&offload_thread_once)) {
2497 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2498 ovs_thread_create("dp_netdev_flow_offload",
2499 dp_netdev_flow_offload_main, NULL);
2500 ovsthread_once_done(&offload_thread_once);
2501 }
2502
2503 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2504 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2505 dp_netdev_append_flow_offload(offload);
2506}
2507
2508static void
2509queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2510 struct dp_netdev_flow *flow, struct match *match,
2511 const struct nlattr *actions, size_t actions_len)
2512{
2513 struct dp_flow_offload_item *offload;
2514 int op;
2515
2516 if (!netdev_is_flow_api_enabled()) {
2517 return;
2518 }
2519
2520 if (ovsthread_once_start(&offload_thread_once)) {
2521 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2522 ovs_thread_create("dp_netdev_flow_offload",
2523 dp_netdev_flow_offload_main, NULL);
2524 ovsthread_once_done(&offload_thread_once);
2525 }
2526
2527 if (flow->mark != INVALID_FLOW_MARK) {
2528 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2529 } else {
2530 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2531 }
2532 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2533 offload->match = *match;
2534 offload->actions = xmalloc(actions_len);
2535 memcpy(offload->actions, actions, actions_len);
2536 offload->actions_len = actions_len;
2537
2538 dp_netdev_append_flow_offload(offload);
2539}
2540
72865317 2541static void
1c1e46ed
AW
2542dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2543 struct dp_netdev_flow *flow)
2544 OVS_REQUIRES(pmd->flow_mutex)
72865317 2545{
9f361d6b 2546 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
3453b4d6
JS
2547 struct dpcls *cls;
2548 odp_port_t in_port = flow->flow.in_port.odp_port;
2c0ea78f 2549
3453b4d6
JS
2550 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2551 ovs_assert(cls != NULL);
2552 dpcls_remove(cls, &flow->cr);
1c1e46ed 2553 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
241bad15 2554 if (flow->mark != INVALID_FLOW_MARK) {
02bb2824 2555 queue_netdev_flow_del(pmd, flow);
241bad15 2556 }
9bbf1c3d 2557 flow->dead = true;
ed79f89a
DDP
2558
2559 dp_netdev_flow_unref(flow);
72865317
BP
2560}
2561
2562static void
1c1e46ed 2563dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 2564{
78c8df12 2565 struct dp_netdev_flow *netdev_flow;
72865317 2566
1c1e46ed
AW
2567 ovs_mutex_lock(&pmd->flow_mutex);
2568 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2569 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2570 }
1c1e46ed 2571 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
2572}
2573
2574static int
2575dpif_netdev_flow_flush(struct dpif *dpif)
2576{
2577 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
2578 struct dp_netdev_pmd_thread *pmd;
2579
2580 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2581 dp_netdev_pmd_flow_flush(pmd);
2582 }
5279f8fd 2583
72865317
BP
2584 return 0;
2585}
2586
b0ec0f27 2587struct dp_netdev_port_state {
e9985d6a 2588 struct hmap_position position;
4c738a8d 2589 char *name;
b0ec0f27
BP
2590};
2591
2592static int
2593dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2594{
2595 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2596 return 0;
2597}
2598
72865317 2599static int
b0ec0f27 2600dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 2601 struct dpif_port *dpif_port)
72865317 2602{
b0ec0f27 2603 struct dp_netdev_port_state *state = state_;
72865317 2604 struct dp_netdev *dp = get_dp_netdev(dpif);
e9985d6a 2605 struct hmap_node *node;
ff073a71 2606 int retval;
72865317 2607
e9985d6a
DDP
2608 ovs_mutex_lock(&dp->port_mutex);
2609 node = hmap_at_position(&dp->ports, &state->position);
ff073a71
BP
2610 if (node) {
2611 struct dp_netdev_port *port;
5279f8fd 2612
ff073a71
BP
2613 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2614
2615 free(state->name);
2616 state->name = xstrdup(netdev_get_name(port->netdev));
2617 dpif_port->name = state->name;
2618 dpif_port->type = port->type;
35303d71 2619 dpif_port->port_no = port->port_no;
ff073a71
BP
2620
2621 retval = 0;
2622 } else {
2623 retval = EOF;
72865317 2624 }
e9985d6a 2625 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2626
ff073a71 2627 return retval;
b0ec0f27
BP
2628}
2629
2630static int
4c738a8d 2631dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 2632{
4c738a8d
BP
2633 struct dp_netdev_port_state *state = state_;
2634 free(state->name);
b0ec0f27
BP
2635 free(state);
2636 return 0;
72865317
BP
2637}
2638
2639static int
67a4917b 2640dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
2641{
2642 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 2643 uint64_t new_port_seq;
5279f8fd
BP
2644 int error;
2645
d33ed218
BP
2646 new_port_seq = seq_read(dpif->dp->port_seq);
2647 if (dpif->last_port_seq != new_port_seq) {
2648 dpif->last_port_seq = new_port_seq;
5279f8fd 2649 error = ENOBUFS;
72865317 2650 } else {
5279f8fd 2651 error = EAGAIN;
72865317 2652 }
5279f8fd
BP
2653
2654 return error;
72865317
BP
2655}
2656
2657static void
2658dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2659{
2660 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 2661
d33ed218 2662 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
2663}
2664
2665static struct dp_netdev_flow *
0de8783a 2666dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
2667{
2668 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
2669}
2670
9bbf1c3d
DDP
2671static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2672{
2673 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2674}
2675
79df317f
DDP
2676/* netdev_flow_key utilities.
2677 *
2678 * netdev_flow_key is basically a miniflow. We use these functions
2679 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2680 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2681 *
2682 * - Since we are dealing exclusively with miniflows created by
2683 * miniflow_extract(), if the map is different the miniflow is different.
2684 * Therefore we can be faster by comparing the map and the miniflow in a
2685 * single memcmp().
5fcff47b 2686 * - These functions can be inlined by the compiler. */
79df317f 2687
361d808d 2688/* Given the number of bits set in miniflow's maps, returns the size of the
caeb4906 2689 * 'netdev_flow_key.mf' */
361d808d
JR
2690static inline size_t
2691netdev_flow_key_size(size_t flow_u64s)
79df317f 2692{
361d808d 2693 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
79df317f
DDP
2694}
2695
79df317f
DDP
2696static inline bool
2697netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
2698 const struct netdev_flow_key *b)
2699{
caeb4906
JR
2700 /* 'b->len' may be not set yet. */
2701 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
2702}
2703
2704/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
d79a39fe 2705 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
0de8783a
JR
2706 * generated by miniflow_extract. */
2707static inline bool
2708netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2709 const struct miniflow *mf)
79df317f 2710{
caeb4906 2711 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
2712}
2713
2714static inline void
2715netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
2716 const struct netdev_flow_key *src)
2717{
caeb4906
JR
2718 memcpy(dst, src,
2719 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
2720}
2721
0de8783a
JR
2722/* Initialize a netdev_flow_key 'mask' from 'match'. */
2723static inline void
2724netdev_flow_mask_init(struct netdev_flow_key *mask,
2725 const struct match *match)
2726{
09b0fa9c 2727 uint64_t *dst = miniflow_values(&mask->mf);
5fcff47b 2728 struct flowmap fmap;
0de8783a 2729 uint32_t hash = 0;
5fcff47b 2730 size_t idx;
0de8783a
JR
2731
2732 /* Only check masks that make sense for the flow. */
5fcff47b
JR
2733 flow_wc_map(&match->flow, &fmap);
2734 flowmap_init(&mask->mf.map);
0de8783a 2735
5fcff47b
JR
2736 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2737 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
0de8783a 2738
5fcff47b
JR
2739 if (mask_u64) {
2740 flowmap_set(&mask->mf.map, idx, 1);
2741 *dst++ = mask_u64;
2742 hash = hash_add64(hash, mask_u64);
0de8783a 2743 }
0de8783a
JR
2744 }
2745
5fcff47b 2746 map_t map;
0de8783a 2747
5fcff47b
JR
2748 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2749 hash = hash_add64(hash, map);
2750 }
0de8783a 2751
5fcff47b 2752 size_t n = dst - miniflow_get_values(&mask->mf);
0de8783a 2753
d70e8c28 2754 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
2755 mask->len = netdev_flow_key_size(n);
2756}
2757
361d808d 2758/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
0de8783a
JR
2759static inline void
2760netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2761 const struct flow *flow,
2762 const struct netdev_flow_key *mask)
79df317f 2763{
09b0fa9c
JR
2764 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2765 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
0de8783a 2766 uint32_t hash = 0;
d70e8c28 2767 uint64_t value;
0de8783a
JR
2768
2769 dst->len = mask->len;
361d808d 2770 dst->mf = mask->mf; /* Copy maps. */
0de8783a 2771
5fcff47b 2772 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
d70e8c28
JR
2773 *dst_u64 = value & *mask_u64++;
2774 hash = hash_add64(hash, *dst_u64++);
0de8783a 2775 }
09b0fa9c
JR
2776 dst->hash = hash_finish(hash,
2777 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
0de8783a
JR
2778}
2779
9bbf1c3d
DDP
2780static inline bool
2781emc_entry_alive(struct emc_entry *ce)
2782{
2783 return ce->flow && !ce->flow->dead;
2784}
2785
2786static void
2787emc_clear_entry(struct emc_entry *ce)
2788{
2789 if (ce->flow) {
2790 dp_netdev_flow_unref(ce->flow);
2791 ce->flow = NULL;
2792 }
2793}
2794
2795static inline void
2796emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 2797 const struct netdev_flow_key *key)
9bbf1c3d
DDP
2798{
2799 if (ce->flow != flow) {
2800 if (ce->flow) {
2801 dp_netdev_flow_unref(ce->flow);
2802 }
2803
2804 if (dp_netdev_flow_ref(flow)) {
2805 ce->flow = flow;
2806 } else {
2807 ce->flow = NULL;
2808 }
2809 }
0de8783a
JR
2810 if (key) {
2811 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
2812 }
2813}
2814
2815static inline void
0de8783a 2816emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
2817 struct dp_netdev_flow *flow)
2818{
2819 struct emc_entry *to_be_replaced = NULL;
2820 struct emc_entry *current_entry;
2821
0de8783a
JR
2822 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2823 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 2824 /* We found the entry with the 'mf' miniflow */
0de8783a 2825 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
2826 return;
2827 }
2828
2829 /* Replacement policy: put the flow in an empty (not alive) entry, or
2830 * in the first entry where it can be */
2831 if (!to_be_replaced
2832 || (emc_entry_alive(to_be_replaced)
2833 && !emc_entry_alive(current_entry))
0de8783a 2834 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
2835 to_be_replaced = current_entry;
2836 }
2837 }
2838 /* We didn't find the miniflow in the cache.
2839 * The 'to_be_replaced' entry is where the new flow will be stored */
2840
0de8783a 2841 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
2842}
2843
4c30b246
CL
2844static inline void
2845emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2846 const struct netdev_flow_key *key,
2847 struct dp_netdev_flow *flow)
2848{
2849 /* Insert an entry into the EMC based on probability value 'min'. By
2850 * default the value is UINT32_MAX / 100 which yields an insertion
2851 * probability of 1/100 ie. 1% */
2852
2fbadeb6 2853 uint32_t min = pmd->ctx.emc_insert_min;
4c30b246 2854
656238ee 2855 if (min && random_uint32() <= min) {
60d8ccae 2856 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
4c30b246
CL
2857 }
2858}
2859
9bbf1c3d 2860static inline struct dp_netdev_flow *
0de8783a 2861emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
2862{
2863 struct emc_entry *current_entry;
2864
0de8783a
JR
2865 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2866 if (current_entry->key.hash == key->hash
2867 && emc_entry_alive(current_entry)
2868 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 2869
0de8783a 2870 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
2871 return current_entry->flow;
2872 }
2873 }
2874
2875 return NULL;
2876}
2877
60d8ccae
YW
2878static inline const struct cmap_node *
2879smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2880{
2881 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2882 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2883 uint16_t sig = hash >> 16;
2884 uint16_t index = UINT16_MAX;
2885
2886 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2887 if (bucket->sig[i] == sig) {
2888 index = bucket->flow_idx[i];
2889 break;
2890 }
2891 }
2892 if (index != UINT16_MAX) {
2893 return cmap_find_by_index(&pmd->flow_table, index);
2894 }
2895 return NULL;
2896}
2897
2898static void
2899smc_clear_entry(struct smc_bucket *b, int idx)
2900{
2901 b->flow_idx[idx] = UINT16_MAX;
2902}
2903
2904/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2905 * turned off, 2) the flow_table index is larger than uint16_t can handle.
2906 * If there is already an SMC entry having same signature, the index will be
2907 * updated. If there is no existing entry, but an empty entry is available,
2908 * the empty entry will be taken. If no empty entry or existing same signature,
2909 * a random entry from the hashed bucket will be picked. */
2910static inline void
2911smc_insert(struct dp_netdev_pmd_thread *pmd,
2912 const struct netdev_flow_key *key,
2913 uint32_t hash)
2914{
2915 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2916 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2917 uint16_t index;
2918 uint32_t cmap_index;
2919 bool smc_enable_db;
2920 int i;
2921
2922 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2923 if (!smc_enable_db) {
2924 return;
2925 }
2926
2927 cmap_index = cmap_find_index(&pmd->flow_table, hash);
2928 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2929
2930 /* If the index is larger than SMC can handle (uint16_t), we don't
2931 * insert */
2932 if (index == UINT16_MAX) {
2933 return;
2934 }
2935
2936 /* If an entry with same signature already exists, update the index */
2937 uint16_t sig = key->hash >> 16;
2938 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2939 if (bucket->sig[i] == sig) {
2940 bucket->flow_idx[i] = index;
2941 return;
2942 }
2943 }
2944 /* If there is an empty entry, occupy it. */
2945 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2946 if (bucket->flow_idx[i] == UINT16_MAX) {
2947 bucket->sig[i] = sig;
2948 bucket->flow_idx[i] = index;
2949 return;
2950 }
2951 }
2952 /* Otherwise, pick a random entry. */
2953 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2954 bucket->sig[i] = sig;
2955 bucket->flow_idx[i] = index;
2956}
2957
72865317 2958static struct dp_netdev_flow *
3453b4d6
JS
2959dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2960 const struct netdev_flow_key *key,
2961 int *lookup_num_p)
2c0ea78f 2962{
3453b4d6 2963 struct dpcls *cls;
0de8783a 2964 struct dpcls_rule *rule;
f825fdd4
BP
2965 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2966 in_port.odp_port));
3453b4d6 2967 struct dp_netdev_flow *netdev_flow = NULL;
2c0ea78f 2968
3453b4d6
JS
2969 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2970 if (OVS_LIKELY(cls)) {
60d8ccae 2971 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3453b4d6
JS
2972 netdev_flow = dp_netdev_flow_cast(rule);
2973 }
8a4e3a85 2974 return netdev_flow;
2c0ea78f
GS
2975}
2976
2977static struct dp_netdev_flow *
1c1e46ed
AW
2978dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2979 const ovs_u128 *ufidp, const struct nlattr *key,
2980 size_t key_len)
72865317 2981{
1763b4b8 2982 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
2983 struct flow flow;
2984 ovs_u128 ufid;
2985
2986 /* If a UFID is not provided, determine one based on the key. */
2987 if (!ufidp && key && key_len
f0fb825a 2988 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
1c1e46ed 2989 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
70e5ed6f
JS
2990 ufidp = &ufid;
2991 }
72865317 2992
70e5ed6f
JS
2993 if (ufidp) {
2994 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 2995 &pmd->flow_table) {
2ff8484b 2996 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
70e5ed6f
JS
2997 return netdev_flow;
2998 }
72865317
BP
2999 }
3000 }
8a4e3a85 3001
72865317
BP
3002 return NULL;
3003}
3004
3005static void
eb94da30 3006get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1763b4b8 3007 struct dpif_flow_stats *stats)
feebdea2 3008{
eb94da30
DDP
3009 struct dp_netdev_flow *netdev_flow;
3010 unsigned long long n;
3011 long long used;
3012 uint16_t flags;
3013
3014 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3015
3016 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3017 stats->n_packets = n;
3018 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3019 stats->n_bytes = n;
3020 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3021 stats->used = used;
3022 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3023 stats->tcp_flags = flags;
72865317
BP
3024}
3025
7af12bd7
JS
3026/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3027 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3028 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3029 * protect them. */
6fe09f8c 3030static void
70e5ed6f 3031dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
7af12bd7 3032 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 3033 struct dpif_flow *flow, bool terse)
6fe09f8c 3034{
64bb477f
JS
3035 if (terse) {
3036 memset(flow, 0, sizeof *flow);
3037 } else {
3038 struct flow_wildcards wc;
3039 struct dp_netdev_actions *actions;
3040 size_t offset;
5262eea1
JG
3041 struct odp_flow_key_parms odp_parms = {
3042 .flow = &netdev_flow->flow,
3043 .mask = &wc.masks,
2494ccd7 3044 .support = dp_netdev_support,
5262eea1 3045 };
64bb477f
JS
3046
3047 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
f4b835bb
JR
3048 /* in_port is exact matched, but we have left it out from the mask for
3049 * optimnization reasons. Add in_port back to the mask. */
3050 wc.masks.in_port.odp_port = ODPP_NONE;
64bb477f
JS
3051
3052 /* Key */
6fd6ed71 3053 offset = key_buf->size;
64bb477f 3054 flow->key = ofpbuf_tail(key_buf);
5262eea1 3055 odp_flow_key_from_flow(&odp_parms, key_buf);
6fd6ed71 3056 flow->key_len = key_buf->size - offset;
64bb477f
JS
3057
3058 /* Mask */
6fd6ed71 3059 offset = mask_buf->size;
64bb477f 3060 flow->mask = ofpbuf_tail(mask_buf);
ec1f6f32 3061 odp_parms.key_buf = key_buf;
5262eea1 3062 odp_flow_key_from_mask(&odp_parms, mask_buf);
6fd6ed71 3063 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
3064
3065 /* Actions */
3066 actions = dp_netdev_flow_get_actions(netdev_flow);
3067 flow->actions = actions->actions;
3068 flow->actions_len = actions->size;
3069 }
6fe09f8c 3070
70e5ed6f
JS
3071 flow->ufid = netdev_flow->ufid;
3072 flow->ufid_present = true;
1c1e46ed 3073 flow->pmd_id = netdev_flow->pmd_id;
6fe09f8c 3074 get_dpif_flow_stats(netdev_flow, &flow->stats);
0d6b401c
GT
3075
3076 flow->attrs.offloaded = false;
3077 flow->attrs.dp_layer = "ovs";
6fe09f8c
JS
3078}
3079
36956a7d 3080static int
8c301900
JR
3081dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3082 const struct nlattr *mask_key,
3083 uint32_t mask_key_len, const struct flow *flow,
f0fb825a 3084 struct flow_wildcards *wc, bool probe)
8c301900 3085{
ca8d3442
DDP
3086 enum odp_key_fitness fitness;
3087
d40533fc 3088 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
ca8d3442 3089 if (fitness) {
f0fb825a
EG
3090 if (!probe) {
3091 /* This should not happen: it indicates that
3092 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3093 * disagree on the acceptable form of a mask. Log the problem
3094 * as an error, with enough details to enable debugging. */
3095 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3096
3097 if (!VLOG_DROP_ERR(&rl)) {
3098 struct ds s;
3099
3100 ds_init(&s);
3101 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3102 true);
3103 VLOG_ERR("internal error parsing flow mask %s (%s)",
3104 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3105 ds_destroy(&s);
3106 }
8c301900 3107 }
ca8d3442
DDP
3108
3109 return EINVAL;
8c301900
JR
3110 }
3111
3112 return 0;
3113}
3114
3115static int
3116dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
f0fb825a 3117 struct flow *flow, bool probe)
36956a7d 3118{
d40533fc 3119 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
f0fb825a
EG
3120 if (!probe) {
3121 /* This should not happen: it indicates that
3122 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3123 * the acceptable form of a flow. Log the problem as an error,
3124 * with enough details to enable debugging. */
3125 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3126
3127 if (!VLOG_DROP_ERR(&rl)) {
3128 struct ds s;
3129
3130 ds_init(&s);
3131 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3132 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3133 ds_destroy(&s);
3134 }
36956a7d
BP
3135 }
3136
3137 return EINVAL;
3138 }
3139
5cf3edb3 3140 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
07659514
JS
3141 return EINVAL;
3142 }
3143
36956a7d
BP
3144 return 0;
3145}
3146
72865317 3147static int
6fe09f8c 3148dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
3149{
3150 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 3151 struct dp_netdev_flow *netdev_flow;
1c1e46ed 3152 struct dp_netdev_pmd_thread *pmd;
c673049c
IM
3153 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3154 struct hmapx_node *node;
3155 int error = EINVAL;
3156
3157 if (get->pmd_id == PMD_ID_NULL) {
3158 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3159 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3160 dp_netdev_pmd_unref(pmd);
3161 }
3162 }
3163 } else {
3164 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3165 if (!pmd) {
3166 goto out;
3167 }
3168 hmapx_add(&to_find, pmd);
1c1e46ed
AW
3169 }
3170
c673049c
IM
3171 if (!hmapx_count(&to_find)) {
3172 goto out;
72865317 3173 }
1c1e46ed 3174
c673049c
IM
3175 HMAPX_FOR_EACH (node, &to_find) {
3176 pmd = (struct dp_netdev_pmd_thread *) node->data;
3177 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3178 get->key_len);
3179 if (netdev_flow) {
3180 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
3181 get->flow, false);
3182 error = 0;
3183 break;
3184 } else {
3185 error = ENOENT;
3186 }
3187 }
bc4a05c6 3188
c673049c
IM
3189 HMAPX_FOR_EACH (node, &to_find) {
3190 pmd = (struct dp_netdev_pmd_thread *) node->data;
3191 dp_netdev_pmd_unref(pmd);
3192 }
3193out:
3194 hmapx_destroy(&to_find);
5279f8fd 3195 return error;
72865317
BP
3196}
3197
241bad15
YL
3198static void
3199dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3200{
3201 struct flow masked_flow;
3202 size_t i;
3203
3204 for (i = 0; i < sizeof(struct flow); i++) {
3205 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3206 ((uint8_t *)&match->wc)[i];
3207 }
3208 dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
3209}
3210
0de8783a 3211static struct dp_netdev_flow *
1c1e46ed
AW
3212dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3213 struct match *match, const ovs_u128 *ufid,
ae2ceebd 3214 const struct nlattr *actions, size_t actions_len)
1c1e46ed 3215 OVS_REQUIRES(pmd->flow_mutex)
72865317 3216{
0de8783a
JR
3217 struct dp_netdev_flow *flow;
3218 struct netdev_flow_key mask;
3453b4d6 3219 struct dpcls *cls;
f4b835bb
JR
3220
3221 /* Make sure in_port is exact matched before we read it. */
3222 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3453b4d6 3223 odp_port_t in_port = match->flow.in_port.odp_port;
ed79f89a 3224
f4b835bb
JR
3225 /* As we select the dpcls based on the port number, each netdev flow
3226 * belonging to the same dpcls will have the same odp_port value.
3227 * For performance reasons we wildcard odp_port here in the mask. In the
3228 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3229 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3230 * will not be part of the subtable mask.
3231 * This will speed up the hash computation during dpcls_lookup() because
3232 * there is one less call to hash_add64() in this case. */
3233 match->wc.masks.in_port.odp_port = 0;
0de8783a 3234 netdev_flow_mask_init(&mask, match);
f4b835bb
JR
3235 match->wc.masks.in_port.odp_port = ODPP_NONE;
3236
0de8783a 3237 /* Make sure wc does not have metadata. */
5fcff47b
JR
3238 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3239 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
679ba04c 3240
0de8783a 3241 /* Do not allocate extra space. */
caeb4906 3242 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 3243 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 3244 flow->dead = false;
11e5cf1f 3245 flow->batch = NULL;
241bad15 3246 flow->mark = INVALID_FLOW_MARK;
bd5131ba 3247 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 3248 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 3249 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 3250 ovs_refcount_init(&flow->ref_cnt);
0de8783a 3251 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 3252
241bad15 3253 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
0de8783a 3254 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3453b4d6 3255
f4b835bb 3256 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3453b4d6
JS
3257 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3258 dpcls_insert(cls, &flow->cr, &mask);
72865317 3259
4c75aaab
EJ
3260 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3261 dp_netdev_flow_hash(&flow->ufid));
3262
02bb2824 3263 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
241bad15 3264
beb75a40 3265 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
623540e4 3266 struct ds ds = DS_EMPTY_INITIALIZER;
9044f2c1
JG
3267 struct ofpbuf key_buf, mask_buf;
3268 struct odp_flow_key_parms odp_parms = {
3269 .flow = &match->flow,
3270 .mask = &match->wc.masks,
3271 .support = dp_netdev_support,
3272 };
3273
3274 ofpbuf_init(&key_buf, 0);
3275 ofpbuf_init(&mask_buf, 0);
623540e4 3276
9044f2c1
JG
3277 odp_flow_key_from_flow(&odp_parms, &key_buf);
3278 odp_parms.key_buf = &key_buf;
3279 odp_flow_key_from_mask(&odp_parms, &mask_buf);
0de8783a 3280
623540e4 3281 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
3282 odp_format_ufid(ufid, &ds);
3283 ds_put_cstr(&ds, " ");
9044f2c1
JG
3284 odp_flow_format(key_buf.data, key_buf.size,
3285 mask_buf.data, mask_buf.size,
3286 NULL, &ds, false);
623540e4 3287 ds_put_cstr(&ds, ", actions:");
0722f341 3288 format_odp_actions(&ds, actions, actions_len, NULL);
623540e4 3289
beb75a40 3290 VLOG_DBG("%s", ds_cstr(&ds));
623540e4 3291
9044f2c1
JG
3292 ofpbuf_uninit(&key_buf);
3293 ofpbuf_uninit(&mask_buf);
beb75a40
JS
3294
3295 /* Add a printout of the actual match installed. */
3296 struct match m;
3297 ds_clear(&ds);
3298 ds_put_cstr(&ds, "flow match: ");
3299 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3300 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
b2f4b622 3301 memset(&m.tun_md, 0, sizeof m.tun_md);
beb75a40
JS
3302 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3303
3304 VLOG_DBG("%s", ds_cstr(&ds));
3305
623540e4
EJ
3306 ds_destroy(&ds);
3307 }
3308
0de8783a 3309 return flow;
72865317
BP
3310}
3311
72865317 3312static int
f5d317a1
DDP
3313flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3314 struct netdev_flow_key *key,
3315 struct match *match,
3316 ovs_u128 *ufid,
3317 const struct dpif_flow_put *put,
3318 struct dpif_flow_stats *stats)
72865317 3319{
1763b4b8 3320 struct dp_netdev_flow *netdev_flow;
f5d317a1 3321 int error = 0;
72865317 3322
f5d317a1
DDP
3323 if (stats) {
3324 memset(stats, 0, sizeof *stats);
70e5ed6f
JS
3325 }
3326
1c1e46ed 3327 ovs_mutex_lock(&pmd->flow_mutex);
f5d317a1 3328 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
1763b4b8 3329 if (!netdev_flow) {
89625d1e 3330 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 3331 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
f5d317a1 3332 dp_netdev_flow_add(pmd, match, ufid, put->actions,
70e5ed6f 3333 put->actions_len);
0de8783a 3334 error = 0;
72865317 3335 } else {
5279f8fd 3336 error = EFBIG;
72865317
BP
3337 }
3338 } else {
5279f8fd 3339 error = ENOENT;
72865317
BP
3340 }
3341 } else {
beb75a40 3342 if (put->flags & DPIF_FP_MODIFY) {
8a4e3a85
BP
3343 struct dp_netdev_actions *new_actions;
3344 struct dp_netdev_actions *old_actions;
3345
3346 new_actions = dp_netdev_actions_create(put->actions,
3347 put->actions_len);
3348
61e7deb1
BP
3349 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3350 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 3351
02bb2824
YL
3352 queue_netdev_flow_put(pmd, netdev_flow, match,
3353 put->actions, put->actions_len);
241bad15 3354
f5d317a1
DDP
3355 if (stats) {
3356 get_dpif_flow_stats(netdev_flow, stats);
a84cb64a
BP
3357 }
3358 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
3359 /* XXX: The userspace datapath uses thread local statistics
3360 * (for flows), which should be updated only by the owning
3361 * thread. Since we cannot write on stats memory here,
3362 * we choose not to support this flag. Please note:
3363 * - This feature is currently used only by dpctl commands with
3364 * option --clear.
3365 * - Should the need arise, this operation can be implemented
3366 * by keeping a base value (to be update here) for each
3367 * counter, and subtracting it before outputting the stats */
3368 error = EOPNOTSUPP;
72865317 3369 }
8a4e3a85 3370
61e7deb1 3371 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 3372 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 3373 error = EEXIST;
2c0ea78f
GS
3374 } else {
3375 /* Overlapping flow. */
3376 error = EINVAL;
72865317
BP
3377 }
3378 }
1c1e46ed 3379 ovs_mutex_unlock(&pmd->flow_mutex);
5279f8fd 3380 return error;
72865317
BP
3381}
3382
72865317 3383static int
f5d317a1 3384dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
3385{
3386 struct dp_netdev *dp = get_dp_netdev(dpif);
beb75a40 3387 struct netdev_flow_key key, mask;
1c1e46ed 3388 struct dp_netdev_pmd_thread *pmd;
f5d317a1
DDP
3389 struct match match;
3390 ovs_u128 ufid;
3391 int error;
f0fb825a 3392 bool probe = put->flags & DPIF_FP_PROBE;
72865317 3393
f5d317a1
DDP
3394 if (put->stats) {
3395 memset(put->stats, 0, sizeof *put->stats);
3396 }
f0fb825a
EG
3397 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3398 probe);
f5d317a1
DDP
3399 if (error) {
3400 return error;
3401 }
3402 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3403 put->mask, put->mask_len,
f0fb825a 3404 &match.flow, &match.wc, probe);
f5d317a1
DDP
3405 if (error) {
3406 return error;
1c1e46ed
AW
3407 }
3408
f5d317a1
DDP
3409 if (put->ufid) {
3410 ufid = *put->ufid;
3411 } else {
3412 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3413 }
3414
35fe9efb
IM
3415 /* The Netlink encoding of datapath flow keys cannot express
3416 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3417 * tag is interpreted as exact match on the fact that there is no
3418 * VLAN. Unless we refactor a lot of code that translates between
3419 * Netlink and struct flow representations, we have to do the same
3420 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3421 if (!match.wc.masks.vlans[0].tci) {
3422 match.wc.masks.vlans[0].tci = htons(0xffff);
3423 }
3424
f5d317a1 3425 /* Must produce a netdev_flow_key for lookup.
beb75a40
JS
3426 * Use the same method as employed to create the key when adding
3427 * the flow to the dplcs to make sure they match. */
3428 netdev_flow_mask_init(&mask, &match);
3429 netdev_flow_key_init_masked(&key, &match.flow, &mask);
f5d317a1
DDP
3430
3431 if (put->pmd_id == PMD_ID_NULL) {
3432 if (cmap_count(&dp->poll_threads) == 0) {
3433 return EINVAL;
3434 }
3435 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3436 struct dpif_flow_stats pmd_stats;
3437 int pmd_error;
3438
3439 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3440 &pmd_stats);
3441 if (pmd_error) {
3442 error = pmd_error;
3443 } else if (put->stats) {
3444 put->stats->n_packets += pmd_stats.n_packets;
3445 put->stats->n_bytes += pmd_stats.n_bytes;
3446 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3447 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3448 }
3449 }
3450 } else {
3451 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3452 if (!pmd) {
3453 return EINVAL;
3454 }
3455 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3456 dp_netdev_pmd_unref(pmd);
3457 }
3458
3459 return error;
3460}
3461
3462static int
3463flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3464 struct dpif_flow_stats *stats,
3465 const struct dpif_flow_del *del)
3466{
3467 struct dp_netdev_flow *netdev_flow;
3468 int error = 0;
3469
1c1e46ed
AW
3470 ovs_mutex_lock(&pmd->flow_mutex);
3471 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3472 del->key_len);
1763b4b8 3473 if (netdev_flow) {
f5d317a1
DDP
3474 if (stats) {
3475 get_dpif_flow_stats(netdev_flow, stats);
feebdea2 3476 }
1c1e46ed 3477 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 3478 } else {
5279f8fd 3479 error = ENOENT;
72865317 3480 }
1c1e46ed 3481 ovs_mutex_unlock(&pmd->flow_mutex);
f5d317a1
DDP
3482
3483 return error;
3484}
3485
3486static int
3487dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3488{
3489 struct dp_netdev *dp = get_dp_netdev(dpif);
3490 struct dp_netdev_pmd_thread *pmd;
3491 int error = 0;
3492
3493 if (del->stats) {
3494 memset(del->stats, 0, sizeof *del->stats);
3495 }
3496
3497 if (del->pmd_id == PMD_ID_NULL) {
3498 if (cmap_count(&dp->poll_threads) == 0) {
3499 return EINVAL;
3500 }
3501 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3502 struct dpif_flow_stats pmd_stats;
3503 int pmd_error;
3504
3505 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3506 if (pmd_error) {
3507 error = pmd_error;
3508 } else if (del->stats) {
3509 del->stats->n_packets += pmd_stats.n_packets;
3510 del->stats->n_bytes += pmd_stats.n_bytes;
3511 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3512 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3513 }
3514 }
3515 } else {
3516 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3517 if (!pmd) {
3518 return EINVAL;
3519 }
3520 error = flow_del_on_pmd(pmd, del->stats, del);
3521 dp_netdev_pmd_unref(pmd);
3522 }
3523
5279f8fd
BP
3524
3525 return error;
72865317
BP
3526}
3527
ac64794a
BP
3528struct dpif_netdev_flow_dump {
3529 struct dpif_flow_dump up;
1c1e46ed
AW
3530 struct cmap_position poll_thread_pos;
3531 struct cmap_position flow_pos;
3532 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
3533 int status;
3534 struct ovs_mutex mutex;
e723fd32
JS
3535};
3536
ac64794a
BP
3537static struct dpif_netdev_flow_dump *
3538dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 3539{
ac64794a 3540 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
3541}
3542
ac64794a 3543static struct dpif_flow_dump *
7e8b7199 3544dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
a692410a 3545 struct dpif_flow_dump_types *types OVS_UNUSED)
e723fd32 3546{
ac64794a 3547 struct dpif_netdev_flow_dump *dump;
e723fd32 3548
1c1e46ed 3549 dump = xzalloc(sizeof *dump);
ac64794a 3550 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 3551 dump->up.terse = terse;
ac64794a
BP
3552 ovs_mutex_init(&dump->mutex);
3553
3554 return &dump->up;
e723fd32
JS
3555}
3556
3557static int
ac64794a 3558dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 3559{
ac64794a 3560 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 3561
ac64794a
BP
3562 ovs_mutex_destroy(&dump->mutex);
3563 free(dump);
704a1e09
BP
3564 return 0;
3565}
3566
ac64794a
BP
3567struct dpif_netdev_flow_dump_thread {
3568 struct dpif_flow_dump_thread up;
3569 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
3570 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3571 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
3572};
3573
3574static struct dpif_netdev_flow_dump_thread *
3575dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3576{
3577 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3578}
3579
3580static struct dpif_flow_dump_thread *
3581dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3582{
3583 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3584 struct dpif_netdev_flow_dump_thread *thread;
3585
3586 thread = xmalloc(sizeof *thread);
3587 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3588 thread->dump = dump;
3589 return &thread->up;
3590}
3591
3592static void
3593dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3594{
3595 struct dpif_netdev_flow_dump_thread *thread
3596 = dpif_netdev_flow_dump_thread_cast(thread_);
3597
3598 free(thread);
3599}
3600
704a1e09 3601static int
ac64794a 3602dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 3603 struct dpif_flow *flows, int max_flows)
ac64794a
BP
3604{
3605 struct dpif_netdev_flow_dump_thread *thread
3606 = dpif_netdev_flow_dump_thread_cast(thread_);
3607 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 3608 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
8bb113da
RW
3609 int n_flows = 0;
3610 int i;
14608a15 3611
ac64794a 3612 ovs_mutex_lock(&dump->mutex);
8bb113da 3613 if (!dump->status) {
1c1e46ed
AW
3614 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3615 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3616 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3617 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3618
3619 /* First call to dump_next(), extracts the first pmd thread.
3620 * If there is no pmd thread, returns immediately. */
3621 if (!pmd) {
3622 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3623 if (!pmd) {
3624 ovs_mutex_unlock(&dump->mutex);
3625 return n_flows;
8bb113da 3626
8bb113da 3627 }
d2ad7ef1 3628 }
1c1e46ed
AW
3629
3630 do {
3631 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3632 struct cmap_node *node;
3633
3634 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3635 if (!node) {
3636 break;
3637 }
3638 netdev_flows[n_flows] = CONTAINER_OF(node,
3639 struct dp_netdev_flow,
3640 node);
3641 }
3642 /* When finishing dumping the current pmd thread, moves to
3643 * the next. */
3644 if (n_flows < flow_limit) {
3645 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3646 dp_netdev_pmd_unref(pmd);
3647 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3648 if (!pmd) {
3649 dump->status = EOF;
3650 break;
3651 }
3652 }
3653 /* Keeps the reference to next caller. */
3654 dump->cur_pmd = pmd;
3655
3656 /* If the current dump is empty, do not exit the loop, since the
3657 * remaining pmds could have flows to be dumped. Just dumps again
3658 * on the new 'pmd'. */
3659 } while (!n_flows);
8a4e3a85 3660 }
ac64794a 3661 ovs_mutex_unlock(&dump->mutex);
ac64794a 3662
8bb113da
RW
3663 for (i = 0; i < n_flows; i++) {
3664 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3665 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3666 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3667 struct dpif_flow *f = &flows[i];
7af12bd7 3668 struct ofpbuf key, mask;
8bb113da 3669
7af12bd7
JS
3670 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3671 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
64bb477f
JS
3672 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
3673 dump->up.terse);
8bb113da 3674 }
feebdea2 3675
8bb113da 3676 return n_flows;
72865317
BP
3677}
3678
3679static int
758c456d 3680dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 3681 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
3682{
3683 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 3684 struct dp_netdev_pmd_thread *pmd;
1895cc8d 3685 struct dp_packet_batch pp;
72865317 3686
cf62fa4c
PS
3687 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3688 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
3689 return EINVAL;
3690 }
3691
65f13b50
AW
3692 /* Tries finding the 'pmd'. If NULL is returned, that means
3693 * the current thread is a non-pmd thread and should use
b19befae 3694 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
3695 pmd = ovsthread_getspecific(dp->per_pmd_key);
3696 if (!pmd) {
b19befae 3697 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
546e57d4
DDP
3698 if (!pmd) {
3699 return EBUSY;
3700 }
65f13b50
AW
3701 }
3702
05267613
AZ
3703 if (execute->probe) {
3704 /* If this is part of a probe, Drop the packet, since executing
3705 * the action may actually cause spurious packets be sent into
3706 * the network. */
d1ce9c20
YS
3707 if (pmd->core_id == NON_PMD_CORE_ID) {
3708 dp_netdev_pmd_unref(pmd);
3709 }
05267613
AZ
3710 return 0;
3711 }
3712
65f13b50
AW
3713 /* If the current thread is non-pmd thread, acquires
3714 * the 'non_pmd_mutex'. */
3715 if (pmd->core_id == NON_PMD_CORE_ID) {
3716 ovs_mutex_lock(&dp->non_pmd_mutex);
3717 }
1c1e46ed 3718
2fbadeb6
IM
3719 /* Update current time in PMD context. We don't care about EMC insertion
3720 * probability, because we are on a slow path. */
b010be17
IM
3721 pmd_thread_ctx_time_update(pmd);
3722
36d8de17
DDP
3723 /* The action processing expects the RSS hash to be valid, because
3724 * it's always initialized at the beginning of datapath processing.
3725 * In this case, though, 'execute->packet' may not have gone through
3726 * the datapath at all, it may have been generated by the upper layer
3727 * (OpenFlow packet-out, BFD frame, ...). */
3728 if (!dp_packet_rss_valid(execute->packet)) {
3729 dp_packet_set_rss_hash(execute->packet,
3730 flow_hash_5tuple(execute->flow, 0));
3731 }
3732
72c84bc2 3733 dp_packet_batch_init_packet(&pp, execute->packet);
9f17f104 3734 pp.do_not_steal = true;
66e4ad8a 3735 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
b010be17 3736 execute->actions, execute->actions_len);
c71ea3c4 3737 dp_netdev_pmd_flush_output_packets(pmd, true);
36d8de17 3738
65f13b50
AW
3739 if (pmd->core_id == NON_PMD_CORE_ID) {
3740 ovs_mutex_unlock(&dp->non_pmd_mutex);
e9985d6a 3741 dp_netdev_pmd_unref(pmd);
65f13b50 3742 }
8a4e3a85 3743
758c456d 3744 return 0;
72865317
BP
3745}
3746
1a0c894a 3747static void
57924fc9
SB
3748dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3749 enum dpif_offload_type offload_type OVS_UNUSED)
1a0c894a
BP
3750{
3751 size_t i;
3752
3753 for (i = 0; i < n_ops; i++) {
3754 struct dpif_op *op = ops[i];
3755
3756 switch (op->type) {
3757 case DPIF_OP_FLOW_PUT:
fa37affa 3758 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
1a0c894a
BP
3759 break;
3760
3761 case DPIF_OP_FLOW_DEL:
fa37affa 3762 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
1a0c894a
BP
3763 break;
3764
3765 case DPIF_OP_EXECUTE:
fa37affa 3766 op->error = dpif_netdev_execute(dpif, &op->execute);
1a0c894a 3767 break;
6fe09f8c
JS
3768
3769 case DPIF_OP_FLOW_GET:
fa37affa 3770 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
6fe09f8c 3771 break;
1a0c894a
BP
3772 }
3773 }
3774}
3775
5bf84282
NK
3776/* Enable or Disable PMD auto load balancing. */
3777static void
3778set_pmd_auto_lb(struct dp_netdev *dp)
3779{
3780 unsigned int cnt = 0;
3781 struct dp_netdev_pmd_thread *pmd;
3782 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3783
3784 bool enable_alb = false;
3785 bool multi_rxq = false;
3786 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3787
3788 /* Ensure that there is at least 2 non-isolated PMDs and
3789 * one of them is polling more than one rxq. */
3790 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3791 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3792 continue;
3793 }
3794
3795 if (hmap_count(&pmd->poll_list) > 1) {
3796 multi_rxq = true;
3797 }
3798 if (cnt && multi_rxq) {
3799 enable_alb = true;
3800 break;
3801 }
3802 cnt++;
3803 }
3804
3805 /* Enable auto LB if it is requested and cycle based assignment is true. */
3806 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
3807 pmd_alb->auto_lb_requested;
3808
3809 if (pmd_alb->is_enabled != enable_alb) {
3810 pmd_alb->is_enabled = enable_alb;
3811 if (pmd_alb->is_enabled) {
3812 VLOG_INFO("PMD auto load balance is enabled "
3813 "(with rebalance interval:%"PRIu64" msec)",
3814 pmd_alb->rebalance_intvl);
3815 } else {
3816 pmd_alb->rebalance_poll_timer = 0;
3817 VLOG_INFO("PMD auto load balance is disabled");
3818 }
3819 }
3820
3821}
3822
d4f6865c
DDP
3823/* Applies datapath configuration from the database. Some of the changes are
3824 * actually applied in dpif_netdev_run(). */
f2eee189 3825static int
d4f6865c 3826dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
f2eee189
AW
3827{
3828 struct dp_netdev *dp = get_dp_netdev(dpif);
d4f6865c 3829 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
e77c97b9
KT
3830 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3831 "cycles");
4c30b246
CL
3832 unsigned long long insert_prob =
3833 smap_get_ullong(other_config, "emc-insert-inv-prob",
3834 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3835 uint32_t insert_min, cur_min;
c71ea3c4 3836 uint32_t tx_flush_interval, cur_tx_flush_interval;
5bf84282 3837 uint64_t rebalance_intvl;
c71ea3c4
IM
3838
3839 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3840 DEFAULT_TX_FLUSH_INTERVAL);
3841 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3842 if (tx_flush_interval != cur_tx_flush_interval) {
3843 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3844 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3845 tx_flush_interval);
3846 }
f2eee189 3847
a6a426d6
IM
3848 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3849 free(dp->pmd_cmask);
3850 dp->pmd_cmask = nullable_xstrdup(cmask);
3851 dp_netdev_request_reconfigure(dp);
f2eee189
AW
3852 }
3853
4c30b246
CL
3854 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3855 if (insert_prob <= UINT32_MAX) {
3856 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3857 } else {
3858 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3859 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3860 }
3861
3862 if (insert_min != cur_min) {
3863 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3864 if (insert_min == 0) {
2fbadeb6 3865 VLOG_INFO("EMC insertion probability changed to zero");
4c30b246
CL
3866 } else {
3867 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3868 insert_prob, (100 / (float)insert_prob));
3869 }
3870 }
3871
79f36875
JS
3872 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3873 bool cur_perf_enabled;
3874 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3875 if (perf_enabled != cur_perf_enabled) {
3876 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3877 if (perf_enabled) {
3878 VLOG_INFO("PMD performance metrics collection enabled");
3879 } else {
3880 VLOG_INFO("PMD performance metrics collection disabled");
3881 }
3882 }
3883
60d8ccae
YW
3884 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3885 bool cur_smc;
3886 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3887 if (smc_enable != cur_smc) {
3888 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3889 if (smc_enable) {
3890 VLOG_INFO("SMC cache is enabled");
3891 } else {
3892 VLOG_INFO("SMC cache is disabled");
3893 }
3894 }
e77c97b9
KT
3895
3896 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
3897 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
3898 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
3899 "Defaulting to 'cycles'.");
3900 pmd_rxq_assign_cyc = true;
3901 pmd_rxq_assign = "cycles";
3902 }
3903 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
3904 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
3905 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
3906 pmd_rxq_assign);
3907 dp_netdev_request_reconfigure(dp);
3908 }
5bf84282
NK
3909
3910 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3911 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
3912 false);
3913
3914 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
3915 ALB_PMD_REBALANCE_POLL_INTERVAL);
3916
3917 /* Input is in min, convert it to msec. */
3918 rebalance_intvl =
3919 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
3920
3921 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
3922 pmd_alb->rebalance_intvl = rebalance_intvl;
3923 }
3924
3925 set_pmd_auto_lb(dp);
f2eee189
AW
3926 return 0;
3927}
3928
3eb67853
IM
3929/* Parses affinity list and returns result in 'core_ids'. */
3930static int
3931parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3932{
3933 unsigned i;
3934 char *list, *copy, *key, *value;
3935 int error = 0;
3936
3937 for (i = 0; i < n_rxq; i++) {
51c37a56 3938 core_ids[i] = OVS_CORE_UNSPEC;
3eb67853
IM
3939 }
3940
3941 if (!affinity_list) {
3942 return 0;
3943 }
3944
3945 list = copy = xstrdup(affinity_list);
3946
3947 while (ofputil_parse_key_value(&list, &key, &value)) {
3948 int rxq_id, core_id;
3949
3950 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3951 || !str_to_int(value, 0, &core_id) || core_id < 0) {
3952 error = EINVAL;
3953 break;
3954 }
3955
3956 if (rxq_id < n_rxq) {
3957 core_ids[rxq_id] = core_id;
3958 }
3959 }
3960
3961 free(copy);
3962 return error;
3963}
3964
3965/* Parses 'affinity_list' and applies configuration if it is valid. */
3966static int
3967dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3968 const char *affinity_list)
3969{
3970 unsigned *core_ids, i;
3971 int error = 0;
3972
3973 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
3974 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
3975 error = EINVAL;
3976 goto exit;
3977 }
3978
3979 for (i = 0; i < port->n_rxq; i++) {
3980 port->rxqs[i].core_id = core_ids[i];
3981 }
3982
3983exit:
3984 free(core_ids);
3985 return error;
3986}
3987
2fbadeb6
IM
3988/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
3989 * of given PMD thread. */
3990static bool
3991dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
3992 struct dp_netdev_port *port)
3993 OVS_EXCLUDED(pmd->port_mutex)
3994{
3995 struct rxq_poll *poll;
3996 bool found = false;
3997
3998 ovs_mutex_lock(&pmd->port_mutex);
3999 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4000 if (port == poll->rxq->port) {
4001 found = true;
4002 break;
4003 }
4004 }
4005 ovs_mutex_unlock(&pmd->port_mutex);
4006 return found;
4007}
4008
4009/* Updates port configuration from the database. The changes are actually
4010 * applied in dpif_netdev_run(). */
3eb67853
IM
4011static int
4012dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4013 const struct smap *cfg)
4014{
4015 struct dp_netdev *dp = get_dp_netdev(dpif);
4016 struct dp_netdev_port *port;
4017 int error = 0;
4018 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
2fbadeb6 4019 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
3eb67853
IM
4020
4021 ovs_mutex_lock(&dp->port_mutex);
4022 error = get_port_by_number(dp, port_no, &port);
2fbadeb6
IM
4023 if (error) {
4024 goto unlock;
4025 }
4026
4027 if (emc_enabled != port->emc_enabled) {
4028 struct dp_netdev_pmd_thread *pmd;
4029 struct ds ds = DS_EMPTY_INITIALIZER;
4030 uint32_t cur_min, insert_prob;
4031
4032 port->emc_enabled = emc_enabled;
4033 /* Mark for reload all the threads that polls this port and request
4034 * for reconfiguration for the actual reloading of threads. */
4035 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4036 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4037 pmd->need_reload = true;
4038 }
4039 }
4040 dp_netdev_request_reconfigure(dp);
4041
4042 ds_put_format(&ds, "%s: EMC has been %s.",
4043 netdev_get_name(port->netdev),
4044 (emc_enabled) ? "enabled" : "disabled");
4045 if (emc_enabled) {
4046 ds_put_cstr(&ds, " Current insertion probability is ");
4047 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4048 if (!cur_min) {
4049 ds_put_cstr(&ds, "zero.");
4050 } else {
4051 insert_prob = UINT32_MAX / cur_min;
4052 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4053 insert_prob, 100 / (float) insert_prob);
4054 }
4055 }
4056 VLOG_INFO("%s", ds_cstr(&ds));
4057 ds_destroy(&ds);
4058 }
4059
4060 /* Checking for RXq affinity changes. */
4061 if (!netdev_is_pmd(port->netdev)
3eb67853
IM
4062 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4063 goto unlock;
4064 }
4065
4066 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4067 if (error) {
4068 goto unlock;
4069 }
4070 free(port->rxq_affinity_list);
4071 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4072
4073 dp_netdev_request_reconfigure(dp);
4074unlock:
4075 ovs_mutex_unlock(&dp->port_mutex);
4076 return error;
4077}
4078
5bf93d67
EJ
4079static int
4080dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4081 uint32_t queue_id, uint32_t *priority)
4082{
4083 *priority = queue_id;
4084 return 0;
4085}
4086
72865317 4087\f
9ff55ae2 4088/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
1401f6de 4089 * a copy of the 'size' bytes of 'actions' input parameters. */
a84cb64a
BP
4090struct dp_netdev_actions *
4091dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4092{
4093 struct dp_netdev_actions *netdev_actions;
4094
9ff55ae2
DDP
4095 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4096 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
4097 netdev_actions->size = size;
4098
4099 return netdev_actions;
4100}
4101
a84cb64a 4102struct dp_netdev_actions *
61e7deb1 4103dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 4104{
61e7deb1 4105 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
4106}
4107
61e7deb1
BP
4108static void
4109dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 4110{
61e7deb1 4111 free(actions);
a84cb64a
BP
4112}
4113\f
a19896ab
JS
4114static void
4115dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4116 enum rxq_cycles_counter_type type,
4117 unsigned long long cycles)
a2ac666d 4118{
a19896ab 4119 atomic_store_relaxed(&rx->cycles[type], cycles);
a2ac666d
CL
4120}
4121
4809891b 4122static void
a19896ab 4123dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4809891b
KT
4124 enum rxq_cycles_counter_type type,
4125 unsigned long long cycles)
4126{
a19896ab 4127 non_atomic_ullong_add(&rx->cycles[type], cycles);
4809891b
KT
4128}
4129
4130static uint64_t
4131dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4132 enum rxq_cycles_counter_type type)
4133{
4134 unsigned long long processing_cycles;
4135 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4136 return processing_cycles;
4137}
4138
4139static void
4140dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4141 unsigned long long cycles)
4142{
4ee87ad3
BP
4143 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4144 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4809891b
KT
4145}
4146
655856ef
KT
4147static uint64_t
4148dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4149{
4150 unsigned long long processing_cycles;
4151 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4152 return processing_cycles;
4153}
4154
79f36875
JS
4155#if ATOMIC_ALWAYS_LOCK_FREE_8B
4156static inline bool
4157pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4158{
4159 bool pmd_perf_enabled;
4160 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4161 return pmd_perf_enabled;
4162}
4163#else
4164/* If stores and reads of 64-bit integers are not atomic, the full PMD
4165 * performance metrics are not available as locked access to 64 bit
4166 * integers would be prohibitively expensive. */
4167static inline bool
4168pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4169{
4170 return false;
4171}
4172#endif
4173
c71ea3c4 4174static int
009e0033
IM
4175dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4176 struct tx_port *p)
4177{
58ed6df0 4178 int i;
009e0033 4179 int tx_qid;
cc4891f3 4180 int output_cnt;
009e0033 4181 bool dynamic_txqs;
58ed6df0
IM
4182 struct cycle_timer timer;
4183 uint64_t cycles;
c71ea3c4 4184 uint32_t tx_flush_interval;
58ed6df0
IM
4185
4186 cycle_timer_start(&pmd->perf_stats, &timer);
009e0033
IM
4187
4188 dynamic_txqs = p->port->dynamic_txqs;
4189 if (dynamic_txqs) {
4190 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4191 } else {
4192 tx_qid = pmd->static_tx_qid;
4193 }
4194
cc4891f3 4195 output_cnt = dp_packet_batch_size(&p->output_pkts);
58ed6df0 4196 ovs_assert(output_cnt > 0);
cc4891f3 4197
b30896c9 4198 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
009e0033 4199 dp_packet_batch_init(&p->output_pkts);
cc4891f3 4200
c71ea3c4
IM
4201 /* Update time of the next flush. */
4202 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4203 p->flush_time = pmd->ctx.now + tx_flush_interval;
4204
4205 ovs_assert(pmd->n_output_batches > 0);
4206 pmd->n_output_batches--;
4207
82a48ead
JS
4208 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4209 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
58ed6df0
IM
4210
4211 /* Distribute send cycles evenly among transmitted packets and assign to
4212 * their respective rx queues. */
4213 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4214 for (i = 0; i < output_cnt; i++) {
4215 if (p->output_pkts_rxqs[i]) {
4216 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4217 RXQ_CYCLES_PROC_CURR, cycles);
4218 }
4219 }
c71ea3c4
IM
4220
4221 return output_cnt;
009e0033
IM
4222}
4223
c71ea3c4
IM
4224static int
4225dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4226 bool force)
009e0033
IM
4227{
4228 struct tx_port *p;
c71ea3c4
IM
4229 int output_cnt = 0;
4230
4231 if (!pmd->n_output_batches) {
4232 return 0;
4233 }
009e0033
IM
4234
4235 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
c71ea3c4
IM
4236 if (!dp_packet_batch_is_empty(&p->output_pkts)
4237 && (force || pmd->ctx.now >= p->flush_time)) {
4238 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
009e0033
IM
4239 }
4240 }
c71ea3c4 4241 return output_cnt;
009e0033
IM
4242}
4243
a2ac666d 4244static int
65f13b50 4245dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
a19896ab 4246 struct dp_netdev_rxq *rxq,
947dc567 4247 odp_port_t port_no)
e4cfed38 4248{
79f36875 4249 struct pmd_perf_stats *s = &pmd->perf_stats;
1895cc8d 4250 struct dp_packet_batch batch;
a19896ab 4251 struct cycle_timer timer;
1895cc8d 4252 int error;
79f36875
JS
4253 int batch_cnt = 0;
4254 int rem_qlen = 0, *qlen_p = NULL;
58ed6df0 4255 uint64_t cycles;
e4cfed38 4256
a19896ab
JS
4257 /* Measure duration for polling and processing rx burst. */
4258 cycle_timer_start(&pmd->perf_stats, &timer);
58ed6df0
IM
4259
4260 pmd->ctx.last_rxq = rxq;
1895cc8d 4261 dp_packet_batch_init(&batch);
58ed6df0 4262
79f36875
JS
4263 /* Fetch the rx queue length only for vhostuser ports. */
4264 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4265 qlen_p = &rem_qlen;
4266 }
4267
4268 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
e4cfed38 4269 if (!error) {
a19896ab 4270 /* At least one packet received. */
3c33f0ff 4271 *recirc_depth_get() = 0;
009e0033 4272 pmd_thread_ctx_time_update(pmd);
940ac2ce 4273 batch_cnt = dp_packet_batch_size(&batch);
79f36875
JS
4274 if (pmd_perf_metrics_enabled(pmd)) {
4275 /* Update batch histogram. */
4276 s->current.batches++;
4277 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4278 /* Update the maximum vhost rx queue fill level. */
4279 if (rxq->is_vhost && rem_qlen >= 0) {
4280 uint32_t qfill = batch_cnt + rem_qlen;
4281 if (qfill > s->current.max_vhost_qfill) {
4282 s->current.max_vhost_qfill = qfill;
4283 }
4284 }
4285 }
4286 /* Process packet batch. */
947dc567 4287 dp_netdev_input(pmd, &batch, port_no);
e4cfed38 4288
a19896ab 4289 /* Assign processing cycles to rx queue. */
58ed6df0 4290 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
a19896ab
JS
4291 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4292
79f36875 4293 dp_netdev_pmd_flush_output_packets(pmd, false);
a19896ab
JS
4294 } else {
4295 /* Discard cycles. */
4296 cycle_timer_stop(&pmd->perf_stats, &timer);
4297 if (error != EAGAIN && error != EOPNOTSUPP) {
4298 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4299
4300 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4301 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4302 }
e4cfed38 4303 }
a2ac666d 4304
58ed6df0
IM
4305 pmd->ctx.last_rxq = NULL;
4306
79f36875 4307 return batch_cnt;
e4cfed38
PS
4308}
4309
e32971b8
DDP
4310static struct tx_port *
4311tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4312{
4313 struct tx_port *tx;
4314
4315 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4316 if (tx->port->port_no == port_no) {
4317 return tx;
4318 }
4319 }
4320
4321 return NULL;
4322}
4323
dc36593c
DDP
4324static int
4325port_reconfigure(struct dp_netdev_port *port)
4326{
4327 struct netdev *netdev = port->netdev;
dc36593c
DDP
4328 int i, err;
4329
dc36593c
DDP
4330 /* Closes the existing 'rxq's. */
4331 for (i = 0; i < port->n_rxq; i++) {
947dc567
DDP
4332 netdev_rxq_close(port->rxqs[i].rx);
4333 port->rxqs[i].rx = NULL;
dc36593c 4334 }
4809891b 4335 unsigned last_nrxq = port->n_rxq;
dc36593c
DDP
4336 port->n_rxq = 0;
4337
050c60bf 4338 /* Allows 'netdev' to apply the pending configuration changes. */
606f6650 4339 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
e32971b8
DDP
4340 err = netdev_reconfigure(netdev);
4341 if (err && (err != EOPNOTSUPP)) {
4342 VLOG_ERR("Failed to set interface %s new configuration",
4343 netdev_get_name(netdev));
4344 return err;
4345 }
dc36593c 4346 }
050c60bf 4347 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3eb67853
IM
4348 port->rxqs = xrealloc(port->rxqs,
4349 sizeof *port->rxqs * netdev_n_rxq(netdev));
324c8374
IM
4350 /* Realloc 'used' counters for tx queues. */
4351 free(port->txq_used);
4352 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4353
dc36593c 4354 for (i = 0; i < netdev_n_rxq(netdev); i++) {
38259bd7
BP
4355 bool new_queue = i >= last_nrxq;
4356 if (new_queue) {
4357 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4358 }
4359
947dc567 4360 port->rxqs[i].port = port;
79f36875 4361 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
38259bd7 4362
947dc567 4363 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
dc36593c
DDP
4364 if (err) {
4365 return err;
4366 }
4367 port->n_rxq++;
4368 }
4369
3eb67853
IM
4370 /* Parse affinity list to apply configuration for new queues. */
4371 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4372
606f6650
EC
4373 /* If reconfiguration was successful mark it as such, so we can use it */
4374 port->need_reconfigure = false;
4375
dc36593c
DDP
4376 return 0;
4377}
4378
e32971b8
DDP
4379struct rr_numa_list {
4380 struct hmap numas; /* Contains 'struct rr_numa' */
4381};
4382
4383struct rr_numa {
4384 struct hmap_node node;
4385
4386 int numa_id;
4387
4388 /* Non isolated pmds on numa node 'numa_id' */
4389 struct dp_netdev_pmd_thread **pmds;
4390 int n_pmds;
4391
4392 int cur_index;
79da1e41 4393 bool idx_inc;
e32971b8
DDP
4394};
4395
4396static struct rr_numa *
4397rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4398{
4399 struct rr_numa *numa;
4400
4401 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4402 if (numa->numa_id == numa_id) {
4403 return numa;
4404 }
4405 }
4406
4407 return NULL;
4408}
4409
c37813fd
BM
4410/* Returns the next node in numa list following 'numa' in round-robin fashion.
4411 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4412 * Returns NULL if 'rr' numa list is empty. */
4413static struct rr_numa *
4414rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4415{
4416 struct hmap_node *node = NULL;
4417
4418 if (numa) {
4419 node = hmap_next(&rr->numas, &numa->node);
4420 }
4421 if (!node) {
4422 node = hmap_first(&rr->numas);
4423 }
4424
4425 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4426}
4427
e32971b8
DDP
4428static void
4429rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4430{
4431 struct dp_netdev_pmd_thread *pmd;
4432 struct rr_numa *numa;
4433
4434 hmap_init(&rr->numas);
4435
4436 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4437 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4438 continue;
4439 }
4440
4441 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4442 if (!numa) {
4443 numa = xzalloc(sizeof *numa);
4444 numa->numa_id = pmd->numa_id;
4445 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4446 }
4447 numa->n_pmds++;
4448 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4449 numa->pmds[numa->n_pmds - 1] = pmd;
79da1e41
KT
4450 /* At least one pmd so initialise curr_idx and idx_inc. */
4451 numa->cur_index = 0;
4452 numa->idx_inc = true;
e32971b8
DDP
4453 }
4454}
4455
e77c97b9
KT
4456/*
4457 * Returns the next pmd from the numa node.
4458 *
4459 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4460 * either an up or down walk, switching between up/down when the first or last
4461 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4462 *
4463 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4464 * core reached. e.g. 1,2,3,1,2,3,1,2...
4465 */
e32971b8 4466static struct dp_netdev_pmd_thread *
e77c97b9 4467rr_numa_get_pmd(struct rr_numa *numa, bool updown)
e32971b8 4468{
79da1e41
KT
4469 int numa_idx = numa->cur_index;
4470
4471 if (numa->idx_inc == true) {
4472 /* Incrementing through list of pmds. */
4473 if (numa->cur_index == numa->n_pmds-1) {
4474 /* Reached the last pmd. */
e77c97b9
KT
4475 if (updown) {
4476 numa->idx_inc = false;
4477 } else {
4478 numa->cur_index = 0;
4479 }
79da1e41
KT
4480 } else {
4481 numa->cur_index++;
4482 }
4483 } else {
4484 /* Decrementing through list of pmds. */
4485 if (numa->cur_index == 0) {
4486 /* Reached the first pmd. */
4487 numa->idx_inc = true;
4488 } else {
4489 numa->cur_index--;
4490 }
4491 }
4492 return numa->pmds[numa_idx];
e32971b8
DDP
4493}
4494
4495static void
4496rr_numa_list_destroy(struct rr_numa_list *rr)
4497{
4498 struct rr_numa *numa;
4499
4500 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4501 free(numa->pmds);
4502 free(numa);
4503 }
4504 hmap_destroy(&rr->numas);
4505}
4506
655856ef
KT
4507/* Sort Rx Queues by the processing cycles they are consuming. */
4508static int
cc131ac1 4509compare_rxq_cycles(const void *a, const void *b)
655856ef 4510{
28080276
KT
4511 struct dp_netdev_rxq *qa;
4512 struct dp_netdev_rxq *qb;
8368866e 4513 uint64_t cycles_qa, cycles_qb;
655856ef
KT
4514
4515 qa = *(struct dp_netdev_rxq **) a;
4516 qb = *(struct dp_netdev_rxq **) b;
4517
8368866e
KT
4518 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4519 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
655856ef 4520
8368866e
KT
4521 if (cycles_qa != cycles_qb) {
4522 return (cycles_qa < cycles_qb) ? 1 : -1;
a130f1a8
KT
4523 } else {
4524 /* Cycles are the same so tiebreak on port/queue id.
4525 * Tiebreaking (as opposed to return 0) ensures consistent
4526 * sort results across multiple OS's. */
f0aa3801
BP
4527 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4528 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4529 if (port_qa != port_qb) {
4530 return port_qa > port_qb ? 1 : -1;
a130f1a8
KT
4531 } else {
4532 return netdev_rxq_get_queue_id(qa->rx)
4533 - netdev_rxq_get_queue_id(qb->rx);
4534 }
655856ef 4535 }
655856ef
KT
4536}
4537
e32971b8
DDP
4538/* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4539 * queues and marks the pmds as isolated. Otherwise, assign non isolated
4540 * pmds to unpinned queues.
4541 *
4542 * The function doesn't touch the pmd threads, it just stores the assignment
4543 * in the 'pmd' member of each rxq. */
4544static void
4545rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4546{
4547 struct dp_netdev_port *port;
4548 struct rr_numa_list rr;
c37813fd 4549 struct rr_numa *non_local_numa = NULL;
655856ef 4550 struct dp_netdev_rxq ** rxqs = NULL;
97bf8f47 4551 int n_rxqs = 0;
655856ef
KT
4552 struct rr_numa *numa = NULL;
4553 int numa_id;
e77c97b9 4554 bool assign_cyc = dp->pmd_rxq_assign_cyc;
e32971b8
DDP
4555
4556 HMAP_FOR_EACH (port, node, &dp->ports) {
e32971b8
DDP
4557 if (!netdev_is_pmd(port->netdev)) {
4558 continue;
4559 }
4560
e32971b8
DDP
4561 for (int qid = 0; qid < port->n_rxq; qid++) {
4562 struct dp_netdev_rxq *q = &port->rxqs[qid];
4563
4564 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4565 struct dp_netdev_pmd_thread *pmd;
4566
4567 pmd = dp_netdev_get_pmd(dp, q->core_id);
4568 if (!pmd) {
4569 VLOG_WARN("There is no PMD thread on core %d. Queue "
4570 "%d on port \'%s\' will not be polled.",
4571 q->core_id, qid, netdev_get_name(port->netdev));
4572 } else {
4573 q->pmd = pmd;
4574 pmd->isolated = true;
433a3fa5
GM
4575 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4576 "rx queue %d.", pmd->core_id, pmd->numa_id,
4577 netdev_rxq_get_name(q->rx),
4578 netdev_rxq_get_queue_id(q->rx));
e32971b8
DDP
4579 dp_netdev_pmd_unref(pmd);
4580 }
4581 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
8368866e
KT
4582 uint64_t cycle_hist = 0;
4583
655856ef
KT
4584 if (n_rxqs == 0) {
4585 rxqs = xmalloc(sizeof *rxqs);
e32971b8 4586 } else {
655856ef 4587 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
e32971b8 4588 }
8368866e 4589
e77c97b9
KT
4590 if (assign_cyc) {
4591 /* Sum the queue intervals and store the cycle history. */
4592 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4593 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4594 }
4595 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4596 cycle_hist);
4597 }
655856ef
KT
4598 /* Store the queue. */
4599 rxqs[n_rxqs++] = q;
e32971b8
DDP
4600 }
4601 }
4602 }
4603
e77c97b9 4604 if (n_rxqs > 1 && assign_cyc) {
655856ef
KT
4605 /* Sort the queues in order of the processing cycles
4606 * they consumed during their last pmd interval. */
cc131ac1 4607 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
655856ef
KT
4608 }
4609
4610 rr_numa_list_populate(dp, &rr);
4611 /* Assign the sorted queues to pmds in round robin. */
97bf8f47 4612 for (int i = 0; i < n_rxqs; i++) {
655856ef
KT
4613 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4614 numa = rr_numa_list_lookup(&rr, numa_id);
4615 if (!numa) {
4616 /* There are no pmds on the queue's local NUMA node.
4617 Round robin on the NUMA nodes that do have pmds. */
4618 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4619 if (!non_local_numa) {
4620 VLOG_ERR("There is no available (non-isolated) pmd "
4621 "thread for port \'%s\' queue %d. This queue "
4622 "will not be polled. Is pmd-cpu-mask set to "
4623 "zero? Or are all PMDs isolated to other "
4624 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4625 netdev_rxq_get_queue_id(rxqs[i]->rx));
4626 continue;
4627 }
e77c97b9 4628 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
655856ef
KT
4629 VLOG_WARN("There's no available (non-isolated) pmd thread "
4630 "on numa node %d. Queue %d on port \'%s\' will "
4631 "be assigned to the pmd on core %d "
4632 "(numa node %d). Expect reduced performance.",
4633 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4634 netdev_rxq_get_name(rxqs[i]->rx),
4635 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4636 } else {
e77c97b9
KT
4637 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4638 if (assign_cyc) {
4639 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4640 "rx queue %d "
4641 "(measured processing cycles %"PRIu64").",
4642 rxqs[i]->pmd->core_id, numa_id,
4643 netdev_rxq_get_name(rxqs[i]->rx),
4644 netdev_rxq_get_queue_id(rxqs[i]->rx),
4645 dp_netdev_rxq_get_cycles(rxqs[i],
4646 RXQ_CYCLES_PROC_HIST));
4647 } else {
4648 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4649 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4650 netdev_rxq_get_name(rxqs[i]->rx),
4651 netdev_rxq_get_queue_id(rxqs[i]->rx));
4652 }
655856ef
KT
4653 }
4654 }
4655
e32971b8 4656 rr_numa_list_destroy(&rr);
655856ef 4657 free(rxqs);
e32971b8
DDP
4658}
4659
140dd699
IM
4660static void
4661reload_affected_pmds(struct dp_netdev *dp)
4662{
4663 struct dp_netdev_pmd_thread *pmd;
4664
4665 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4666 if (pmd->need_reload) {
241bad15 4667 flow_mark_flush(pmd);
140dd699 4668 dp_netdev_reload_pmd__(pmd);
8f077b31
DM
4669 }
4670 }
4671
4672 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4673 if (pmd->need_reload) {
4674 if (pmd->core_id != NON_PMD_CORE_ID) {
4675 bool reload;
4676
4677 do {
4678 atomic_read_explicit(&pmd->reload, &reload,
4679 memory_order_acquire);
4680 } while (reload);
4681 }
140dd699
IM
4682 pmd->need_reload = false;
4683 }
4684 }
4685}
4686
6e3c6fa4
DDP
4687static void
4688reconfigure_pmd_threads(struct dp_netdev *dp)
4689 OVS_REQUIRES(dp->port_mutex)
4690{
e32971b8
DDP
4691 struct dp_netdev_pmd_thread *pmd;
4692 struct ovs_numa_dump *pmd_cores;
140dd699
IM
4693 struct ovs_numa_info_core *core;
4694 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4695 struct hmapx_node *node;
e32971b8 4696 bool changed = false;
140dd699 4697 bool need_to_adjust_static_tx_qids = false;
e32971b8
DDP
4698
4699 /* The pmd threads should be started only if there's a pmd port in the
4700 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4701 * NR_PMD_THREADS per numa node. */
4702 if (!has_pmd_port(dp)) {
4703 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4704 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4705 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4706 } else {
4707 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4708 }
4709
140dd699
IM
4710 /* We need to adjust 'static_tx_qid's only if we're reducing number of
4711 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4712 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4713 /* Adjustment is required to keep 'static_tx_qid's sequential and
4714 * avoid possible issues, for example, imbalanced tx queue usage
4715 * and unnecessary locking caused by remapping on netdev level. */
4716 need_to_adjust_static_tx_qids = true;
4717 }
4718
4719 /* Check for unwanted pmd threads */
4720 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4721 if (pmd->core_id == NON_PMD_CORE_ID) {
4722 continue;
4723 }
4724 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4725 pmd->core_id)) {
4726 hmapx_add(&to_delete, pmd);
4727 } else if (need_to_adjust_static_tx_qids) {
e2cafa86 4728 atomic_store_relaxed(&pmd->reload_tx_qid, true);
140dd699 4729 pmd->need_reload = true;
e32971b8
DDP
4730 }
4731 }
4732
140dd699
IM
4733 HMAPX_FOR_EACH (node, &to_delete) {
4734 pmd = (struct dp_netdev_pmd_thread *) node->data;
4735 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4736 pmd->numa_id, pmd->core_id);
4737 dp_netdev_del_pmd(dp, pmd);
4738 }
4739 changed = !hmapx_is_empty(&to_delete);
4740 hmapx_destroy(&to_delete);
e32971b8 4741
140dd699
IM
4742 if (need_to_adjust_static_tx_qids) {
4743 /* 'static_tx_qid's are not sequential now.
4744 * Reload remaining threads to fix this. */
4745 reload_affected_pmds(dp);
4746 }
e32971b8 4747
140dd699
IM
4748 /* Check for required new pmd threads */
4749 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4750 pmd = dp_netdev_get_pmd(dp, core->core_id);
4751 if (!pmd) {
8afbf2fa
IM
4752 struct ds name = DS_EMPTY_INITIALIZER;
4753
140dd699 4754 pmd = xzalloc(sizeof *pmd);
e32971b8 4755 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
8afbf2fa
IM
4756
4757 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
4758 pmd->thread = ovs_thread_create(ds_cstr(&name),
4759 pmd_thread_main, pmd);
4760 ds_destroy(&name);
4761
140dd699
IM
4762 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4763 pmd->numa_id, pmd->core_id);
4764 changed = true;
4765 } else {
4766 dp_netdev_pmd_unref(pmd);
e32971b8 4767 }
140dd699
IM
4768 }
4769
4770 if (changed) {
4771 struct ovs_numa_info_numa *numa;
e32971b8
DDP
4772
4773 /* Log the number of pmd threads per numa node. */
4774 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
140dd699 4775 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
e32971b8
DDP
4776 numa->n_cores, numa->numa_id);
4777 }
4778 }
4779
4780 ovs_numa_dump_destroy(pmd_cores);
4781}
4782
e32971b8
DDP
4783static void
4784pmd_remove_stale_ports(struct dp_netdev *dp,
4785 struct dp_netdev_pmd_thread *pmd)
4786 OVS_EXCLUDED(pmd->port_mutex)
4787 OVS_REQUIRES(dp->port_mutex)
4788{
4789 struct rxq_poll *poll, *poll_next;
4790 struct tx_port *tx, *tx_next;
4791
4792 ovs_mutex_lock(&pmd->port_mutex);
4793 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4794 struct dp_netdev_port *port = poll->rxq->port;
4795
4796 if (port->need_reconfigure
4797 || !hmap_contains(&dp->ports, &port->node)) {
4798 dp_netdev_del_rxq_from_pmd(pmd, poll);
4799 }
4800 }
4801 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4802 struct dp_netdev_port *port = tx->port;
4803
4804 if (port->need_reconfigure
4805 || !hmap_contains(&dp->ports, &port->node)) {
4806 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4807 }
4808 }
4809 ovs_mutex_unlock(&pmd->port_mutex);
4810}
4811
4812/* Must be called each time a port is added/removed or the cmask changes.
4813 * This creates and destroys pmd threads, reconfigures ports, opens their
4814 * rxqs and assigns all rxqs/txqs to pmd threads. */
4815static void
4816reconfigure_datapath(struct dp_netdev *dp)
4817 OVS_REQUIRES(dp->port_mutex)
4818{
6d9fead1 4819 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
e32971b8
DDP
4820 struct dp_netdev_pmd_thread *pmd;
4821 struct dp_netdev_port *port;
4822 int wanted_txqs;
6e3c6fa4 4823
a6a426d6
IM
4824 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4825
e32971b8
DDP
4826 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4827 * on the system and the user configuration. */
4828 reconfigure_pmd_threads(dp);
6e3c6fa4 4829
e32971b8 4830 wanted_txqs = cmap_count(&dp->poll_threads);
324c8374 4831
e32971b8
DDP
4832 /* The number of pmd threads might have changed, or a port can be new:
4833 * adjust the txqs. */
4834 HMAP_FOR_EACH (port, node, &dp->ports) {
4835 netdev_set_tx_multiq(port->netdev, wanted_txqs);
324c8374
IM
4836 }
4837
e32971b8
DDP
4838 /* Step 2: Remove from the pmd threads ports that have been removed or
4839 * need reconfiguration. */
4840
4841 /* Check for all the ports that need reconfiguration. We cache this in
85a4f238
IM
4842 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4843 * change at any time. */
e32971b8
DDP
4844 HMAP_FOR_EACH (port, node, &dp->ports) {
4845 if (netdev_is_reconf_required(port->netdev)) {
4846 port->need_reconfigure = true;
4847 }
4848 }
4849
4850 /* Remove from the pmd threads all the ports that have been deleted or
4851 * need reconfiguration. */
4852 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4853 pmd_remove_stale_ports(dp, pmd);
4854 }
4855
4856 /* Reload affected pmd threads. We must wait for the pmd threads before
4857 * reconfiguring the ports, because a port cannot be reconfigured while
4858 * it's being used. */
4859 reload_affected_pmds(dp);
4860
4861 /* Step 3: Reconfigure ports. */
4862
4863 /* We only reconfigure the ports that we determined above, because they're
4864 * not being used by any pmd thread at the moment. If a port fails to
4865 * reconfigure we remove it from the datapath. */
f582b6df
BP
4866 struct dp_netdev_port *next_port;
4867 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
dc36593c 4868 int err;
6e3c6fa4 4869
e32971b8
DDP
4870 if (!port->need_reconfigure) {
4871 continue;
4872 }
4873
dc36593c
DDP
4874 err = port_reconfigure(port);
4875 if (err) {
4876 hmap_remove(&dp->ports, &port->node);
4877 seq_change(dp->port_seq);
4878 port_destroy(port);
324c8374 4879 } else {
e32971b8 4880 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
6e3c6fa4
DDP
4881 }
4882 }
e32971b8
DDP
4883
4884 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4885 * for now, we just update the 'pmd' pointer in each rxq to point to the
4886 * wanted thread according to the scheduling policy. */
4887
4888 /* Reset all the pmd threads to non isolated. */
4889 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4890 pmd->isolated = false;
4891 }
4892
4893 /* Reset all the queues to unassigned */
4894 HMAP_FOR_EACH (port, node, &dp->ports) {
4895 for (int i = 0; i < port->n_rxq; i++) {
4896 port->rxqs[i].pmd = NULL;
4897 }
4898 }
4899
4900 /* Add pinned queues and mark pmd threads isolated. */
4901 rxq_scheduling(dp, true);
4902
4903 /* Add non-pinned queues. */
4904 rxq_scheduling(dp, false);
4905
4906 /* Step 5: Remove queues not compliant with new scheduling. */
6d9fead1
DM
4907
4908 /* Count all the threads that will have at least one queue to poll. */
4909 HMAP_FOR_EACH (port, node, &dp->ports) {
4910 for (int qid = 0; qid < port->n_rxq; qid++) {
4911 struct dp_netdev_rxq *q = &port->rxqs[qid];
4912
4913 if (q->pmd) {
4914 hmapx_add(&busy_threads, q->pmd);
4915 }
4916 }
4917 }
4918
e32971b8
DDP
4919 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4920 struct rxq_poll *poll, *poll_next;
4921
4922 ovs_mutex_lock(&pmd->port_mutex);
4923 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4924 if (poll->rxq->pmd != pmd) {
4925 dp_netdev_del_rxq_from_pmd(pmd, poll);
6d9fead1
DM
4926
4927 /* This pmd might sleep after this step if it has no rxq
4928 * remaining. Tell it to busy wait for new assignment if it
4929 * has at least one scheduled queue. */
4930 if (hmap_count(&pmd->poll_list) == 0 &&
4931 hmapx_contains(&busy_threads, pmd)) {
4932 atomic_store_relaxed(&pmd->wait_for_reload, true);
4933 }
e32971b8
DDP
4934 }
4935 }
4936 ovs_mutex_unlock(&pmd->port_mutex);
4937 }
4938
6d9fead1
DM
4939 hmapx_destroy(&busy_threads);
4940
e32971b8
DDP
4941 /* Reload affected pmd threads. We must wait for the pmd threads to remove
4942 * the old queues before readding them, otherwise a queue can be polled by
4943 * two threads at the same time. */
4944 reload_affected_pmds(dp);
4945
4946 /* Step 6: Add queues from scheduling, if they're not there already. */
4947 HMAP_FOR_EACH (port, node, &dp->ports) {
4948 if (!netdev_is_pmd(port->netdev)) {
4949 continue;
4950 }
4951
4952 for (int qid = 0; qid < port->n_rxq; qid++) {
4953 struct dp_netdev_rxq *q = &port->rxqs[qid];
4954
4955 if (q->pmd) {
4956 ovs_mutex_lock(&q->pmd->port_mutex);
4957 dp_netdev_add_rxq_to_pmd(q->pmd, q);
4958 ovs_mutex_unlock(&q->pmd->port_mutex);
4959 }
4960 }
4961 }
4962
4963 /* Add every port to the tx cache of every pmd thread, if it's not
4964 * there already and if this pmd has at least one rxq to poll. */
4965 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4966 ovs_mutex_lock(&pmd->port_mutex);
4967 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
4968 HMAP_FOR_EACH (port, node, &dp->ports) {
4969 dp_netdev_add_port_tx_to_pmd(pmd, port);
4970 }
4971 }
4972 ovs_mutex_unlock(&pmd->port_mutex);
4973 }
4974
4975 /* Reload affected pmd threads. */
4976 reload_affected_pmds(dp);
5bf84282
NK
4977
4978 /* Check if PMD Auto LB is to be enabled */
4979 set_pmd_auto_lb(dp);
6e3c6fa4
DDP
4980}
4981
050c60bf
DDP
4982/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
4983static bool
4984ports_require_restart(const struct dp_netdev *dp)
4985 OVS_REQUIRES(dp->port_mutex)
4986{
4987 struct dp_netdev_port *port;
4988
4989 HMAP_FOR_EACH (port, node, &dp->ports) {
4990 if (netdev_is_reconf_required(port->netdev)) {
4991 return true;
4992 }
4993 }
4994
4995 return false;
4996}
4997
5bf84282
NK
4998/* Calculates variance in the values stored in array 'a'. 'n' is the number
4999 * of elements in array to be considered for calculating vairance.
5000 * Usage example: data array 'a' contains the processing load of each pmd and
5001 * 'n' is the number of PMDs. It returns the variance in processing load of
5002 * PMDs*/
5003static uint64_t
5004variance(uint64_t a[], int n)
5005{
5006 /* Compute mean (average of elements). */
5007 uint64_t sum = 0;
5008 uint64_t mean = 0;
5009 uint64_t sqDiff = 0;
5010
5011 if (!n) {
5012 return 0;
5013 }
5014
5015 for (int i = 0; i < n; i++) {
5016 sum += a[i];
5017 }
5018
5019 if (sum) {
5020 mean = sum / n;
5021
5022 /* Compute sum squared differences with mean. */
5023 for (int i = 0; i < n; i++) {
5024 sqDiff += (a[i] - mean)*(a[i] - mean);
5025 }
5026 }
5027 return (sqDiff ? (sqDiff / n) : 0);
5028}
5029
5030
5031/* Returns the variance in the PMDs usage as part of dry run of rxqs
5032 * assignment to PMDs. */
5033static bool
5034get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5035 uint32_t num_pmds, uint64_t *predicted_variance)
5036 OVS_REQUIRES(dp->port_mutex)
5037{
5038 struct dp_netdev_port *port;
5039 struct dp_netdev_pmd_thread *pmd;
5040 struct dp_netdev_rxq **rxqs = NULL;
5041 struct rr_numa *numa = NULL;
5042 struct rr_numa_list rr;
5043 int n_rxqs = 0;
5044 bool ret = false;
5045 uint64_t *pmd_usage;
5046
5047 if (!predicted_variance) {
5048 return ret;
5049 }
5050
5051 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5052
5053 HMAP_FOR_EACH (port, node, &dp->ports) {
5054 if (!netdev_is_pmd(port->netdev)) {
5055 continue;
5056 }
5057
5058 for (int qid = 0; qid < port->n_rxq; qid++) {
5059 struct dp_netdev_rxq *q = &port->rxqs[qid];
5060 uint64_t cycle_hist = 0;
5061
5062 if (q->pmd->isolated) {
5063 continue;
5064 }
5065
5066 if (n_rxqs == 0) {
5067 rxqs = xmalloc(sizeof *rxqs);
5068 } else {
5069 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5070 }
5071
5072 /* Sum the queue intervals and store the cycle history. */
5073 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5074 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5075 }
5076 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5077 cycle_hist);
5078 /* Store the queue. */
5079 rxqs[n_rxqs++] = q;
5080 }
5081 }
5082 if (n_rxqs > 1) {
5083 /* Sort the queues in order of the processing cycles
5084 * they consumed during their last pmd interval. */
5085 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5086 }
5087 rr_numa_list_populate(dp, &rr);
5088
5089 for (int i = 0; i < n_rxqs; i++) {
5090 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5091 numa = rr_numa_list_lookup(&rr, numa_id);
5092 if (!numa) {
5093 /* Abort if cross NUMA polling. */
5094 VLOG_DBG("PMD auto lb dry run."
5095 " Aborting due to cross-numa polling.");
5096 goto cleanup;
5097 }
5098
5099 pmd = rr_numa_get_pmd(numa, true);
5100 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5101 "to be assigned port \'%s\' rx queue %d "
5102 "(measured processing cycles %"PRIu64").",
5103 pmd->core_id, numa_id,
5104 netdev_rxq_get_name(rxqs[i]->rx),
5105 netdev_rxq_get_queue_id(rxqs[i]->rx),
5106 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5107
5108 for (int id = 0; id < num_pmds; id++) {
5109 if (pmd->core_id == core_list[id]) {
5110 /* Add the processing cycles of rxq to pmd polling it. */
5111 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5112 RXQ_CYCLES_PROC_HIST);
5113 }
5114 }
5115 }
5116
5117 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5118 uint64_t total_cycles = 0;
5119
5120 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5121 continue;
5122 }
5123
5124 /* Get the total pmd cycles for an interval. */
5125 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5126 /* Estimate the cycles to cover all intervals. */
5127 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5128 for (int id = 0; id < num_pmds; id++) {
5129 if (pmd->core_id == core_list[id]) {
5130 if (pmd_usage[id]) {
5131 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5132 }
5133 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5134 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5135 }
5136 }
5137 }
5138 *predicted_variance = variance(pmd_usage, num_pmds);
5139 ret = true;
5140
5141cleanup:
5142 rr_numa_list_destroy(&rr);
5143 free(rxqs);
5144 free(pmd_usage);
5145 return ret;
5146}
5147
5148/* Does the dry run of Rxq assignment to PMDs and returns true if it gives
5149 * better distribution of load on PMDs. */
5150static bool
5151pmd_rebalance_dry_run(struct dp_netdev *dp)
5152 OVS_REQUIRES(dp->port_mutex)
5153{
5154 struct dp_netdev_pmd_thread *pmd;
5155 uint64_t *curr_pmd_usage;
5156
5157 uint64_t curr_variance;
5158 uint64_t new_variance;
5159 uint64_t improvement = 0;
5160 uint32_t num_pmds;
5161 uint32_t *pmd_corelist;
eef85380 5162 struct rxq_poll *poll;
5bf84282
NK
5163 bool ret;
5164
5165 num_pmds = cmap_count(&dp->poll_threads);
5166
5167 if (num_pmds > 1) {
5168 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5169 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5170 } else {
5171 return false;
5172 }
5173
5174 num_pmds = 0;
5175 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5176 uint64_t total_cycles = 0;
5177 uint64_t total_proc = 0;
5178
5179 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5180 continue;
5181 }
5182
5183 /* Get the total pmd cycles for an interval. */
5184 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5185 /* Estimate the cycles to cover all intervals. */
5186 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5187
eef85380
IM
5188 ovs_mutex_lock(&pmd->port_mutex);
5189 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5bf84282 5190 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
eef85380 5191 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5bf84282 5192 }
5bf84282 5193 }
eef85380
IM
5194 ovs_mutex_unlock(&pmd->port_mutex);
5195
5bf84282
NK
5196 if (total_proc) {
5197 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5198 }
5199
5200 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5201 pmd->core_id, curr_pmd_usage[num_pmds]);
5202
5203 if (atomic_count_get(&pmd->pmd_overloaded)) {
5204 atomic_count_set(&pmd->pmd_overloaded, 0);
5205 }
5206
5207 pmd_corelist[num_pmds] = pmd->core_id;
5208 num_pmds++;
5209 }
5210
5211 curr_variance = variance(curr_pmd_usage, num_pmds);
5212 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5213
5214 if (ret) {
5215 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5216 " Predicted PMD variance: %"PRIu64"",
5217 curr_variance, new_variance);
5218
5219 if (new_variance < curr_variance) {
5220 improvement =
5221 ((curr_variance - new_variance) * 100) / curr_variance;
5222 }
5223 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5224 ret = false;
5225 }
5226 }
5227
5228 free(curr_pmd_usage);
5229 free(pmd_corelist);
5230 return ret;
5231}
5232
5233
a36de779
PS
5234/* Return true if needs to revalidate datapath flows. */
5235static bool
e4cfed38
PS
5236dpif_netdev_run(struct dpif *dpif)
5237{
5238 struct dp_netdev_port *port;
5239 struct dp_netdev *dp = get_dp_netdev(dpif);
546e57d4 5240 struct dp_netdev_pmd_thread *non_pmd;
a36de779 5241 uint64_t new_tnl_seq;
c71ea3c4 5242 bool need_to_flush = true;
5bf84282
NK
5243 bool pmd_rebalance = false;
5244 long long int now = time_msec();
5245 struct dp_netdev_pmd_thread *pmd;
e4cfed38 5246
e9985d6a 5247 ovs_mutex_lock(&dp->port_mutex);
546e57d4
DDP
5248 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5249 if (non_pmd) {
5250 ovs_mutex_lock(&dp->non_pmd_mutex);
5251 HMAP_FOR_EACH (port, node, &dp->ports) {
5252 if (!netdev_is_pmd(port->netdev)) {
5253 int i;
55c955bd 5254
2fbadeb6
IM
5255 if (port->emc_enabled) {
5256 atomic_read_relaxed(&dp->emc_insert_min,
5257 &non_pmd->ctx.emc_insert_min);
5258 } else {
5259 non_pmd->ctx.emc_insert_min = 0;
5260 }
5261
546e57d4 5262 for (i = 0; i < port->n_rxq; i++) {
35c91567
DM
5263
5264 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5265 continue;
5266 }
5267
c71ea3c4
IM
5268 if (dp_netdev_process_rxq_port(non_pmd,
5269 &port->rxqs[i],
5270 port->port_no)) {
5271 need_to_flush = false;
5272 }
546e57d4 5273 }
55c955bd 5274 }
e4cfed38 5275 }
c71ea3c4
IM
5276 if (need_to_flush) {
5277 /* We didn't receive anything in the process loop.
5278 * Check if we need to send something.
5279 * There was no time updates on current iteration. */
5280 pmd_thread_ctx_time_update(non_pmd);
5281 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5282 }
5283
b010be17 5284 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
546e57d4 5285 ovs_mutex_unlock(&dp->non_pmd_mutex);
6e3c6fa4 5286
546e57d4
DDP
5287 dp_netdev_pmd_unref(non_pmd);
5288 }
1c1e46ed 5289
5bf84282
NK
5290 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5291 if (pmd_alb->is_enabled) {
5292 if (!pmd_alb->rebalance_poll_timer) {
5293 pmd_alb->rebalance_poll_timer = now;
5294 } else if ((pmd_alb->rebalance_poll_timer +
5295 pmd_alb->rebalance_intvl) < now) {
5296 pmd_alb->rebalance_poll_timer = now;
5297 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5298 if (atomic_count_get(&pmd->pmd_overloaded) >=
5299 PMD_RXQ_INTERVAL_MAX) {
5300 pmd_rebalance = true;
5301 break;
5302 }
5303 }
5304
5305 if (pmd_rebalance &&
5306 !dp_netdev_is_reconf_required(dp) &&
5307 !ports_require_restart(dp) &&
5308 pmd_rebalance_dry_run(dp)) {
5309 VLOG_INFO("PMD auto lb dry run."
5310 " requesting datapath reconfigure.");
5311 dp_netdev_request_reconfigure(dp);
5312 }
5313 }
5314 }
5315
a6a426d6 5316 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
e32971b8 5317 reconfigure_datapath(dp);
6e3c6fa4
DDP
5318 }
5319 ovs_mutex_unlock(&dp->port_mutex);
5320
53902038 5321 tnl_neigh_cache_run();
7f9b8504 5322 tnl_port_map_run();
a36de779
PS
5323 new_tnl_seq = seq_read(tnl_conf_seq);
5324
5325 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5326 dp->last_tnl_conf_seq = new_tnl_seq;
5327 return true;
5328 }
5329 return false;
e4cfed38
PS
5330}
5331
5332static void
5333dpif_netdev_wait(struct dpif *dpif)
5334{
5335 struct dp_netdev_port *port;
5336 struct dp_netdev *dp = get_dp_netdev(dpif);
5337
59e6d833 5338 ovs_mutex_lock(&dp_netdev_mutex);
e9985d6a
DDP
5339 ovs_mutex_lock(&dp->port_mutex);
5340 HMAP_FOR_EACH (port, node, &dp->ports) {
050c60bf 5341 netdev_wait_reconf_required(port->netdev);
55c955bd
PS
5342 if (!netdev_is_pmd(port->netdev)) {
5343 int i;
5344
490e82af 5345 for (i = 0; i < port->n_rxq; i++) {
947dc567 5346 netdev_rxq_wait(port->rxqs[i].rx);
55c955bd 5347 }
e4cfed38
PS
5348 }
5349 }
e9985d6a 5350 ovs_mutex_unlock(&dp->port_mutex);
59e6d833 5351 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 5352 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
5353}
5354
d0cca6c3
DDP
5355static void
5356pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5357{
5358 struct tx_port *tx_port_cached;
5359
c71ea3c4
IM
5360 /* Flush all the queued packets. */
5361 dp_netdev_pmd_flush_output_packets(pmd, true);
324c8374 5362 /* Free all used tx queue ids. */
b010be17 5363 dpif_netdev_xps_revalidate_pmd(pmd, true);
324c8374 5364
57eebbb4
DDP
5365 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5366 free(tx_port_cached);
5367 }
5368 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
d0cca6c3
DDP
5369 free(tx_port_cached);
5370 }
5371}
5372
5373/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
899363ed
BB
5374 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5375 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5376 * one txq. */
d0cca6c3
DDP
5377static void
5378pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5379 OVS_REQUIRES(pmd->port_mutex)
5380{
5381 struct tx_port *tx_port, *tx_port_cached;
5382
5383 pmd_free_cached_ports(pmd);
57eebbb4
DDP
5384 hmap_shrink(&pmd->send_port_cache);
5385 hmap_shrink(&pmd->tnl_port_cache);
d0cca6c3
DDP
5386
5387 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
57eebbb4
DDP
5388 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5389 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5390 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5391 hash_port_no(tx_port_cached->port->port_no));
5392 }
5393
5394 if (netdev_n_txq(tx_port->port->netdev)) {
5395 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5396 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5397 hash_port_no(tx_port_cached->port->port_no));
5398 }
d0cca6c3
DDP
5399 }
5400}
5401
140dd699
IM
5402static void
5403pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5404{
5405 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5406 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5407 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5408 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5409 }
5410 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5411
5412 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5413 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5414}
5415
5416static void
5417pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5418{
5419 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5420 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5421 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5422}
5423
e4cfed38 5424static int
d0cca6c3 5425pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
947dc567 5426 struct polled_queue **ppoll_list)
e4cfed38 5427{
947dc567 5428 struct polled_queue *poll_list = *ppoll_list;
ae7ad0a1
IM
5429 struct rxq_poll *poll;
5430 int i;
e4cfed38 5431
d0cca6c3 5432 ovs_mutex_lock(&pmd->port_mutex);
947dc567
DDP
5433 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5434 * sizeof *poll_list);
a1fdee13 5435
ae7ad0a1 5436 i = 0;
947dc567 5437 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
922b28d4 5438 poll_list[i].rxq = poll->rxq;
947dc567 5439 poll_list[i].port_no = poll->rxq->port->port_no;
2fbadeb6 5440 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
35c91567
DM
5441 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5442 poll_list[i].change_seq =
5443 netdev_get_change_seq(poll->rxq->port->netdev);
947dc567 5444 i++;
e4cfed38 5445 }
d0cca6c3
DDP
5446
5447 pmd_load_cached_ports(pmd);
5448
5449 ovs_mutex_unlock(&pmd->port_mutex);
e4cfed38 5450
e4cfed38 5451 *ppoll_list = poll_list;
d42f9307 5452 return i;
e4cfed38
PS
5453}
5454
6c3eee82 5455static void *
e4cfed38 5456pmd_thread_main(void *f_)
6c3eee82 5457{
65f13b50 5458 struct dp_netdev_pmd_thread *pmd = f_;
82a48ead 5459 struct pmd_perf_stats *s = &pmd->perf_stats;
e4cfed38 5460 unsigned int lc = 0;
947dc567 5461 struct polled_queue *poll_list;
6d9fead1 5462 bool wait_for_reload = false;
e2cafa86 5463 bool reload_tx_qid;
d42f9307 5464 bool exiting;
6d9fead1 5465 bool reload;
e4cfed38
PS
5466 int poll_cnt;
5467 int i;
a2ac666d 5468 int process_packets = 0;
6c3eee82 5469
e4cfed38
PS
5470 poll_list = NULL;
5471
65f13b50
AW
5472 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5473 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6930c7e0
DDP
5474 ovs_numa_thread_setaffinity_core(pmd->core_id);
5475 dpdk_set_lcore_id(pmd->core_id);
d0cca6c3 5476 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
60d8ccae 5477 dfc_cache_init(&pmd->flow_cache);
140dd699 5478 pmd_alloc_static_tx_qid(pmd);
ae7ad0a1 5479
e2cafa86 5480reload:
5bf84282
NK
5481 atomic_count_init(&pmd->pmd_overloaded, 0);
5482
7dd671f0
MK
5483 /* List port/core affinity */
5484 for (i = 0; i < poll_cnt; i++) {
ce179f11 5485 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
922b28d4
KT
5486 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5487 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
4f5d13e2
KT
5488 /* Reset the rxq current cycles counter. */
5489 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
7dd671f0
MK
5490 }
5491
2788a1b1 5492 if (!poll_cnt) {
6d9fead1
DM
5493 if (wait_for_reload) {
5494 /* Don't sleep, control thread will ask for a reload shortly. */
5495 do {
5496 atomic_read_explicit(&pmd->reload, &reload,
5497 memory_order_acquire);
5498 } while (!reload);
5499 } else {
5500 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5501 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5502 poll_block();
5503 }
2788a1b1 5504 }
2788a1b1
DDP
5505 }
5506
2a2c67b4
KT
5507 pmd->intrvl_tsc_prev = 0;
5508 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
a19896ab 5509 cycles_counter_update(s);
79f36875
JS
5510 /* Protect pmd stats from external clearing while polling. */
5511 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
e4cfed38 5512 for (;;) {
79f36875 5513 uint64_t rx_packets = 0, tx_packets = 0;
c71ea3c4 5514
a19896ab 5515 pmd_perf_start_iteration(s);
79f36875 5516
e4cfed38 5517 for (i = 0; i < poll_cnt; i++) {
2fbadeb6 5518
35c91567
DM
5519 if (!poll_list[i].rxq_enabled) {
5520 continue;
5521 }
5522
2fbadeb6
IM
5523 if (poll_list[i].emc_enabled) {
5524 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5525 &pmd->ctx.emc_insert_min);
5526 } else {
5527 pmd->ctx.emc_insert_min = 0;
5528 }
5529
a2ac666d 5530 process_packets =
a19896ab 5531 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
a2ac666d 5532 poll_list[i].port_no);
79f36875 5533 rx_packets += process_packets;
e4cfed38
PS
5534 }
5535
79f36875 5536 if (!rx_packets) {
c71ea3c4
IM
5537 /* We didn't receive anything in the process loop.
5538 * Check if we need to send something.
5539 * There was no time updates on current iteration. */
5540 pmd_thread_ctx_time_update(pmd);
79f36875 5541 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
c71ea3c4
IM
5542 }
5543
e4cfed38 5544 if (lc++ > 1024) {
e4cfed38 5545 lc = 0;
84067a4c 5546
fbe0962b 5547 coverage_try_clear();
4809891b 5548 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
9dede5cf 5549 if (!ovsrcu_try_quiesce()) {
60d8ccae 5550 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
9dede5cf 5551 }
84067a4c 5552
35c91567
DM
5553 for (i = 0; i < poll_cnt; i++) {
5554 uint64_t current_seq =
5555 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5556 if (poll_list[i].change_seq != current_seq) {
5557 poll_list[i].change_seq = current_seq;
5558 poll_list[i].rxq_enabled =
5559 netdev_rxq_enabled(poll_list[i].rxq->rx);
5560 }
5561 }
6c3eee82 5562 }
68a0625b
DM
5563
5564 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5565 if (OVS_UNLIKELY(reload)) {
5566 break;
5567 }
5568
79f36875
JS
5569 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5570 pmd_perf_metrics_enabled(pmd));
e4cfed38 5571 }
79f36875 5572 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6c3eee82 5573
d0cca6c3 5574 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6d9fead1 5575 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
e2cafa86 5576 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
299c8d61 5577 atomic_read_relaxed(&pmd->exit, &exiting);
d42f9307
DDP
5578 /* Signal here to make sure the pmd finishes
5579 * reloading the updated configuration. */
5580 dp_netdev_pmd_reload_done(pmd);
5581
e2cafa86
DM
5582 if (reload_tx_qid) {
5583 pmd_free_static_tx_qid(pmd);
5584 pmd_alloc_static_tx_qid(pmd);
5585 }
9bbf1c3d 5586
d42f9307 5587 if (!exiting) {
e4cfed38
PS
5588 goto reload;
5589 }
6c3eee82 5590
e2cafa86 5591 pmd_free_static_tx_qid(pmd);
60d8ccae 5592 dfc_cache_uninit(&pmd->flow_cache);
e4cfed38 5593 free(poll_list);
d0cca6c3 5594 pmd_free_cached_ports(pmd);
6c3eee82
BP
5595 return NULL;
5596}
5597
6b31e073
RW
5598static void
5599dp_netdev_disable_upcall(struct dp_netdev *dp)
5600 OVS_ACQUIRES(dp->upcall_rwlock)
5601{
5602 fat_rwlock_wrlock(&dp->upcall_rwlock);
5603}
5604
5dddf960
JR
5605\f
5606/* Meters */
5607static void
5608dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5609 struct ofputil_meter_features *features)
5610{
4b27db64
JR
5611 features->max_meters = MAX_METERS;
5612 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5613 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5614 features->max_bands = MAX_BANDS;
5dddf960
JR
5615 features->max_color = 0;
5616}
5617
425a7b9e
JP
5618/* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5619 * that exceed a band are dropped in-place. */
4b27db64
JR
5620static void
5621dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5622 uint32_t meter_id, long long int now)
5623{
5624 struct dp_meter *meter;
5625 struct dp_meter_band *band;
79c81260 5626 struct dp_packet *packet;
4b27db64
JR
5627 long long int long_delta_t; /* msec */
5628 uint32_t delta_t; /* msec */
79c81260 5629 const size_t cnt = dp_packet_batch_size(packets_);
4b27db64
JR
5630 uint32_t bytes, volume;
5631 int exceeded_band[NETDEV_MAX_BURST];
5632 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5633 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5634
5635 if (meter_id >= MAX_METERS) {
5636 return;
5637 }
5638
5639 meter_lock(dp, meter_id);
5640 meter = dp->meters[meter_id];
5641 if (!meter) {
5642 goto out;
5643 }
5644
5645 /* Initialize as negative values. */
5646 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5647 /* Initialize as zeroes. */
5648 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5649
5650 /* All packets will hit the meter at the same time. */
42697ca7 5651 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
4b27db64 5652
acc5df0e
IM
5653 if (long_delta_t < 0) {
5654 /* This condition means that we have several threads fighting for a
5655 meter lock, and the one who received the packets a bit later wins.
5656 Assuming that all racing threads received packets at the same time
5657 to avoid overflow. */
5658 long_delta_t = 0;
5659 }
5660
4b27db64
JR
5661 /* Make sure delta_t will not be too large, so that bucket will not
5662 * wrap around below. */
5663 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5664 ? meter->max_delta_t : (uint32_t)long_delta_t;
5665
5666 /* Update meter stats. */
5667 meter->used = now;
5668 meter->packet_count += cnt;
5669 bytes = 0;
e883448e 5670 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
79c81260 5671 bytes += dp_packet_size(packet);
4b27db64
JR
5672 }
5673 meter->byte_count += bytes;
5674
5675 /* Meters can operate in terms of packets per second or kilobits per
5676 * second. */
5677 if (meter->flags & OFPMF13_PKTPS) {
5678 /* Rate in packets/second, bucket 1/1000 packets. */
5679 /* msec * packets/sec = 1/1000 packets. */
5680 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5681 } else {
5682 /* Rate in kbps, bucket in bits. */
5683 /* msec * kbps = bits */
5684 volume = bytes * 8;
5685 }
5686
5687 /* Update all bands and find the one hit with the highest rate for each
5688 * packet (if any). */
5689 for (int m = 0; m < meter->n_bands; ++m) {
5690 band = &meter->bands[m];
5691
5692 /* Update band's bucket. */
5693 band->bucket += delta_t * band->up.rate;
5694 if (band->bucket > band->up.burst_size) {
5695 band->bucket = band->up.burst_size;
5696 }
5697
5698 /* Drain the bucket for all the packets, if possible. */
5699 if (band->bucket >= volume) {
5700 band->bucket -= volume;
5701 } else {
5702 int band_exceeded_pkt;
5703
5704 /* Band limit hit, must process packet-by-packet. */
5705 if (meter->flags & OFPMF13_PKTPS) {
5706 band_exceeded_pkt = band->bucket / 1000;
5707 band->bucket %= 1000; /* Remainder stays in bucket. */
5708
5709 /* Update the exceeding band for each exceeding packet.
5710 * (Only one band will be fired by a packet, and that
5711 * can be different for each packet.) */
e883448e 5712 for (int i = band_exceeded_pkt; i < cnt; i++) {
4b27db64
JR
5713 if (band->up.rate > exceeded_rate[i]) {
5714 exceeded_rate[i] = band->up.rate;
5715 exceeded_band[i] = m;
5716 }
5717 }
5718 } else {
5719 /* Packet sizes differ, must process one-by-one. */
5720 band_exceeded_pkt = cnt;
e883448e 5721 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
79c81260 5722 uint32_t bits = dp_packet_size(packet) * 8;
4b27db64
JR
5723
5724 if (band->bucket >= bits) {
5725 band->bucket -= bits;
5726 } else {
5727 if (i < band_exceeded_pkt) {
5728 band_exceeded_pkt = i;
5729 }
5730 /* Update the exceeding band for the exceeding packet.
5731 * (Only one band will be fired by a packet, and that
5732 * can be different for each packet.) */
5733 if (band->up.rate > exceeded_rate[i]) {
5734 exceeded_rate[i] = band->up.rate;
5735 exceeded_band[i] = m;
5736 }
5737 }
5738 }
5739 }
5740 /* Remember the first exceeding packet. */
5741 if (exceeded_pkt > band_exceeded_pkt) {
5742 exceeded_pkt = band_exceeded_pkt;
5743 }
5744 }
5745 }
5746
425a7b9e
JP
5747 /* Fire the highest rate band exceeded by each packet, and drop
5748 * packets if needed. */
4b27db64 5749 size_t j;
79c81260 5750 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
4b27db64
JR
5751 if (exceeded_band[j] >= 0) {
5752 /* Meter drop packet. */
5753 band = &meter->bands[exceeded_band[j]];
5754 band->packet_count += 1;
5755 band->byte_count += dp_packet_size(packet);
5756
5757 dp_packet_delete(packet);
5758 } else {
5759 /* Meter accepts packet. */
5760 dp_packet_batch_refill(packets_, packet, j);
5761 }
5762 }
5763 out:
5764 meter_unlock(dp, meter_id);
5765}
5766
5767/* Meter set/get/del processing is still single-threaded. */
5dddf960 5768static int
8101f03f 5769dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
4b27db64 5770 struct ofputil_meter_config *config)
5dddf960 5771{
4b27db64 5772 struct dp_netdev *dp = get_dp_netdev(dpif);
8101f03f 5773 uint32_t mid = meter_id.uint32;
4b27db64
JR
5774 struct dp_meter *meter;
5775 int i;
5776
4b27db64
JR
5777 if (mid >= MAX_METERS) {
5778 return EFBIG; /* Meter_id out of range. */
5779 }
5780
6508c845 5781 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
4b27db64
JR
5782 return EBADF; /* Unsupported flags set */
5783 }
2029ce9a 5784
6508c845
JP
5785 if (config->n_bands > MAX_BANDS) {
5786 return EINVAL;
2029ce9a
AVA
5787 }
5788
4b27db64
JR
5789 for (i = 0; i < config->n_bands; ++i) {
5790 switch (config->bands[i].type) {
5791 case OFPMBT13_DROP:
5792 break;
5793 default:
5794 return ENODEV; /* Unsupported band type */
5795 }
5796 }
5797
5798 /* Allocate meter */
5799 meter = xzalloc(sizeof *meter
5800 + config->n_bands * sizeof(struct dp_meter_band));
4b27db64 5801
d0db81ea
JP
5802 meter->flags = config->flags;
5803 meter->n_bands = config->n_bands;
5804 meter->max_delta_t = 0;
5805 meter->used = time_usec();
4b27db64 5806
d0db81ea
JP
5807 /* set up bands */
5808 for (i = 0; i < config->n_bands; ++i) {
5809 uint32_t band_max_delta_t;
4b27db64 5810
d0db81ea
JP
5811 /* Set burst size to a workable value if none specified. */
5812 if (config->bands[i].burst_size == 0) {
5813 config->bands[i].burst_size = config->bands[i].rate;
5814 }
5815
5816 meter->bands[i].up = config->bands[i];
5817 /* Convert burst size to the bucket units: */
5818 /* pkts => 1/1000 packets, kilobits => bits. */
5819 meter->bands[i].up.burst_size *= 1000;
5820 /* Initialize bucket to empty. */
5821 meter->bands[i].bucket = 0;
5822
5823 /* Figure out max delta_t that is enough to fill any bucket. */
5824 band_max_delta_t
5825 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5826 if (band_max_delta_t > meter->max_delta_t) {
5827 meter->max_delta_t = band_max_delta_t;
5828 }
4b27db64 5829 }
d0db81ea
JP
5830
5831 meter_lock(dp, mid);
5832 dp_delete_meter(dp, mid); /* Free existing meter, if any */
5833 dp->meters[mid] = meter;
5834 meter_unlock(dp, mid);
5835
5836 return 0;
5dddf960
JR
5837}
5838
5839static int
4b27db64
JR
5840dpif_netdev_meter_get(const struct dpif *dpif,
5841 ofproto_meter_id meter_id_,
5842 struct ofputil_meter_stats *stats, uint16_t n_bands)
5dddf960 5843{
4b27db64 5844 const struct dp_netdev *dp = get_dp_netdev(dpif);
4b27db64 5845 uint32_t meter_id = meter_id_.uint32;
866bc756 5846 int retval = 0;
4b27db64
JR
5847
5848 if (meter_id >= MAX_METERS) {
5849 return EFBIG;
5850 }
866bc756
JP
5851
5852 meter_lock(dp, meter_id);
5853 const struct dp_meter *meter = dp->meters[meter_id];
4b27db64 5854 if (!meter) {
866bc756
JP
5855 retval = ENOENT;
5856 goto done;
4b27db64
JR
5857 }
5858 if (stats) {
5859 int i = 0;
5860
4b27db64
JR
5861 stats->packet_in_count = meter->packet_count;
5862 stats->byte_in_count = meter->byte_count;
5863
5864 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5865 stats->bands[i].packet_count = meter->bands[i].packet_count;
5866 stats->bands[i].byte_count = meter->bands[i].byte_count;
5867 }
4b27db64
JR
5868
5869 stats->n_bands = i;
5870 }
866bc756
JP
5871
5872done:
5873 meter_unlock(dp, meter_id);
5874 return retval;
5dddf960
JR
5875}
5876
5877static int
4b27db64
JR
5878dpif_netdev_meter_del(struct dpif *dpif,
5879 ofproto_meter_id meter_id_,
5880 struct ofputil_meter_stats *stats, uint16_t n_bands)
5dddf960 5881{
4b27db64
JR
5882 struct dp_netdev *dp = get_dp_netdev(dpif);
5883 int error;
5884
5885 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5886 if (!error) {
5887 uint32_t meter_id = meter_id_.uint32;
5888
5889 meter_lock(dp, meter_id);
5890 dp_delete_meter(dp, meter_id);
5891 meter_unlock(dp, meter_id);
4b27db64
JR
5892 }
5893 return error;
5dddf960
JR
5894}
5895
5896\f
6b31e073
RW
5897static void
5898dpif_netdev_disable_upcall(struct dpif *dpif)
5899 OVS_NO_THREAD_SAFETY_ANALYSIS
5900{
5901 struct dp_netdev *dp = get_dp_netdev(dpif);
5902 dp_netdev_disable_upcall(dp);
5903}
5904
5905static void
5906dp_netdev_enable_upcall(struct dp_netdev *dp)
5907 OVS_RELEASES(dp->upcall_rwlock)
5908{
5909 fat_rwlock_unlock(&dp->upcall_rwlock);
5910}
5911
5912static void
5913dpif_netdev_enable_upcall(struct dpif *dpif)
5914 OVS_NO_THREAD_SAFETY_ANALYSIS
5915{
5916 struct dp_netdev *dp = get_dp_netdev(dpif);
5917 dp_netdev_enable_upcall(dp);
5918}
5919
ae7ad0a1 5920static void
accf8626
AW
5921dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
5922{
6d9fead1 5923 atomic_store_relaxed(&pmd->wait_for_reload, false);
e2cafa86 5924 atomic_store_relaxed(&pmd->reload_tx_qid, false);
2788a1b1 5925 pmd->last_reload_seq = seq_read(pmd->reload_seq);
8f077b31 5926 atomic_store_explicit(&pmd->reload, false, memory_order_release);
accf8626
AW
5927}
5928
1c1e46ed 5929/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
546e57d4
DDP
5930 * the pointer if succeeds, otherwise, NULL (it can return NULL even if
5931 * 'core_id' is NON_PMD_CORE_ID).
1c1e46ed
AW
5932 *
5933 * Caller must unrefs the returned reference. */
65f13b50 5934static struct dp_netdev_pmd_thread *
bd5131ba 5935dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
5936{
5937 struct dp_netdev_pmd_thread *pmd;
55847abe 5938 const struct cmap_node *pnode;
65f13b50 5939
b19befae 5940 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
5941 if (!pnode) {
5942 return NULL;
5943 }
65f13b50
AW
5944 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
5945
1c1e46ed 5946 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
5947}
5948
f2eee189
AW
5949/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
5950static void
5951dp_netdev_set_nonpmd(struct dp_netdev *dp)
e9985d6a 5952 OVS_REQUIRES(dp->port_mutex)
f2eee189
AW
5953{
5954 struct dp_netdev_pmd_thread *non_pmd;
5955
5956 non_pmd = xzalloc(sizeof *non_pmd);
00873463 5957 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
f2eee189
AW
5958}
5959
1c1e46ed
AW
5960/* Caller must have valid pointer to 'pmd'. */
5961static bool
5962dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5963{
5964 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5965}
5966
5967static void
5968dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5969{
5970 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5971 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5972 }
5973}
5974
5975/* Given cmap position 'pos', tries to ref the next node. If try_ref()
5976 * fails, keeps checking for next node until reaching the end of cmap.
5977 *
5978 * Caller must unrefs the returned reference. */
5979static struct dp_netdev_pmd_thread *
5980dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5981{
5982 struct dp_netdev_pmd_thread *next;
5983
5984 do {
5985 struct cmap_node *node;
5986
5987 node = cmap_next_position(&dp->poll_threads, pos);
5988 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5989 : NULL;
5990 } while (next && !dp_netdev_pmd_try_ref(next));
5991
5992 return next;
5993}
5994
65f13b50 5995/* Configures the 'pmd' based on the input argument. */
6c3eee82 5996static void
65f13b50 5997dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
00873463 5998 unsigned core_id, int numa_id)
65f13b50
AW
5999{
6000 pmd->dp = dp;
65f13b50
AW
6001 pmd->core_id = core_id;
6002 pmd->numa_id = numa_id;
e32971b8 6003 pmd->need_reload = false;
c71ea3c4 6004 pmd->n_output_batches = 0;
1c1e46ed
AW
6005
6006 ovs_refcount_init(&pmd->ref_cnt);
299c8d61 6007 atomic_init(&pmd->exit, false);
2788a1b1
DDP
6008 pmd->reload_seq = seq_create();
6009 pmd->last_reload_seq = seq_read(pmd->reload_seq);
14e3e12a 6010 atomic_init(&pmd->reload, false);
1c1e46ed 6011 ovs_mutex_init(&pmd->flow_mutex);
d0cca6c3 6012 ovs_mutex_init(&pmd->port_mutex);
1c1e46ed 6013 cmap_init(&pmd->flow_table);
3453b4d6 6014 cmap_init(&pmd->classifiers);
58ed6df0 6015 pmd->ctx.last_rxq = NULL;
b010be17
IM
6016 pmd_thread_ctx_time_update(pmd);
6017 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6018 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
947dc567 6019 hmap_init(&pmd->poll_list);
d0cca6c3 6020 hmap_init(&pmd->tx_ports);
57eebbb4
DDP
6021 hmap_init(&pmd->tnl_port_cache);
6022 hmap_init(&pmd->send_port_cache);
65f13b50
AW
6023 /* init the 'flow_cache' since there is no
6024 * actual thread created for NON_PMD_CORE_ID. */
6025 if (core_id == NON_PMD_CORE_ID) {
60d8ccae 6026 dfc_cache_init(&pmd->flow_cache);
140dd699 6027 pmd_alloc_static_tx_qid(pmd);
65f13b50 6028 }
82a48ead 6029 pmd_perf_stats_init(&pmd->perf_stats);
65f13b50
AW
6030 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6031 hash_int(core_id, 0));
6032}
6033
1c1e46ed
AW
6034static void
6035dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6036{
3453b4d6
JS
6037 struct dpcls *cls;
6038
1c1e46ed 6039 dp_netdev_pmd_flow_flush(pmd);
57eebbb4
DDP
6040 hmap_destroy(&pmd->send_port_cache);
6041 hmap_destroy(&pmd->tnl_port_cache);
d0cca6c3 6042 hmap_destroy(&pmd->tx_ports);
947dc567 6043 hmap_destroy(&pmd->poll_list);
3453b4d6
JS
6044 /* All flows (including their dpcls_rules) have been deleted already */
6045 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6046 dpcls_destroy(cls);
7c269972 6047 ovsrcu_postpone(free, cls);
3453b4d6
JS
6048 }
6049 cmap_destroy(&pmd->classifiers);
1c1e46ed
AW
6050 cmap_destroy(&pmd->flow_table);
6051 ovs_mutex_destroy(&pmd->flow_mutex);
2788a1b1 6052 seq_destroy(pmd->reload_seq);
d0cca6c3 6053 ovs_mutex_destroy(&pmd->port_mutex);
1c1e46ed
AW
6054 free(pmd);
6055}
6056
6057/* Stops the pmd thread, removes it from the 'dp->poll_threads',
6058 * and unrefs the struct. */
65f13b50 6059static void
e4e74c3a 6060dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6c3eee82 6061{
d0cca6c3
DDP
6062 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6063 * but extra cleanup is necessary */
65f13b50 6064 if (pmd->core_id == NON_PMD_CORE_ID) {
febf4a7a 6065 ovs_mutex_lock(&dp->non_pmd_mutex);
60d8ccae 6066 dfc_cache_uninit(&pmd->flow_cache);
d0cca6c3 6067 pmd_free_cached_ports(pmd);
140dd699 6068 pmd_free_static_tx_qid(pmd);
febf4a7a 6069 ovs_mutex_unlock(&dp->non_pmd_mutex);
65f13b50 6070 } else {
299c8d61 6071 atomic_store_relaxed(&pmd->exit, true);
65f13b50 6072 dp_netdev_reload_pmd__(pmd);
65f13b50
AW
6073 xpthread_join(pmd->thread, NULL);
6074 }
ae7ad0a1 6075
d0cca6c3 6076 dp_netdev_pmd_clear_ports(pmd);
ae7ad0a1 6077
e4e74c3a
AW
6078 /* Purges the 'pmd''s flows after stopping the thread, but before
6079 * destroying the flows, so that the flow stats can be collected. */
6080 if (dp->dp_purge_cb) {
6081 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6082 }
65f13b50 6083 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 6084 dp_netdev_pmd_unref(pmd);
65f13b50 6085}
6c3eee82 6086
e32971b8
DDP
6087/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6088 * thread. */
65f13b50 6089static void
e32971b8 6090dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
65f13b50
AW
6091{
6092 struct dp_netdev_pmd_thread *pmd;
d916785c
DDP
6093 struct dp_netdev_pmd_thread **pmd_list;
6094 size_t k = 0, n_pmds;
6095
e32971b8 6096 n_pmds = cmap_count(&dp->poll_threads);
d916785c 6097 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
65f13b50
AW
6098
6099 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
e32971b8 6100 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
b9584f21
DDP
6101 continue;
6102 }
d916785c
DDP
6103 /* We cannot call dp_netdev_del_pmd(), since it alters
6104 * 'dp->poll_threads' (while we're iterating it) and it
6105 * might quiesce. */
6106 ovs_assert(k < n_pmds);
6107 pmd_list[k++] = pmd;
6c3eee82 6108 }
d916785c
DDP
6109
6110 for (size_t i = 0; i < k; i++) {
6111 dp_netdev_del_pmd(dp, pmd_list[i]);
6112 }
6113 free(pmd_list);
65f13b50 6114}
6c3eee82 6115
d0cca6c3
DDP
6116/* Deletes all rx queues from pmd->poll_list and all the ports from
6117 * pmd->tx_ports. */
cc245ce8 6118static void
d0cca6c3 6119dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
6120{
6121 struct rxq_poll *poll;
d0cca6c3 6122 struct tx_port *port;
cc245ce8 6123
d0cca6c3 6124 ovs_mutex_lock(&pmd->port_mutex);
947dc567 6125 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
cc245ce8
IM
6126 free(poll);
6127 }
d0cca6c3
DDP
6128 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6129 free(port);
6130 }
6131 ovs_mutex_unlock(&pmd->port_mutex);
cc245ce8
IM
6132}
6133
e32971b8 6134/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
b68872d8 6135static void
e32971b8
DDP
6136dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6137 struct dp_netdev_rxq *rxq)
6138 OVS_REQUIRES(pmd->port_mutex)
b68872d8 6139{
e32971b8
DDP
6140 int qid = netdev_rxq_get_queue_id(rxq->rx);
6141 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6142 struct rxq_poll *poll;
b68872d8 6143
e32971b8
DDP
6144 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6145 if (poll->rxq == rxq) {
6146 /* 'rxq' is already polled by this thread. Do nothing. */
6147 return;
d0cca6c3 6148 }
cc245ce8 6149 }
cc245ce8 6150
e32971b8
DDP
6151 poll = xmalloc(sizeof *poll);
6152 poll->rxq = rxq;
6153 hmap_insert(&pmd->poll_list, &poll->node, hash);
b68872d8 6154
e32971b8 6155 pmd->need_reload = true;
ae7ad0a1
IM
6156}
6157
e32971b8 6158/* Delete 'poll' from poll_list of PMD thread. */
ae7ad0a1 6159static void
e32971b8
DDP
6160dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6161 struct rxq_poll *poll)
d0cca6c3 6162 OVS_REQUIRES(pmd->port_mutex)
ae7ad0a1 6163{
e32971b8
DDP
6164 hmap_remove(&pmd->poll_list, &poll->node);
6165 free(poll);
ae7ad0a1 6166
e32971b8 6167 pmd->need_reload = true;
ae7ad0a1
IM
6168}
6169
d0cca6c3
DDP
6170/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6171 * changes to take effect. */
cc245ce8 6172static void
d0cca6c3
DDP
6173dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6174 struct dp_netdev_port *port)
e32971b8 6175 OVS_REQUIRES(pmd->port_mutex)
d0cca6c3 6176{
57eebbb4
DDP
6177 struct tx_port *tx;
6178
e32971b8
DDP
6179 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6180 if (tx) {
6181 /* 'port' is already on this thread tx cache. Do nothing. */
6182 return;
6183 }
6184
57eebbb4 6185 tx = xzalloc(sizeof *tx);
d0cca6c3 6186
324c8374
IM
6187 tx->port = port;
6188 tx->qid = -1;
c71ea3c4 6189 tx->flush_time = 0LL;
009e0033 6190 dp_packet_batch_init(&tx->output_pkts);
d0cca6c3 6191
324c8374 6192 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
e32971b8 6193 pmd->need_reload = true;
d0cca6c3
DDP
6194}
6195
e32971b8
DDP
6196/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6197 * changes to take effect. */
b9584f21 6198static void
e32971b8
DDP
6199dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6200 struct tx_port *tx)
6201 OVS_REQUIRES(pmd->port_mutex)
b9584f21 6202{
e32971b8
DDP
6203 hmap_remove(&pmd->tx_ports, &tx->node);
6204 free(tx);
6205 pmd->need_reload = true;
6c3eee82
BP
6206}
6207\f
b5cbbcf6
AZ
6208static char *
6209dpif_netdev_get_datapath_version(void)
6210{
6211 return xstrdup("<built-in>");
6212}
6213
72865317 6214static void
1c1e46ed 6215dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 6216 uint16_t tcp_flags, long long now)
72865317 6217{
eb94da30 6218 uint16_t flags;
72865317 6219
eb94da30
DDP
6220 atomic_store_relaxed(&netdev_flow->stats.used, now);
6221 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6222 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6223 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6224 flags |= tcp_flags;
6225 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
6226}
6227
623540e4 6228static int
e14deea0 6229dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 6230 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
6231 enum dpif_upcall_type type, const struct nlattr *userdata,
6232 struct ofpbuf *actions, struct ofpbuf *put_actions)
6233{
1c1e46ed 6234 struct dp_netdev *dp = pmd->dp;
623540e4 6235
623540e4
EJ
6236 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6237 return ENODEV;
6238 }
6239
6240 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6241 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 6242 char *packet_str;
cf62fa4c 6243 struct ofpbuf key;
5262eea1
JG
6244 struct odp_flow_key_parms odp_parms = {
6245 .flow = flow,
1dea1435 6246 .mask = wc ? &wc->masks : NULL,
2494ccd7 6247 .support = dp_netdev_support,
5262eea1 6248 };
623540e4
EJ
6249
6250 ofpbuf_init(&key, 0);
5262eea1 6251 odp_flow_key_from_flow(&odp_parms, &key);
2482b0b0 6252 packet_str = ofp_dp_packet_to_string(packet_);
623540e4 6253
6fd6ed71 6254 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
6255
6256 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6257 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6258
6259 ofpbuf_uninit(&key);
6260 free(packet_str);
6fd6ed71 6261
623540e4
EJ
6262 ds_destroy(&ds);
6263 }
6264
8d8ab6c2
JG
6265 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6266 actions, wc, put_actions, dp->upcall_aux);
623540e4
EJ
6267}
6268
bde94613
FA
6269static inline uint32_t
6270dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6271 const struct miniflow *mf)
6272{
6273 uint32_t hash;
6274
6275 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6276 hash = dp_packet_get_rss_hash(packet);
6277 } else {
6278 hash = miniflow_hash_5tuple(mf, 0);
6279 dp_packet_set_rss_hash(packet, hash);
6280 }
6281
6282 return hash;
6283}
6284
9bbf1c3d 6285static inline uint32_t
048963aa
DDP
6286dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6287 const struct miniflow *mf)
9bbf1c3d 6288{
048963aa 6289 uint32_t hash, recirc_depth;
9bbf1c3d 6290
f2f44f5d
DDP
6291 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6292 hash = dp_packet_get_rss_hash(packet);
6293 } else {
9bbf1c3d 6294 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 6295 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 6296 }
048963aa
DDP
6297
6298 /* The RSS hash must account for the recirculation depth to avoid
6299 * collisions in the exact match cache */
6300 recirc_depth = *recirc_depth_get_unsafe();
6301 if (OVS_UNLIKELY(recirc_depth)) {
6302 hash = hash_finish(hash, recirc_depth);
048963aa 6303 }
9bbf1c3d
DDP
6304 return hash;
6305}
6306
f7ce4811 6307struct packet_batch_per_flow {
8cbf4f47
DDP
6308 unsigned int byte_count;
6309 uint16_t tcp_flags;
8cbf4f47
DDP
6310 struct dp_netdev_flow *flow;
6311
1895cc8d 6312 struct dp_packet_batch array;
8cbf4f47
DDP
6313};
6314
6315static inline void
f7ce4811
PS
6316packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6317 struct dp_packet *packet,
aab96ec4 6318 uint16_t tcp_flags)
8cbf4f47 6319{
cf62fa4c 6320 batch->byte_count += dp_packet_size(packet);
aab96ec4 6321 batch->tcp_flags |= tcp_flags;
940ac2ce 6322 dp_packet_batch_add(&batch->array, packet);
8cbf4f47
DDP
6323}
6324
6325static inline void
f7ce4811
PS
6326packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6327 struct dp_netdev_flow *flow)
8cbf4f47 6328{
11e5cf1f 6329 flow->batch = batch;
8cbf4f47 6330
11e5cf1f 6331 batch->flow = flow;
1895cc8d 6332 dp_packet_batch_init(&batch->array);
8cbf4f47
DDP
6333 batch->byte_count = 0;
6334 batch->tcp_flags = 0;
8cbf4f47
DDP
6335}
6336
6337static inline void
f7ce4811 6338packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
b010be17 6339 struct dp_netdev_pmd_thread *pmd)
8cbf4f47
DDP
6340{
6341 struct dp_netdev_actions *actions;
6342 struct dp_netdev_flow *flow = batch->flow;
6343
940ac2ce
PC
6344 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6345 batch->byte_count,
05f9e707 6346 batch->tcp_flags, pmd->ctx.now / 1000);
8cbf4f47
DDP
6347
6348 actions = dp_netdev_flow_get_actions(flow);
6349
66e4ad8a 6350 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
b010be17 6351 actions->actions, actions->size);
8cbf4f47
DDP
6352}
6353
8aaa125d 6354static inline void
e14deea0 6355dp_netdev_queue_batches(struct dp_packet *pkt,
aab96ec4 6356 struct dp_netdev_flow *flow, uint16_t tcp_flags,
47a45d86
KT
6357 struct packet_batch_per_flow *batches,
6358 size_t *n_batches)
9bbf1c3d 6359{
f7ce4811 6360 struct packet_batch_per_flow *batch = flow->batch;
11e5cf1f 6361
f9fe365b
AZ
6362 if (OVS_UNLIKELY(!batch)) {
6363 batch = &batches[(*n_batches)++];
f7ce4811 6364 packet_batch_per_flow_init(batch, flow);
9bbf1c3d
DDP
6365 }
6366
aab96ec4 6367 packet_batch_per_flow_update(batch, pkt, tcp_flags);
9bbf1c3d
DDP
6368}
6369
9b4f08cd
VDA
6370static inline void
6371packet_enqueue_to_flow_map(struct dp_packet *packet,
6372 struct dp_netdev_flow *flow,
6373 uint16_t tcp_flags,
6374 struct dp_packet_flow_map *flow_map,
6375 size_t index)
6376{
6377 struct dp_packet_flow_map *map = &flow_map[index];
6378 map->flow = flow;
6379 map->packet = packet;
6380 map->tcp_flags = tcp_flags;
6381}
6382
60d8ccae
YW
6383/* SMC lookup function for a batch of packets.
6384 * By doing batching SMC lookup, we can use prefetch
6385 * to hide memory access latency.
6386 */
6387static inline void
6388smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6389 struct netdev_flow_key *keys,
6390 struct netdev_flow_key **missed_keys,
6391 struct dp_packet_batch *packets_,
9b4f08cd
VDA
6392 const int cnt,
6393 struct dp_packet_flow_map *flow_map,
6394 uint8_t *index_map)
60d8ccae
YW
6395{
6396 int i;
6397 struct dp_packet *packet;
6398 size_t n_smc_hit = 0, n_missed = 0;
6399 struct dfc_cache *cache = &pmd->flow_cache;
6400 struct smc_cache *smc_cache = &cache->smc_cache;
6401 const struct cmap_node *flow_node;
9b4f08cd
VDA
6402 int recv_idx;
6403 uint16_t tcp_flags;
60d8ccae
YW
6404
6405 /* Prefetch buckets for all packets */
6406 for (i = 0; i < cnt; i++) {
6407 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6408 }
6409
6410 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6411 struct dp_netdev_flow *flow = NULL;
6412 flow_node = smc_entry_get(pmd, keys[i].hash);
6413 bool hit = false;
9b4f08cd
VDA
6414 /* Get the original order of this packet in received batch. */
6415 recv_idx = index_map[i];
60d8ccae
YW
6416
6417 if (OVS_LIKELY(flow_node != NULL)) {
6418 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6419 /* Since we dont have per-port megaflow to check the port
6420 * number, we need to verify that the input ports match. */
6421 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6422 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
9b4f08cd
VDA
6423 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6424
60d8ccae 6425 /* SMC hit and emc miss, we insert into EMC */
60d8ccae
YW
6426 keys[i].len =
6427 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
18e08953 6428 emc_probabilistic_insert(pmd, &keys[i], flow);
9b4f08cd
VDA
6429 /* Add these packets into the flow map in the same order
6430 * as received.
6431 */
6432 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6433 flow_map, recv_idx);
60d8ccae
YW
6434 n_smc_hit++;
6435 hit = true;
6436 break;
6437 }
6438 }
6439 if (hit) {
6440 continue;
6441 }
6442 }
6443
6444 /* SMC missed. Group missed packets together at
6445 * the beginning of the 'packets' array. */
6446 dp_packet_batch_refill(packets_, packet, i);
9b4f08cd
VDA
6447
6448 /* Preserve the order of packet for flow batching. */
6449 index_map[n_missed] = recv_idx;
6450
60d8ccae
YW
6451 /* Put missed keys to the pointer arrays return to the caller */
6452 missed_keys[n_missed++] = &keys[i];
6453 }
6454
6455 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6456}
6457
6458/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
a90ed026 6459 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8aaa125d 6460 * miniflow is copied into 'keys' and the packet pointer is moved at the
60d8ccae
YW
6461 * beginning of the 'packets' array. The pointers of missed keys are put in the
6462 * missed_keys pointer array for future processing.
9bbf1c3d
DDP
6463 *
6464 * The function returns the number of packets that needs to be processed in the
6465 * 'packets' array (they have been moved to the beginning of the vector).
a90ed026 6466 *
02305520
FA
6467 * For performance reasons a caller may choose not to initialize the metadata
6468 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6469 * is not valid and must be initialized by this function using 'port_no'.
6470 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6471 * will be ignored.
9bbf1c3d
DDP
6472 */
6473static inline size_t
60d8ccae 6474dfc_processing(struct dp_netdev_pmd_thread *pmd,
72c84bc2 6475 struct dp_packet_batch *packets_,
1895cc8d 6476 struct netdev_flow_key *keys,
60d8ccae 6477 struct netdev_flow_key **missed_keys,
f7ce4811 6478 struct packet_batch_per_flow batches[], size_t *n_batches,
9b4f08cd
VDA
6479 struct dp_packet_flow_map *flow_map,
6480 size_t *n_flows, uint8_t *index_map,
a90ed026 6481 bool md_is_valid, odp_port_t port_no)
72865317 6482{
b89c678b 6483 struct netdev_flow_key *key = &keys[0];
60d8ccae
YW
6484 size_t n_missed = 0, n_emc_hit = 0;
6485 struct dfc_cache *cache = &pmd->flow_cache;
72c84bc2 6486 struct dp_packet *packet;
45df9fef 6487 const size_t cnt = dp_packet_batch_size(packets_);
2fbadeb6 6488 uint32_t cur_min = pmd->ctx.emc_insert_min;
72c84bc2 6489 int i;
aab96ec4 6490 uint16_t tcp_flags;
60d8ccae 6491 bool smc_enable_db;
9b4f08cd
VDA
6492 size_t map_cnt = 0;
6493 bool batch_enable = true;
8cbf4f47 6494
60d8ccae 6495 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
82a48ead
JS
6496 pmd_perf_update_counter(&pmd->perf_stats,
6497 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6498 cnt);
f79b1ddb 6499
45df9fef 6500 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
9bbf1c3d 6501 struct dp_netdev_flow *flow;
aab96ec4 6502 uint32_t mark;
9bbf1c3d 6503
5a2fed48
AZ
6504 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6505 dp_packet_delete(packet);
84d6d5eb
EJ
6506 continue;
6507 }
8cbf4f47 6508
45df9fef 6509 if (i != cnt - 1) {
72c84bc2 6510 struct dp_packet **packets = packets_->packets;
a90ed026 6511 /* Prefetch next packet data and metadata. */
72a5e2b8 6512 OVS_PREFETCH(dp_packet_data(packets[i+1]));
a90ed026 6513 pkt_metadata_prefetch_init(&packets[i+1]->md);
72a5e2b8
DDP
6514 }
6515
a90ed026
DDP
6516 if (!md_is_valid) {
6517 pkt_metadata_init(&packet->md, port_no);
6518 }
aab96ec4
YL
6519
6520 if ((*recirc_depth_get() == 0) &&
6521 dp_packet_has_flow_mark(packet, &mark)) {
6522 flow = mark_to_flow_find(pmd, mark);
9b4f08cd 6523 if (OVS_LIKELY(flow)) {
aab96ec4 6524 tcp_flags = parse_tcp_flags(packet);
9b4f08cd
VDA
6525 if (OVS_LIKELY(batch_enable)) {
6526 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6527 n_batches);
6528 } else {
6529 /* Flow batching should be performed only after fast-path
6530 * processing is also completed for packets with emc miss
6531 * or else it will result in reordering of packets with
6532 * same datapath flows. */
6533 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6534 flow_map, map_cnt++);
6535 }
aab96ec4
YL
6536 continue;
6537 }
6538 }
6539
5a2fed48 6540 miniflow_extract(packet, &key->mf);
d262ac2c 6541 key->len = 0; /* Not computed yet. */
b137383e
IM
6542 key->hash =
6543 (md_is_valid == false)
6544 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6545 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6546
6547 /* If EMC is disabled skip emc_lookup */
6548 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
8aaa125d 6549 if (OVS_LIKELY(flow)) {
aab96ec4 6550 tcp_flags = miniflow_get_tcp_flags(&key->mf);
60d8ccae 6551 n_emc_hit++;
9b4f08cd
VDA
6552 if (OVS_LIKELY(batch_enable)) {
6553 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6554 n_batches);
6555 } else {
6556 /* Flow batching should be performed only after fast-path
6557 * processing is also completed for packets with emc miss
6558 * or else it will result in reordering of packets with
6559 * same datapath flows. */
6560 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6561 flow_map, map_cnt++);
6562 }
8aaa125d 6563 } else {
d1aa0b94 6564 /* Exact match cache missed. Group missed packets together at
72c84bc2
AZ
6565 * the beginning of the 'packets' array. */
6566 dp_packet_batch_refill(packets_, packet, i);
9b4f08cd
VDA
6567
6568 /* Preserve the order of packet for flow batching. */
6569 index_map[n_missed] = map_cnt;
6570 flow_map[map_cnt++].flow = NULL;
6571
400486f7 6572 /* 'key[n_missed]' contains the key of the current packet and it
60d8ccae
YW
6573 * will be passed to SMC lookup. The next key should be extracted
6574 * to 'keys[n_missed + 1]'.
6575 * We also maintain a pointer array to keys missed both SMC and EMC
6576 * which will be returned to the caller for future processing. */
6577 missed_keys[n_missed] = key;
400486f7 6578 key = &keys[++n_missed];
9b4f08cd
VDA
6579
6580 /* Skip batching for subsequent packets to avoid reordering. */
6581 batch_enable = false;
9bbf1c3d
DDP
6582 }
6583 }
9b4f08cd
VDA
6584 /* Count of packets which are not flow batched. */
6585 *n_flows = map_cnt;
9bbf1c3d 6586
60d8ccae
YW
6587 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6588
6589 if (!smc_enable_db) {
6590 return dp_packet_batch_size(packets_);
6591 }
6592
6593 /* Packets miss EMC will do a batch lookup in SMC if enabled */
9b4f08cd
VDA
6594 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6595 n_missed, flow_map, index_map);
4f150744 6596
72c84bc2 6597 return dp_packet_batch_size(packets_);
9bbf1c3d
DDP
6598}
6599
82a48ead 6600static inline int
47a45d86
KT
6601handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6602 struct dp_packet *packet,
a260d966 6603 const struct netdev_flow_key *key,
82a48ead 6604 struct ofpbuf *actions, struct ofpbuf *put_actions)
a260d966
PS
6605{
6606 struct ofpbuf *add_actions;
6607 struct dp_packet_batch b;
6608 struct match match;
6609 ovs_u128 ufid;
6610 int error;
79f36875 6611 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
a260d966
PS
6612
6613 match.tun_md.valid = false;
6614 miniflow_expand(&key->mf, &match.flow);
c98eedf9 6615 memset(&match.wc, 0, sizeof match.wc);
a260d966
PS
6616
6617 ofpbuf_clear(actions);
6618 ofpbuf_clear(put_actions);
6619
6620 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
6621 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6622 &ufid, DPIF_UC_MISS, NULL, actions,
6623 put_actions);
6624 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6625 dp_packet_delete(packet);
82a48ead 6626 return error;
a260d966
PS
6627 }
6628
6629 /* The Netlink encoding of datapath flow keys cannot express
6630 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6631 * tag is interpreted as exact match on the fact that there is no
6632 * VLAN. Unless we refactor a lot of code that translates between
6633 * Netlink and struct flow representations, we have to do the same
35fe9efb 6634 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
f0fb825a
EG
6635 if (!match.wc.masks.vlans[0].tci) {
6636 match.wc.masks.vlans[0].tci = htons(0xffff);
a260d966
PS
6637 }
6638
6639 /* We can't allow the packet batching in the next loop to execute
6640 * the actions. Otherwise, if there are any slow path actions,
6641 * we'll send the packet up twice. */
72c84bc2 6642 dp_packet_batch_init_packet(&b, packet);
66e4ad8a 6643 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
b010be17 6644 actions->data, actions->size);
a260d966
PS
6645
6646 add_actions = put_actions->size ? put_actions : actions;
6647 if (OVS_LIKELY(error != ENOSPC)) {
6648 struct dp_netdev_flow *netdev_flow;
6649
6650 /* XXX: There's a race window where a flow covering this packet
6651 * could have already been installed since we last did the flow
6652 * lookup before upcall. This could be solved by moving the
6653 * mutex lock outside the loop, but that's an awful long time
af741ca3 6654 * to be locking revalidators out of making flow modifications. */
a260d966 6655 ovs_mutex_lock(&pmd->flow_mutex);
3453b4d6 6656 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
a260d966
PS
6657 if (OVS_LIKELY(!netdev_flow)) {
6658 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6659 add_actions->data,
6660 add_actions->size);
6661 }
6662 ovs_mutex_unlock(&pmd->flow_mutex);
60d8ccae
YW
6663 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6664 smc_insert(pmd, key, hash);
4c30b246 6665 emc_probabilistic_insert(pmd, key, netdev_flow);
a260d966 6666 }
79f36875
JS
6667 if (pmd_perf_metrics_enabled(pmd)) {
6668 /* Update upcall stats. */
6669 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6670 struct pmd_perf_stats *s = &pmd->perf_stats;
6671 s->current.upcalls++;
6672 s->current.upcall_cycles += cycles;
6673 histogram_add_sample(&s->cycles_per_upcall, cycles);
6674 }
82a48ead 6675 return error;
a260d966
PS
6676}
6677
9bbf1c3d 6678static inline void
65f13b50 6679fast_path_processing(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6680 struct dp_packet_batch *packets_,
60d8ccae 6681 struct netdev_flow_key **keys,
9b4f08cd
VDA
6682 struct dp_packet_flow_map *flow_map,
6683 uint8_t *index_map,
b010be17 6684 odp_port_t in_port)
9bbf1c3d 6685{
31c82130 6686 const size_t cnt = dp_packet_batch_size(packets_);
1a0d5831 6687#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
6688 const size_t PKT_ARRAY_SIZE = cnt;
6689#else
1a0d5831 6690 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 6691 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 6692#endif
31c82130 6693 struct dp_packet *packet;
3453b4d6 6694 struct dpcls *cls;
0de8783a 6695 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50 6696 struct dp_netdev *dp = pmd->dp;
82a48ead 6697 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
3453b4d6 6698 int lookup_cnt = 0, add_lookup_cnt;
9bbf1c3d
DDP
6699 bool any_miss;
6700
e883448e 6701 for (size_t i = 0; i < cnt; i++) {
0de8783a 6702 /* Key length is needed in all the cases, hash computed on demand. */
60d8ccae 6703 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
9bbf1c3d 6704 }
3453b4d6
JS
6705 /* Get the classifier for the in_port */
6706 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6707 if (OVS_LIKELY(cls)) {
60d8ccae
YW
6708 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6709 rules, cnt, &lookup_cnt);
3453b4d6
JS
6710 } else {
6711 any_miss = true;
6712 memset(rules, 0, sizeof(rules));
6713 }
623540e4
EJ
6714 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6715 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6716 struct ofpbuf actions, put_actions;
623540e4
EJ
6717
6718 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6719 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6720
e883448e 6721 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
0de8783a 6722 struct dp_netdev_flow *netdev_flow;
623540e4 6723
0de8783a 6724 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
6725 continue;
6726 }
6727
6728 /* It's possible that an earlier slow path execution installed
0de8783a 6729 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 6730 * to catch it here than execute a miss. */
60d8ccae 6731 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
3453b4d6 6732 &add_lookup_cnt);
623540e4 6733 if (netdev_flow) {
3453b4d6 6734 lookup_cnt += add_lookup_cnt;
0de8783a 6735 rules[i] = &netdev_flow->cr;
623540e4
EJ
6736 continue;
6737 }
6738
60d8ccae 6739 int error = handle_packet_upcall(pmd, packet, keys[i],
82a48ead
JS
6740 &actions, &put_actions);
6741
6742 if (OVS_UNLIKELY(error)) {
6743 upcall_fail_cnt++;
6744 } else {
6745 upcall_ok_cnt++;
6746 }
623540e4
EJ
6747 }
6748
6749 ofpbuf_uninit(&actions);
6750 ofpbuf_uninit(&put_actions);
6751 fat_rwlock_unlock(&dp->upcall_rwlock);
ac8c2081 6752 } else if (OVS_UNLIKELY(any_miss)) {
e883448e 6753 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
0de8783a 6754 if (OVS_UNLIKELY(!rules[i])) {
31c82130 6755 dp_packet_delete(packet);
82a48ead 6756 upcall_fail_cnt++;
ac8c2081
DDP
6757 }
6758 }
623540e4 6759 }
84d6d5eb 6760
e883448e 6761 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
84d6d5eb 6762 struct dp_netdev_flow *flow;
9b4f08cd
VDA
6763 /* Get the original order of this packet in received batch. */
6764 int recv_idx = index_map[i];
6765 uint16_t tcp_flags;
8cbf4f47 6766
0de8783a 6767 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
6768 continue;
6769 }
6770
84d6d5eb 6771 flow = dp_netdev_flow_cast(rules[i]);
60d8ccae
YW
6772 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
6773 smc_insert(pmd, keys[i], hash);
0de8783a 6774
60d8ccae 6775 emc_probabilistic_insert(pmd, keys[i], flow);
9b4f08cd
VDA
6776 /* Add these packets into the flow map in the same order
6777 * as received.
6778 */
6779 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6780 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6781 flow_map, recv_idx);
8cbf4f47
DDP
6782 }
6783
82a48ead
JS
6784 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6785 cnt - upcall_ok_cnt - upcall_fail_cnt);
6786 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6787 lookup_cnt);
6788 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6789 upcall_ok_cnt);
6790 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6791 upcall_fail_cnt);
72865317
BP
6792}
6793
a90ed026
DDP
6794/* Packets enter the datapath from a port (or from recirculation) here.
6795 *
02305520
FA
6796 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
6797 * When false the metadata in 'packets' need to be initialized. */
adcf00ba 6798static void
a90ed026 6799dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6800 struct dp_packet_batch *packets,
a90ed026 6801 bool md_is_valid, odp_port_t port_no)
9bbf1c3d 6802{
1a0d5831 6803#if !defined(__CHECKER__) && !defined(_WIN32)
37eabc70 6804 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
9bbf1c3d 6805#else
1a0d5831 6806 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 6807 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 6808#endif
47a45d86
KT
6809 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6810 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
60d8ccae 6811 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
f7ce4811 6812 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
72c84bc2 6813 size_t n_batches;
9b4f08cd
VDA
6814 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6815 uint8_t index_map[PKT_ARRAY_SIZE];
6816 size_t n_flows, i;
6817
3453b4d6 6818 odp_port_t in_port;
9bbf1c3d 6819
8aaa125d 6820 n_batches = 0;
60d8ccae 6821 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
9b4f08cd
VDA
6822 flow_map, &n_flows, index_map, md_is_valid, port_no);
6823
72c84bc2 6824 if (!dp_packet_batch_is_empty(packets)) {
3453b4d6
JS
6825 /* Get ingress port from first packet's metadata. */
6826 in_port = packets->packets[0]->md.in_port.odp_port;
60d8ccae 6827 fast_path_processing(pmd, packets, missed_keys,
9b4f08cd 6828 flow_map, index_map, in_port);
8aaa125d
DDP
6829 }
6830
9b4f08cd
VDA
6831 /* Batch rest of packets which are in flow map. */
6832 for (i = 0; i < n_flows; i++) {
6833 struct dp_packet_flow_map *map = &flow_map[i];
6834
6835 if (OVS_UNLIKELY(!map->flow)) {
6836 continue;
6837 }
6838 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6839 batches, &n_batches);
6840 }
6841
ad9f0581
BB
6842 /* All the flow batches need to be reset before any call to
6843 * packet_batch_per_flow_execute() as it could potentially trigger
6844 * recirculation. When a packet matching flow ‘j’ happens to be
6845 * recirculated, the nested call to dp_netdev_input__() could potentially
6846 * classify the packet as matching another flow - say 'k'. It could happen
6847 * that in the previous call to dp_netdev_input__() that same flow 'k' had
6848 * already its own batches[k] still waiting to be served. So if its
6849 * ‘batch’ member is not reset, the recirculated packet would be wrongly
6850 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
603f2ce0
EJ
6851 for (i = 0; i < n_batches; i++) {
6852 batches[i].flow->batch = NULL;
6853 }
6854
8aaa125d 6855 for (i = 0; i < n_batches; i++) {
b010be17 6856 packet_batch_per_flow_execute(&batches[i], pmd);
9bbf1c3d
DDP
6857 }
6858}
6859
a90ed026
DDP
6860static void
6861dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6862 struct dp_packet_batch *packets,
a90ed026
DDP
6863 odp_port_t port_no)
6864{
3453b4d6 6865 dp_netdev_input__(pmd, packets, false, port_no);
a90ed026
DDP
6866}
6867
6868static void
6869dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6870 struct dp_packet_batch *packets)
a90ed026 6871{
3453b4d6 6872 dp_netdev_input__(pmd, packets, true, 0);
a90ed026
DDP
6873}
6874
9080a111 6875struct dp_netdev_execute_aux {
65f13b50 6876 struct dp_netdev_pmd_thread *pmd;
66e4ad8a 6877 const struct flow *flow;
9080a111
JR
6878};
6879
e4e74c3a
AW
6880static void
6881dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6882 void *aux)
6883{
6884 struct dp_netdev *dp = get_dp_netdev(dpif);
6885 dp->dp_purge_aux = aux;
6886 dp->dp_purge_cb = cb;
6887}
6888
6b31e073 6889static void
623540e4
EJ
6890dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6891 void *aux)
6b31e073
RW
6892{
6893 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 6894 dp->upcall_aux = aux;
6b31e073
RW
6895 dp->upcall_cb = cb;
6896}
6897
324c8374
IM
6898static void
6899dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
b010be17 6900 bool purge)
324c8374
IM
6901{
6902 struct tx_port *tx;
6903 struct dp_netdev_port *port;
6904 long long interval;
6905
57eebbb4 6906 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
9f7a3035 6907 if (!tx->port->dynamic_txqs) {
324c8374
IM
6908 continue;
6909 }
b010be17 6910 interval = pmd->ctx.now - tx->last_used;
05f9e707 6911 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
324c8374
IM
6912 port = tx->port;
6913 ovs_mutex_lock(&port->txq_used_mutex);
6914 port->txq_used[tx->qid]--;
6915 ovs_mutex_unlock(&port->txq_used_mutex);
6916 tx->qid = -1;
6917 }
6918 }
6919}
6920
6921static int
6922dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
b010be17 6923 struct tx_port *tx)
324c8374
IM
6924{
6925 struct dp_netdev_port *port;
6926 long long interval;
6927 int i, min_cnt, min_qid;
6928
b010be17
IM
6929 interval = pmd->ctx.now - tx->last_used;
6930 tx->last_used = pmd->ctx.now;
324c8374 6931
05f9e707 6932 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
324c8374
IM
6933 return tx->qid;
6934 }
6935
6936 port = tx->port;
6937
6938 ovs_mutex_lock(&port->txq_used_mutex);
6939 if (tx->qid >= 0) {
6940 port->txq_used[tx->qid]--;
6941 tx->qid = -1;
6942 }
6943
6944 min_cnt = -1;
6945 min_qid = 0;
6946 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
6947 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
6948 min_cnt = port->txq_used[i];
6949 min_qid = i;
6950 }
6951 }
6952
6953 port->txq_used[min_qid]++;
6954 tx->qid = min_qid;
6955
6956 ovs_mutex_unlock(&port->txq_used_mutex);
6957
b010be17 6958 dpif_netdev_xps_revalidate_pmd(pmd, false);
324c8374
IM
6959
6960 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
6961 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
6962 return min_qid;
6963}
6964
d0cca6c3 6965static struct tx_port *
57eebbb4
DDP
6966pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6967 odp_port_t port_no)
6968{
6969 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
6970}
6971
6972static struct tx_port *
6973pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6974 odp_port_t port_no)
d0cca6c3 6975{
57eebbb4 6976 return tx_port_lookup(&pmd->send_port_cache, port_no);
d0cca6c3
DDP
6977}
6978
a36de779 6979static int
d0cca6c3 6980push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
1895cc8d
PS
6981 const struct nlattr *attr,
6982 struct dp_packet_batch *batch)
a36de779 6983{
d0cca6c3 6984 struct tx_port *tun_port;
a36de779 6985 const struct ovs_action_push_tnl *data;
4c742796 6986 int err;
a36de779
PS
6987
6988 data = nl_attr_get(attr);
6989
81765c00 6990 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
a36de779 6991 if (!tun_port) {
4c742796
PS
6992 err = -EINVAL;
6993 goto error;
a36de779 6994 }
324c8374 6995 err = netdev_push_header(tun_port->port->netdev, batch, data);
4c742796
PS
6996 if (!err) {
6997 return 0;
6998 }
6999error:
7000 dp_packet_delete_batch(batch, true);
7001 return err;
a36de779
PS
7002}
7003
66525ef3
PS
7004static void
7005dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7d7ded7a 7006 struct dp_packet *packet, bool should_steal,
66525ef3
PS
7007 struct flow *flow, ovs_u128 *ufid,
7008 struct ofpbuf *actions,
b010be17 7009 const struct nlattr *userdata)
66525ef3
PS
7010{
7011 struct dp_packet_batch b;
7012 int error;
7013
7014 ofpbuf_clear(actions);
7015
7016 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7017 DPIF_UC_ACTION, userdata, actions,
7018 NULL);
7019 if (!error || error == ENOSPC) {
72c84bc2 7020 dp_packet_batch_init_packet(&b, packet);
7d7ded7a 7021 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
b010be17 7022 actions->data, actions->size);
7d7ded7a 7023 } else if (should_steal) {
66525ef3
PS
7024 dp_packet_delete(packet);
7025 }
7026}
7027
a36de779 7028static void
1895cc8d 7029dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7d7ded7a 7030 const struct nlattr *a, bool should_steal)
4b27db64 7031 OVS_NO_THREAD_SAFETY_ANALYSIS
9080a111
JR
7032{
7033 struct dp_netdev_execute_aux *aux = aux_;
623540e4 7034 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
7035 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7036 struct dp_netdev *dp = pmd->dp;
09f9da0b 7037 int type = nl_attr_type(a);
d0cca6c3 7038 struct tx_port *p;
9080a111 7039
09f9da0b
JR
7040 switch ((enum ovs_action_attr)type) {
7041 case OVS_ACTION_ATTR_OUTPUT:
57eebbb4 7042 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
26a5075b 7043 if (OVS_LIKELY(p)) {
009e0033
IM
7044 struct dp_packet *packet;
7045 struct dp_packet_batch out;
347ba9bb 7046
7d7ded7a 7047 if (!should_steal) {
009e0033
IM
7048 dp_packet_batch_clone(&out, packets_);
7049 dp_packet_batch_reset_cutlen(packets_);
7050 packets_ = &out;
324c8374 7051 }
009e0033 7052 dp_packet_batch_apply_cutlen(packets_);
347ba9bb 7053
009e0033
IM
7054#ifdef DPDK_NETDEV
7055 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7056 && packets_->packets[0]->source
7057 != p->output_pkts.packets[0]->source)) {
7058 /* XXX: netdev-dpdk assumes that all packets in a single
7059 * output batch has the same source. Flush here to
7060 * avoid memory access issues. */
7061 dp_netdev_pmd_flush_output_on_port(pmd, p);
7062 }
7063#endif
c71ea3c4
IM
7064 if (dp_packet_batch_size(&p->output_pkts)
7065 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7066 /* Flush here to avoid overflow. */
009e0033
IM
7067 dp_netdev_pmd_flush_output_on_port(pmd, p);
7068 }
c71ea3c4
IM
7069
7070 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7071 pmd->n_output_batches++;
7072 }
7073
e883448e 7074 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
58ed6df0
IM
7075 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7076 pmd->ctx.last_rxq;
009e0033
IM
7077 dp_packet_batch_add(&p->output_pkts, packet);
7078 }
ac8c2081 7079 return;
8a4e3a85 7080 }
09f9da0b
JR
7081 break;
7082
a36de779 7083 case OVS_ACTION_ATTR_TUNNEL_PUSH:
47e1b3b6
IM
7084 if (should_steal) {
7085 /* We're requested to push tunnel header, but also we need to take
7086 * the ownership of these packets. Thus, we can avoid performing
7087 * the action, because the caller will not use the result anyway.
7088 * Just break to free the batch. */
7089 break;
a36de779 7090 }
47e1b3b6
IM
7091 dp_packet_batch_apply_cutlen(packets_);
7092 push_tnl_action(pmd, a, packets_);
7093 return;
a36de779
PS
7094
7095 case OVS_ACTION_ATTR_TUNNEL_POP:
7096 if (*depth < MAX_RECIRC_DEPTH) {
aaca4fe0 7097 struct dp_packet_batch *orig_packets_ = packets_;
8611f9a4 7098 odp_port_t portno = nl_attr_get_odp_port(a);
a36de779 7099
57eebbb4 7100 p = pmd_tnl_port_cache_lookup(pmd, portno);
a36de779 7101 if (p) {
1895cc8d 7102 struct dp_packet_batch tnl_pkt;
a36de779 7103
7d7ded7a 7104 if (!should_steal) {
aaca4fe0
WT
7105 dp_packet_batch_clone(&tnl_pkt, packets_);
7106 packets_ = &tnl_pkt;
7107 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
7108 }
7109
aaca4fe0
WT
7110 dp_packet_batch_apply_cutlen(packets_);
7111
324c8374 7112 netdev_pop_header(p->port->netdev, packets_);
72c84bc2 7113 if (dp_packet_batch_is_empty(packets_)) {
1c8f98d9
PS
7114 return;
7115 }
9235b479 7116
72c84bc2 7117 struct dp_packet *packet;
e883448e 7118 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7119 packet->md.in_port.odp_port = portno;
a36de779 7120 }
9235b479
PS
7121
7122 (*depth)++;
7123 dp_netdev_recirculate(pmd, packets_);
7124 (*depth)--;
a36de779
PS
7125 return;
7126 }
7127 }
7128 break;
7129
623540e4
EJ
7130 case OVS_ACTION_ATTR_USERSPACE:
7131 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
aaca4fe0 7132 struct dp_packet_batch *orig_packets_ = packets_;
623540e4 7133 const struct nlattr *userdata;
aaca4fe0 7134 struct dp_packet_batch usr_pkt;
623540e4
EJ
7135 struct ofpbuf actions;
7136 struct flow flow;
7af12bd7 7137 ovs_u128 ufid;
aaca4fe0 7138 bool clone = false;
4fc65926 7139
623540e4
EJ
7140 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7141 ofpbuf_init(&actions, 0);
8cbf4f47 7142
aaca4fe0 7143 if (packets_->trunc) {
7d7ded7a 7144 if (!should_steal) {
aaca4fe0
WT
7145 dp_packet_batch_clone(&usr_pkt, packets_);
7146 packets_ = &usr_pkt;
aaca4fe0
WT
7147 clone = true;
7148 dp_packet_batch_reset_cutlen(orig_packets_);
7149 }
7150
7151 dp_packet_batch_apply_cutlen(packets_);
7152 }
7153
72c84bc2 7154 struct dp_packet *packet;
e883448e 7155 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7156 flow_extract(packet, &flow);
7af12bd7 7157 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
7d7ded7a 7158 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
b010be17 7159 &ufid, &actions, userdata);
db73f716 7160 }
aaca4fe0
WT
7161
7162 if (clone) {
7163 dp_packet_delete_batch(packets_, true);
7164 }
7165
623540e4
EJ
7166 ofpbuf_uninit(&actions);
7167 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 7168
ac8c2081
DDP
7169 return;
7170 }
09f9da0b 7171 break;
572f732a 7172
adcf00ba
AZ
7173 case OVS_ACTION_ATTR_RECIRC:
7174 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 7175 struct dp_packet_batch recirc_pkts;
572f732a 7176
7d7ded7a 7177 if (!should_steal) {
1895cc8d
PS
7178 dp_packet_batch_clone(&recirc_pkts, packets_);
7179 packets_ = &recirc_pkts;
28e2fa02 7180 }
8cbf4f47 7181
72c84bc2 7182 struct dp_packet *packet;
e883448e 7183 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7184 packet->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 7185 }
28e2fa02
DDP
7186
7187 (*depth)++;
1895cc8d 7188 dp_netdev_recirculate(pmd, packets_);
adcf00ba
AZ
7189 (*depth)--;
7190
ac8c2081 7191 return;
adcf00ba 7192 }
ac8c2081
DDP
7193
7194 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 7195 break;
572f732a 7196
5cf3edb3
DDP
7197 case OVS_ACTION_ATTR_CT: {
7198 const struct nlattr *b;
a76a37ef 7199 bool force = false;
5cf3edb3
DDP
7200 bool commit = false;
7201 unsigned int left;
7202 uint16_t zone = 0;
7203 const char *helper = NULL;
7204 const uint32_t *setmark = NULL;
7205 const struct ovs_key_ct_labels *setlabel = NULL;
4cddb1f0
DB
7206 struct nat_action_info_t nat_action_info;
7207 struct nat_action_info_t *nat_action_info_ref = NULL;
7208 bool nat_config = false;
5cf3edb3
DDP
7209
7210 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7211 nl_attr_get_size(a)) {
7212 enum ovs_ct_attr sub_type = nl_attr_type(b);
7213
7214 switch(sub_type) {
b80e259f 7215 case OVS_CT_ATTR_FORCE_COMMIT:
a76a37ef
JR
7216 force = true;
7217 /* fall through. */
5cf3edb3
DDP
7218 case OVS_CT_ATTR_COMMIT:
7219 commit = true;
7220 break;
7221 case OVS_CT_ATTR_ZONE:
7222 zone = nl_attr_get_u16(b);
7223 break;
7224 case OVS_CT_ATTR_HELPER:
7225 helper = nl_attr_get_string(b);
7226 break;
7227 case OVS_CT_ATTR_MARK:
7228 setmark = nl_attr_get(b);
7229 break;
7230 case OVS_CT_ATTR_LABELS:
7231 setlabel = nl_attr_get(b);
7232 break;
8e83854c
JR
7233 case OVS_CT_ATTR_EVENTMASK:
7234 /* Silently ignored, as userspace datapath does not generate
7235 * netlink events. */
7236 break;
ebe62ec1
YHW
7237 case OVS_CT_ATTR_TIMEOUT:
7238 /* Userspace datapath does not support customized timeout
7239 * policy yet. */
7240 break;
4cddb1f0
DB
7241 case OVS_CT_ATTR_NAT: {
7242 const struct nlattr *b_nest;
7243 unsigned int left_nest;
7244 bool ip_min_specified = false;
7245 bool proto_num_min_specified = false;
7246 bool ip_max_specified = false;
7247 bool proto_num_max_specified = false;
7248 memset(&nat_action_info, 0, sizeof nat_action_info);
7249 nat_action_info_ref = &nat_action_info;
7250
7251 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7252 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7253
7254 switch (sub_type_nest) {
7255 case OVS_NAT_ATTR_SRC:
7256 case OVS_NAT_ATTR_DST:
7257 nat_config = true;
7258 nat_action_info.nat_action |=
7259 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7260 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7261 break;
7262 case OVS_NAT_ATTR_IP_MIN:
7263 memcpy(&nat_action_info.min_addr,
7264 nl_attr_get(b_nest),
7265 nl_attr_get_size(b_nest));
7266 ip_min_specified = true;
7267 break;
7268 case OVS_NAT_ATTR_IP_MAX:
7269 memcpy(&nat_action_info.max_addr,
7270 nl_attr_get(b_nest),
7271 nl_attr_get_size(b_nest));
7272 ip_max_specified = true;
7273 break;
7274 case OVS_NAT_ATTR_PROTO_MIN:
7275 nat_action_info.min_port =
7276 nl_attr_get_u16(b_nest);
7277 proto_num_min_specified = true;
7278 break;
7279 case OVS_NAT_ATTR_PROTO_MAX:
7280 nat_action_info.max_port =
7281 nl_attr_get_u16(b_nest);
7282 proto_num_max_specified = true;
7283 break;
7284 case OVS_NAT_ATTR_PERSISTENT:
7285 case OVS_NAT_ATTR_PROTO_HASH:
7286 case OVS_NAT_ATTR_PROTO_RANDOM:
7287 break;
7288 case OVS_NAT_ATTR_UNSPEC:
7289 case __OVS_NAT_ATTR_MAX:
7290 OVS_NOT_REACHED();
7291 }
7292 }
7293
7294 if (ip_min_specified && !ip_max_specified) {
7295 nat_action_info.max_addr = nat_action_info.min_addr;
7296 }
7297 if (proto_num_min_specified && !proto_num_max_specified) {
7298 nat_action_info.max_port = nat_action_info.min_port;
7299 }
7300 if (proto_num_min_specified || proto_num_max_specified) {
7301 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7302 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7303 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7304 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7305 }
7306 }
7307 break;
7308 }
5cf3edb3
DDP
7309 case OVS_CT_ATTR_UNSPEC:
7310 case __OVS_CT_ATTR_MAX:
7311 OVS_NOT_REACHED();
7312 }
7313 }
7314
4cddb1f0
DB
7315 /* We won't be able to function properly in this case, hence
7316 * complain loudly. */
7317 if (nat_config && !commit) {
7318 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7319 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7320 }
7321
57593fd2 7322 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
bd7d93f8 7323 commit, zone, setmark, setlabel, aux->flow->tp_src,
b010be17 7324 aux->flow->tp_dst, helper, nat_action_info_ref,
05f9e707 7325 pmd->ctx.now / 1000);
07659514 7326 break;
5cf3edb3 7327 }
07659514 7328
5dddf960 7329 case OVS_ACTION_ATTR_METER:
4b27db64 7330 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
b010be17 7331 pmd->ctx.now);
4b27db64
JR
7332 break;
7333
09f9da0b
JR
7334 case OVS_ACTION_ATTR_PUSH_VLAN:
7335 case OVS_ACTION_ATTR_POP_VLAN:
7336 case OVS_ACTION_ATTR_PUSH_MPLS:
7337 case OVS_ACTION_ATTR_POP_MPLS:
7338 case OVS_ACTION_ATTR_SET:
6d670e7f 7339 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 7340 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 7341 case OVS_ACTION_ATTR_HASH:
09f9da0b 7342 case OVS_ACTION_ATTR_UNSPEC:
aaca4fe0 7343 case OVS_ACTION_ATTR_TRUNC:
6fcecb85
YY
7344 case OVS_ACTION_ATTR_PUSH_ETH:
7345 case OVS_ACTION_ATTR_POP_ETH:
535e3acf 7346 case OVS_ACTION_ATTR_CLONE:
f59cb331
YY
7347 case OVS_ACTION_ATTR_PUSH_NSH:
7348 case OVS_ACTION_ATTR_POP_NSH:
1fe178d2 7349 case OVS_ACTION_ATTR_CT_CLEAR:
5b34f8fc 7350 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
09f9da0b
JR
7351 case __OVS_ACTION_ATTR_MAX:
7352 OVS_NOT_REACHED();
da546e07 7353 }
ac8c2081 7354
7d7ded7a 7355 dp_packet_delete_batch(packets_, should_steal);
98403001
BP
7356}
7357
4edb9ae9 7358static void
65f13b50 7359dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 7360 struct dp_packet_batch *packets,
7d7ded7a 7361 bool should_steal, const struct flow *flow,
b010be17 7362 const struct nlattr *actions, size_t actions_len)
72865317 7363{
b010be17 7364 struct dp_netdev_execute_aux aux = { pmd, flow };
9080a111 7365
7d7ded7a 7366 odp_execute_actions(&aux, packets, should_steal, actions,
8cbf4f47 7367 actions_len, dp_execute_cb);
72865317
BP
7368}
7369
4d4e68ed
DDP
7370struct dp_netdev_ct_dump {
7371 struct ct_dpif_dump_state up;
7372 struct conntrack_dump dump;
7373 struct conntrack *ct;
7374 struct dp_netdev *dp;
7375};
7376
7377static int
7378dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
ded30c74 7379 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
7380{
7381 struct dp_netdev *dp = get_dp_netdev(dpif);
7382 struct dp_netdev_ct_dump *dump;
7383
7384 dump = xzalloc(sizeof *dump);
7385 dump->dp = dp;
57593fd2 7386 dump->ct = dp->conntrack;
4d4e68ed 7387
57593fd2 7388 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
4d4e68ed
DDP
7389
7390 *dump_ = &dump->up;
7391
7392 return 0;
7393}
7394
7395static int
7396dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7397 struct ct_dpif_dump_state *dump_,
7398 struct ct_dpif_entry *entry)
7399{
7400 struct dp_netdev_ct_dump *dump;
7401
7402 INIT_CONTAINER(dump, dump_, up);
7403
7404 return conntrack_dump_next(&dump->dump, entry);
7405}
7406
7407static int
7408dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7409 struct ct_dpif_dump_state *dump_)
7410{
7411 struct dp_netdev_ct_dump *dump;
7412 int err;
7413
7414 INIT_CONTAINER(dump, dump_, up);
7415
7416 err = conntrack_dump_done(&dump->dump);
7417
7418 free(dump);
7419
7420 return err;
7421}
7422
5d9cbb4c 7423static int
817a7657
YHW
7424dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7425 const struct ct_dpif_tuple *tuple)
5d9cbb4c
DDP
7426{
7427 struct dp_netdev *dp = get_dp_netdev(dpif);
7428
817a7657 7429 if (tuple) {
57593fd2 7430 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
817a7657 7431 }
57593fd2 7432 return conntrack_flush(dp->conntrack, zone);
5d9cbb4c
DDP
7433}
7434
c92339ad
DB
7435static int
7436dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7437{
7438 struct dp_netdev *dp = get_dp_netdev(dpif);
7439
57593fd2 7440 return conntrack_set_maxconns(dp->conntrack, maxconns);
c92339ad
DB
7441}
7442
7443static int
7444dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7445{
7446 struct dp_netdev *dp = get_dp_netdev(dpif);
7447
57593fd2 7448 return conntrack_get_maxconns(dp->conntrack, maxconns);
c92339ad
DB
7449}
7450
875075b3
DB
7451static int
7452dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7453{
7454 struct dp_netdev *dp = get_dp_netdev(dpif);
7455
57593fd2 7456 return conntrack_get_nconns(dp->conntrack, nconns);
875075b3
DB
7457}
7458
64207120
DB
7459static int
7460dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
7461{
7462 struct dp_netdev *dp = get_dp_netdev(dpif);
7463
7464 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
7465}
7466
7467static int
7468dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
7469{
7470 struct dp_netdev *dp = get_dp_netdev(dpif);
7471 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
7472 return 0;
7473}
7474
4ea96698
DB
7475static int
7476dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
7477{
7478 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7479 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
4ea96698
DB
7480}
7481
7482static int
7483dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
7484{
7485 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7486 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
4ea96698
DB
7487}
7488
7489static int
7490dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
7491{
7492 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7493 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
4ea96698
DB
7494}
7495
7496/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
7497 * diverge. */
7498static int
7499dpif_netdev_ipf_get_status(struct dpif *dpif,
7500 struct dpif_ipf_status *dpif_ipf_status)
7501{
7502 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7503 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
4ea96698
DB
7504 (struct ipf_status *) dpif_ipf_status);
7505 return 0;
7506}
7507
7508static int
7509dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
7510 struct ipf_dump_ctx **ipf_dump_ctx)
7511{
7512 return ipf_dump_start(ipf_dump_ctx);
7513}
7514
7515static int
7516dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
7517{
7518 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7519 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
4ea96698
DB
7520 dump);
7521}
7522
7523static int
7524dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
7525{
7526 return ipf_dump_done(ipf_dump_ctx);
7527
7528}
7529
72865317 7530const struct dpif_class dpif_netdev_class = {
72865317 7531 "netdev",
f87c1357 7532 true, /* cleanup_required */
6553d06b 7533 dpif_netdev_init,
2197d7ab 7534 dpif_netdev_enumerate,
0aeaabc8 7535 dpif_netdev_port_open_type,
72865317
BP
7536 dpif_netdev_open,
7537 dpif_netdev_close,
7dab847a 7538 dpif_netdev_destroy,
e4cfed38
PS
7539 dpif_netdev_run,
7540 dpif_netdev_wait,
72865317 7541 dpif_netdev_get_stats,
72865317
BP
7542 dpif_netdev_port_add,
7543 dpif_netdev_port_del,
3eb67853 7544 dpif_netdev_port_set_config,
72865317
BP
7545 dpif_netdev_port_query_by_number,
7546 dpif_netdev_port_query_by_name,
98403001 7547 NULL, /* port_get_pid */
b0ec0f27
BP
7548 dpif_netdev_port_dump_start,
7549 dpif_netdev_port_dump_next,
7550 dpif_netdev_port_dump_done,
72865317
BP
7551 dpif_netdev_port_poll,
7552 dpif_netdev_port_poll_wait,
72865317 7553 dpif_netdev_flow_flush,
ac64794a
BP
7554 dpif_netdev_flow_dump_create,
7555 dpif_netdev_flow_dump_destroy,
7556 dpif_netdev_flow_dump_thread_create,
7557 dpif_netdev_flow_dump_thread_destroy,
704a1e09 7558 dpif_netdev_flow_dump_next,
1a0c894a 7559 dpif_netdev_operate,
6b31e073
RW
7560 NULL, /* recv_set */
7561 NULL, /* handlers_set */
d4f6865c 7562 dpif_netdev_set_config,
5bf93d67 7563 dpif_netdev_queue_to_priority,
6b31e073
RW
7564 NULL, /* recv */
7565 NULL, /* recv_wait */
7566 NULL, /* recv_purge */
e4e74c3a 7567 dpif_netdev_register_dp_purge_cb,
6b31e073
RW
7568 dpif_netdev_register_upcall_cb,
7569 dpif_netdev_enable_upcall,
7570 dpif_netdev_disable_upcall,
b5cbbcf6 7571 dpif_netdev_get_datapath_version,
4d4e68ed
DDP
7572 dpif_netdev_ct_dump_start,
7573 dpif_netdev_ct_dump_next,
7574 dpif_netdev_ct_dump_done,
5d9cbb4c 7575 dpif_netdev_ct_flush,
c92339ad
DB
7576 dpif_netdev_ct_set_maxconns,
7577 dpif_netdev_ct_get_maxconns,
875075b3 7578 dpif_netdev_ct_get_nconns,
64207120
DB
7579 dpif_netdev_ct_set_tcp_seq_chk,
7580 dpif_netdev_ct_get_tcp_seq_chk,
cd015a11
YHW
7581 NULL, /* ct_set_limits */
7582 NULL, /* ct_get_limits */
7583 NULL, /* ct_del_limits */
1f161318
YHW
7584 NULL, /* ct_set_timeout_policy */
7585 NULL, /* ct_get_timeout_policy */
7586 NULL, /* ct_del_timeout_policy */
7587 NULL, /* ct_timeout_policy_dump_start */
7588 NULL, /* ct_timeout_policy_dump_next */
7589 NULL, /* ct_timeout_policy_dump_done */
187bb41f 7590 NULL, /* ct_get_timeout_policy_name */
4ea96698
DB
7591 dpif_netdev_ipf_set_enabled,
7592 dpif_netdev_ipf_set_min_frag,
7593 dpif_netdev_ipf_set_max_nfrags,
7594 dpif_netdev_ipf_get_status,
7595 dpif_netdev_ipf_dump_start,
7596 dpif_netdev_ipf_dump_next,
7597 dpif_netdev_ipf_dump_done,
5dddf960
JR
7598 dpif_netdev_meter_get_features,
7599 dpif_netdev_meter_set,
7600 dpif_netdev_meter_get,
7601 dpif_netdev_meter_del,
72865317 7602};
614c4892 7603
74cc3969
BP
7604static void
7605dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
7606 const char *argv[], void *aux OVS_UNUSED)
7607{
e9985d6a 7608 struct dp_netdev_port *port;
74cc3969 7609 struct dp_netdev *dp;
ff073a71 7610 odp_port_t port_no;
74cc3969 7611
8a4e3a85 7612 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
7613 dp = shash_find_data(&dp_netdevs, argv[1]);
7614 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 7615 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
7616 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
7617 return;
7618 }
8a4e3a85
BP
7619 ovs_refcount_ref(&dp->ref_cnt);
7620 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 7621
59e6d833 7622 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 7623 if (get_port_by_name(dp, argv[2], &port)) {
74cc3969 7624 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 7625 goto exit;
74cc3969
BP
7626 }
7627
ff073a71
BP
7628 port_no = u32_to_odp(atoi(argv[3]));
7629 if (!port_no || port_no == ODPP_NONE) {
74cc3969 7630 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 7631 goto exit;
74cc3969 7632 }
ff073a71 7633 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 7634 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 7635 goto exit;
74cc3969 7636 }
59e6d833 7637
e9985d6a
DDP
7638 /* Remove port. */
7639 hmap_remove(&dp->ports, &port->node);
e32971b8 7640 reconfigure_datapath(dp);
59e6d833 7641
e9985d6a
DDP
7642 /* Reinsert with new port number. */
7643 port->port_no = port_no;
7644 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
e32971b8 7645 reconfigure_datapath(dp);
59e6d833 7646
d33ed218 7647 seq_change(dp->port_seq);
74cc3969 7648 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
7649
7650exit:
59e6d833 7651 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 7652 dp_netdev_unref(dp);
74cc3969
BP
7653}
7654
0cbfe35d
BP
7655static void
7656dpif_dummy_register__(const char *type)
7657{
7658 struct dpif_class *class;
7659
7660 class = xmalloc(sizeof *class);
7661 *class = dpif_netdev_class;
7662 class->type = xstrdup(type);
7663 dp_register_provider(class);
7664}
7665
8420c7ad
BP
7666static void
7667dpif_dummy_override(const char *type)
7668{
65d43fdc
YT
7669 int error;
7670
7671 /*
7672 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
7673 * a userland-only build. It's useful for testsuite.
7674 */
7675 error = dp_unregister_provider(type);
7676 if (error == 0 || error == EAFNOSUPPORT) {
8420c7ad
BP
7677 dpif_dummy_register__(type);
7678 }
7679}
7680
614c4892 7681void
8420c7ad 7682dpif_dummy_register(enum dummy_level level)
614c4892 7683{
8420c7ad 7684 if (level == DUMMY_OVERRIDE_ALL) {
0cbfe35d
BP
7685 struct sset types;
7686 const char *type;
7687
7688 sset_init(&types);
7689 dp_enumerate_types(&types);
7690 SSET_FOR_EACH (type, &types) {
8420c7ad 7691 dpif_dummy_override(type);
0cbfe35d
BP
7692 }
7693 sset_destroy(&types);
8420c7ad
BP
7694 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7695 dpif_dummy_override("system");
614c4892 7696 }
0cbfe35d
BP
7697
7698 dpif_dummy_register__("dummy");
74cc3969
BP
7699
7700 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 7701 "dp port new-number",
74cc3969 7702 3, 3, dpif_dummy_change_port_number, NULL);
614c4892 7703}
0de8783a
JR
7704\f
7705/* Datapath Classifier. */
7706
0fcf0776
ZF
7707static void
7708dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
7709{
7710 cmap_destroy(&subtable->rules);
a0b36b39 7711 ovsrcu_postpone(free, subtable->mf_masks);
0fcf0776
ZF
7712 ovsrcu_postpone(free, subtable);
7713}
7714
0de8783a
JR
7715/* Initializes 'cls' as a classifier that initially contains no classification
7716 * rules. */
7717static void
7718dpcls_init(struct dpcls *cls)
7719{
7720 cmap_init(&cls->subtables_map);
da9cfca6 7721 pvector_init(&cls->subtables);
0de8783a
JR
7722}
7723
7724static void
7725dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7726{
3453b4d6 7727 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
da9cfca6 7728 pvector_remove(&cls->subtables, subtable);
0de8783a
JR
7729 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7730 subtable->mask.hash);
0fcf0776 7731 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
0de8783a
JR
7732}
7733
7734/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
7735 * caller's responsibility.
7736 * May only be called after all the readers have been terminated. */
7737static void
7738dpcls_destroy(struct dpcls *cls)
7739{
7740 if (cls) {
7741 struct dpcls_subtable *subtable;
7742
7743 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
361d808d 7744 ovs_assert(cmap_count(&subtable->rules) == 0);
0de8783a
JR
7745 dpcls_destroy_subtable(cls, subtable);
7746 }
7747 cmap_destroy(&cls->subtables_map);
da9cfca6 7748 pvector_destroy(&cls->subtables);
0de8783a
JR
7749 }
7750}
7751
7752static struct dpcls_subtable *
7753dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7754{
7755 struct dpcls_subtable *subtable;
7756
7757 /* Need to add one. */
caeb4906
JR
7758 subtable = xmalloc(sizeof *subtable
7759 - sizeof subtable->mask.mf + mask->len);
0de8783a 7760 cmap_init(&subtable->rules);
3453b4d6 7761 subtable->hit_cnt = 0;
0de8783a 7762 netdev_flow_key_clone(&subtable->mask, mask);
aadede3d 7763
a0b36b39
HH
7764 /* The count of bits in the mask defines the space required for masks.
7765 * Then call gen_masks() to create the appropriate masks, avoiding the cost
7766 * of doing runtime calculations. */
7767 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
7768 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
7769 subtable->mf_bits_set_unit0 = unit0;
7770 subtable->mf_bits_set_unit1 = unit1;
7771 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
7772 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
7773
f54d8f00
HH
7774 /* Probe for a specialized generic lookup function. */
7775 subtable->lookup_func = dpcls_subtable_generic_probe(unit0, unit1);
7776
7777 /* If not set, assign generic lookup. Generic works for any miniflow. */
7778 if (!subtable->lookup_func) {
7779 subtable->lookup_func = dpcls_subtable_lookup_generic;
7780 }
aadede3d 7781
0de8783a 7782 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
3453b4d6 7783 /* Add the new subtable at the end of the pvector (with no hits yet) */
da9cfca6 7784 pvector_insert(&cls->subtables, subtable, 0);
84dbfb2b 7785 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
3453b4d6 7786 cmap_count(&cls->subtables_map), subtable, cls->in_port);
da9cfca6 7787 pvector_publish(&cls->subtables);
0de8783a
JR
7788
7789 return subtable;
7790}
7791
7792static inline struct dpcls_subtable *
7793dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7794{
7795 struct dpcls_subtable *subtable;
7796
7797 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
7798 &cls->subtables_map) {
7799 if (netdev_flow_key_equal(&subtable->mask, mask)) {
7800 return subtable;
7801 }
7802 }
7803 return dpcls_create_subtable(cls, mask);
7804}
7805
3453b4d6
JS
7806
7807/* Periodically sort the dpcls subtable vectors according to hit counts */
7808static void
7809dpcls_sort_subtable_vector(struct dpcls *cls)
7810{
7811 struct pvector *pvec = &cls->subtables;
7812 struct dpcls_subtable *subtable;
7813
7814 PVECTOR_FOR_EACH (subtable, pvec) {
7815 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
7816 subtable->hit_cnt = 0;
7817 }
7818 pvector_publish(pvec);
7819}
7820
7821static inline void
4809891b
KT
7822dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
7823 struct polled_queue *poll_list, int poll_cnt)
3453b4d6
JS
7824{
7825 struct dpcls *cls;
5bf84282
NK
7826 uint64_t tot_idle = 0, tot_proc = 0;
7827 unsigned int pmd_load = 0;
3453b4d6 7828
b010be17 7829 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
2a2c67b4 7830 uint64_t curr_tsc;
5bf84282
NK
7831 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
7832 if (pmd_alb->is_enabled && !pmd->isolated
7833 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
7834 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
7835 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
7836 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
7837 {
7838 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
7839 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
7840 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
7841 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
7842
7843 if (tot_proc) {
7844 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
7845 }
7846
7847 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
7848 atomic_count_inc(&pmd->pmd_overloaded);
7849 } else {
7850 atomic_count_set(&pmd->pmd_overloaded, 0);
7851 }
7852 }
7853
7854 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
7855 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
7856 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
7857 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
7858
4809891b
KT
7859 /* Get the cycles that were used to process each queue and store. */
7860 for (unsigned i = 0; i < poll_cnt; i++) {
7861 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
7862 RXQ_CYCLES_PROC_CURR);
7863 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
7864 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
7865 0);
7866 }
2a2c67b4
KT
7867 curr_tsc = cycles_counter_update(&pmd->perf_stats);
7868 if (pmd->intrvl_tsc_prev) {
7869 /* There is a prev timestamp, store a new intrvl cycle count. */
7870 atomic_store_relaxed(&pmd->intrvl_cycles,
7871 curr_tsc - pmd->intrvl_tsc_prev);
7872 }
7873 pmd->intrvl_tsc_prev = curr_tsc;
4809891b 7874 /* Start new measuring interval */
b010be17 7875 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
4809891b
KT
7876 }
7877
b010be17 7878 if (pmd->ctx.now > pmd->next_optimization) {
3453b4d6
JS
7879 /* Try to obtain the flow lock to block out revalidator threads.
7880 * If not possible, just try next time. */
7881 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
7882 /* Optimize each classifier */
7883 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7884 dpcls_sort_subtable_vector(cls);
7885 }
7886 ovs_mutex_unlock(&pmd->flow_mutex);
7887 /* Start new measuring interval */
b010be17
IM
7888 pmd->next_optimization = pmd->ctx.now
7889 + DPCLS_OPTIMIZATION_INTERVAL;
3453b4d6
JS
7890 }
7891 }
7892}
7893
0de8783a
JR
7894/* Insert 'rule' into 'cls'. */
7895static void
7896dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
7897 const struct netdev_flow_key *mask)
7898{
7899 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
7900
3453b4d6 7901 /* Refer to subtable's mask, also for later removal. */
0de8783a
JR
7902 rule->mask = &subtable->mask;
7903 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
7904}
7905
7906/* Removes 'rule' from 'cls', also destructing the 'rule'. */
7907static void
7908dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
7909{
7910 struct dpcls_subtable *subtable;
7911
7912 ovs_assert(rule->mask);
7913
3453b4d6 7914 /* Get subtable from reference in rule->mask. */
0de8783a 7915 INIT_CONTAINER(subtable, rule->mask, mask);
0de8783a
JR
7916 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
7917 == 0) {
3453b4d6 7918 /* Delete empty subtable. */
0de8783a 7919 dpcls_destroy_subtable(cls, subtable);
da9cfca6 7920 pvector_publish(&cls->subtables);
0de8783a
JR
7921 }
7922}
7923
a0b36b39
HH
7924/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
7925static inline void
7926netdev_flow_key_gen_mask_unit(uint64_t iter,
7927 const uint64_t count,
7928 uint64_t *mf_masks)
7929{
7930 int i;
7931 for (i = 0; i < count; i++) {
7932 uint64_t lowest_bit = (iter & -iter);
7933 iter &= ~lowest_bit;
7934 mf_masks[i] = (lowest_bit - 1);
7935 }
7936 /* Checks that count has covered all bits in the iter bitmap. */
7937 ovs_assert(iter == 0);
7938}
7939
7940/* Generate a mask for each block in the miniflow, based on the bits set. This
7941 * allows easily masking packets with the generated array here, without
7942 * calculations. This replaces runtime-calculating the masks.
7943 * @param key The table to generate the mf_masks for
7944 * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
7945 * @param mf_bits_total Number of bits set in the whole miniflow (both units)
7946 * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
7947 */
7948void
7949netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
7950 uint64_t *mf_masks,
7951 const uint32_t mf_bits_u0,
7952 const uint32_t mf_bits_u1)
7953{
7954 uint64_t iter_u0 = tbl->mf.map.bits[0];
7955 uint64_t iter_u1 = tbl->mf.map.bits[1];
7956
7957 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
7958 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
7959}
7960
361d808d
JR
7961/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
7962 * in 'mask' the values in 'key' and 'target' are the same. */
f5ace7cd 7963bool
0de8783a
JR
7964dpcls_rule_matches_key(const struct dpcls_rule *rule,
7965 const struct netdev_flow_key *target)
7966{
09b0fa9c
JR
7967 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
7968 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5fcff47b 7969 uint64_t value;
0de8783a 7970
5fcff47b
JR
7971 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
7972 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
0de8783a
JR
7973 return false;
7974 }
7975 }
7976 return true;
7977}
7978
5b1c9c78
FA
7979/* For each miniflow in 'keys' performs a classifier lookup writing the result
7980 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
0de8783a
JR
7981 * NULL it is skipped.
7982 *
7983 * This function is optimized for use in the userspace datapath and therefore
7984 * does not implement a lot of features available in the standard
7985 * classifier_lookup() function. Specifically, it does not implement
7986 * priorities, instead returning any rule which matches the flow.
7987 *
5b1c9c78 7988 * Returns true if all miniflows found a corresponding rule. */
0de8783a 7989static bool
60d8ccae 7990dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
3453b4d6
JS
7991 struct dpcls_rule **rules, const size_t cnt,
7992 int *num_lookups_p)
0de8783a 7993{
5b1c9c78 7994 /* The received 'cnt' miniflows are the search-keys that will be processed
63906f18
BB
7995 * to find a matching entry into the available subtables.
7996 * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
aadede3d 7997#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
63906f18 7998 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
0de8783a 7999
0de8783a 8000 struct dpcls_subtable *subtable;
aadede3d 8001 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
63906f18
BB
8002
8003 if (cnt != MAP_BITS) {
8004 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
0de8783a
JR
8005 }
8006 memset(rules, 0, cnt * sizeof *rules);
8007
3453b4d6 8008 int lookups_match = 0, subtable_pos = 1;
aadede3d 8009 uint32_t found_map;
3453b4d6 8010
5b1c9c78
FA
8011 /* The Datapath classifier - aka dpcls - is composed of subtables.
8012 * Subtables are dynamically created as needed when new rules are inserted.
8013 * Each subtable collects rules with matches on a specific subset of packet
8014 * fields as defined by the subtable's mask. We proceed to process every
8015 * search-key against each subtable, but when a match is found for a
8016 * search-key, the search for that key can stop because the rules are
8017 * non-overlapping. */
da9cfca6 8018 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
aadede3d
HH
8019 /* Call the subtable specific lookup function. */
8020 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
63906f18 8021
aadede3d
HH
8022 /* Count the number of subtables searched for this packet match. This
8023 * estimates the "spread" of subtables looked at per matched packet. */
8024 uint32_t pkts_matched = count_1bits(found_map);
8025 lookups_match += pkts_matched * subtable_pos;
63906f18 8026
aadede3d
HH
8027 /* Clear the found rules, and return early if all packets are found. */
8028 keys_map &= ~found_map;
63906f18 8029 if (!keys_map) {
3453b4d6
JS
8030 if (num_lookups_p) {
8031 *num_lookups_p = lookups_match;
8032 }
aadede3d 8033 return true;
0de8783a 8034 }
3453b4d6
JS
8035 subtable_pos++;
8036 }
aadede3d 8037
3453b4d6
JS
8038 if (num_lookups_p) {
8039 *num_lookups_p = lookups_match;
0de8783a 8040 }
aadede3d 8041 return false;
0de8783a 8042}