/*
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "dpif-netdev.h"
#include "dpif-netdev-private.h"

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <net/if.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>

#include "bitmap.h"
#include "cmap.h"
#include "conntrack.h"
#include "conntrack-tp.h"
#include "coverage.h"
#include "ct-dpif.h"
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
#include "dpif-netdev-perf.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
#include "flow.h"
#include "hmapx.h"
#include "id-pool.h"
#include "ipf.h"
#include "netdev.h"
#include "netdev-offload.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-parse.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/ofpbuf.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "openvswitch/poll-loop.h"
#include "pvector.h"
#include "random.h"
#include "seq.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "unixctl.h"
#include "util.h"
#include "uuid.h"

VLOG_DEFINE_THIS_MODULE(dpif_netdev);

/* Auto Load Balancing Defaults */
#define ALB_ACCEPTABLE_IMPROVEMENT 25
#define ALB_PMD_LOAD_THRESHOLD 95
#define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
#define MIN_TO_MSEC 60000

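/* One possible reading of how the defaults above interact (the auto load
 * balance logic itself lives further down in this file): the balancer is
 * polled roughly every ALB_PMD_REBALANCE_POLL_INTERVAL minute(s), with
 * MIN_TO_MSEC converting that interval to milliseconds; a PMD is considered
 * overloaded when its busy cycles reach ALB_PMD_LOAD_THRESHOLD (95%) of an
 * interval; and a dry-run reassignment is only applied if the estimated
 * improvement in load distribution is at least ALB_ACCEPTABLE_IMPROVEMENT
 * (25%). */
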
#define FLOW_DUMP_MAX_BATCH 50
/* Use per thread recirc_depth to prevent recirculation loop. */
#define MAX_RECIRC_DEPTH 6
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)

/* Use instant packet send by default. */
#define DEFAULT_TX_FLUSH_INTERVAL 0

/* Configuration parameters. */
enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
enum { N_METER_LOCKS = 64 };    /* Number of locks for the meters. */

COVERAGE_DEFINE(datapath_drop_meter);
COVERAGE_DEFINE(datapath_drop_upcall_error);
COVERAGE_DEFINE(datapath_drop_lock_error);
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
COVERAGE_DEFINE(datapath_drop_recirc_error);
COVERAGE_DEFINE(datapath_drop_invalid_port);
COVERAGE_DEFINE(datapath_drop_invalid_bond);
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);

/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dp_netdev's. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
    = SHASH_INITIALIZER(&dp_netdevs);

static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);

#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
                                     | CS_SRC_NAT | CS_DST_NAT)
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)

static struct odp_support dp_netdev_support = {
    .max_vlan_headers = SIZE_MAX,
    .max_mpls_depth = SIZE_MAX,
    .recirc = true,
    .ct_state = true,
    .ct_zone = true,
    .ct_mark = true,
    .ct_label = true,
    .ct_state_nat = true,
    .ct_orig_tuple = true,
    .ct_orig_tuple6 = true,
};

/* EMC cache and SMC cache compose the datapath flow cache (DFC)
 *
 * Exact match cache for frequently used flows
 *
 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 * search its entries for a miniflow that matches exactly the miniflow of the
 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
 *
 * A cache entry holds a reference to its 'dp_netdev_flow'.
 *
 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 * value is the index of a cache entry where the miniflow could be.
 *
 *
 * Signature match cache (SMC)
 *
 * This cache stores a 16-bit signature for each flow without storing keys, and
 * stores the corresponding 16-bit flow_table index to the 'dp_netdev_flow'.
 * Each flow thus occupies 32 bits, which is much more memory efficient than
 * the EMC.  SMC uses a set-associative design in which each bucket contains
 * SMC_ENTRY_PER_BUCKET entries.
 * Since a 16-bit flow_table index is used, if there are more than 2^16
 * dp_netdev_flows, the SMC cannot index the excess flows and will miss them.
 *
 *
 * Thread-safety
 * =============
 *
 * Each pmd_thread has its own private exact match cache.
 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 */

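/* Note on lookup order (a summary of how the caches above are consulted by
 * the fastpath code later in this file): a packet's miniflow is looked up
 * first in the per-PMD EMC, then, if SMC is enabled, in the SMC, and only on
 * a miss in both is the per-port dpcls (megaflow classifier) consulted, with
 * an upcall as the final fallback. */
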
#define EM_FLOW_HASH_SHIFT 13
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2

/* SMC uses a set-associative design. A bucket contains a set of entries that
 * a flow item can occupy. For now, it uses one hash function rather than two
 * as for the EMC design. */
#define SMC_ENTRY_PER_BUCKET 4
#define SMC_ENTRIES (1u << 20)
#define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
#define SMC_MASK (SMC_BUCKET_CNT - 1)

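/* Sizing that follows from the constants above (for reference): the EMC has
 * 1 << 13 = 8192 entries and each 32-bit hash yields two candidate slots
 * (EM_FLOW_HASH_SEGS), while the SMC has (1 << 20) / 4 = 262144 buckets of
 * SMC_ENTRY_PER_BUCKET entries each, so SMC_MASK is 0x3ffff and every SMC
 * entry costs only 4 bytes (a 16-bit signature plus a 16-bit flow_table
 * index). */
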
/* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
#define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
                                    DEFAULT_EM_FLOW_INSERT_INV_PROB)

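/* Sketch of how the probabilistic insertion threshold is meant to be used
 * (the actual insertion code appears later in this file; this is
 * illustrative only): an EMC insertion is attempted when a per-packet 32-bit
 * random value is no larger than the configured minimum, e.g.
 *
 *     if (random_uint32() <= pmd->ctx.emc_insert_min) {
 *         ... insert into the EMC ...
 *     }
 *
 * With the default DEFAULT_EM_FLOW_INSERT_INV_PROB of 100 this gives roughly
 * a 1-in-100 chance per packet, which throttles EMC thrashing when there are
 * many short-lived flows. */
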
struct emc_entry {
    struct dp_netdev_flow *flow;
    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
};

struct emc_cache {
    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
    int sweep_idx;                /* For emc_cache_slow_sweep(). */
};

struct smc_bucket {
    uint16_t sig[SMC_ENTRY_PER_BUCKET];
    uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
};

/* Signature match cache; distinct from the EMC cache. */
struct smc_cache {
    struct smc_bucket buckets[SMC_BUCKET_CNT];
};

struct dfc_cache {
    struct emc_cache emc_cache;
    struct smc_cache smc_cache;
};

/* Iterate in the exact match cache through every entry that might contain a
 * miniflow with hash 'HASH'. */
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
         i__ < EM_FLOW_HASH_SEGS;                                            \
         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
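
/* Usage sketch for the macro above (illustrative only; the real lookup code
 * appears later in this file):
 *
 *     struct emc_entry *current_entry;
 *
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, key->hash) {
 *         if (current_entry->key.hash == key->hash
 *             && emc_entry_alive(current_entry)) {
 *             ... compare the stored miniflow against the packet's miniflow
 *             ... and return current_entry->flow on a full match.
 *         }
 *     }
 */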

/* Simple non-wildcarding single-priority classifier. */

/* Time in microseconds between successive optimizations of the dpcls
 * subtable vector */
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL

/* Time in microseconds of the interval in which rxq processing cycles used
 * in rxq to pmd assignments is measured and stored. */
#define PMD_RXQ_INTERVAL_LEN 10000000LL

/* Number of intervals for which cycles are stored
 * and used during rxq to pmd assignment. */
#define PMD_RXQ_INTERVAL_MAX 6

struct dpcls {
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
    odp_port_t in_port;
    struct cmap subtables_map;
    struct pvector subtables;
};

/* Data structure to keep packet order till fastpath processing. */
struct dp_packet_flow_map {
    struct dp_packet *packet;
    struct dp_netdev_flow *flow;
    uint16_t tcp_flags;
};

static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_sort_subtable_vector(struct dpcls *);
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                         const struct netdev_flow_key *mask);
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
static bool dpcls_lookup(struct dpcls *cls,
                         const struct netdev_flow_key *keys[],
                         struct dpcls_rule **rules, size_t cnt,
                         int *num_lookups_p);

/* Set of supported meter flags */
#define DP_SUPPORTED_METER_FLAGS_MASK \
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)

/* Set of supported meter band types */
#define DP_SUPPORTED_METER_BAND_TYPES \
    ( 1 << OFPMBT13_DROP )

struct dp_meter_band {
    struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
    uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
    uint64_t packet_count;
    uint64_t byte_count;
};

struct dp_meter {
    uint16_t flags;
    uint16_t n_bands;
    uint32_t max_delta_t;
    uint64_t used;
    uint64_t packet_count;
    uint64_t byte_count;
    struct dp_meter_band bands[];
};

struct pmd_auto_lb {
    bool auto_lb_requested;     /* Auto load balancing requested by user. */
    bool is_enabled;            /* Current status of Auto load balancing. */
    uint64_t rebalance_intvl;
    uint64_t rebalance_poll_timer;
};

/* Datapath based on the network device interface from netdev.h.
 *
 *
 * Thread-safety
 * =============
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 *
 * Acquisition order is, from outermost to innermost:
 *
 *    dp_netdev_mutex (global)
 *    port_mutex
 *    bond_mutex
 *    non_pmd_mutex
 */
struct dp_netdev {
    const struct dpif_class *const class;
    const char *const name;
    struct ovs_refcount ref_cnt;
    atomic_flag destroyed;

    /* Ports.
     *
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
     * through 'ports' requires taking 'port_mutex'. */
    struct ovs_mutex port_mutex;
    struct hmap ports;
    struct seq *port_seq;       /* Incremented whenever a port changes. */

    /* The time that a packet can wait in output batch for sending. */
    atomic_uint32_t tx_flush_interval;

    /* Meters. */
    struct ovs_mutex meter_locks[N_METER_LOCKS];
    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */

    /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
    /* Enable collection of PMD performance metrics. */
    atomic_bool pmd_perf_metrics;
    /* Enable the SMC cache from ovsdb config */
    atomic_bool smc_enable_db;

    /* Protects access to ofproto-dpif-upcall interface during revalidator
     * thread synchronization. */
    struct fat_rwlock upcall_rwlock;
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
    void *upcall_aux;

    /* Callback function for notifying the purging of dp flows (during
     * resetting pmd deletion). */
    dp_purge_callback *dp_purge_cb;
    void *dp_purge_aux;

    /* Stores all 'struct dp_netdev_pmd_thread's. */
    struct cmap poll_threads;
    /* id pool for per thread static_tx_qid. */
    struct id_pool *tx_qid_pool;
    struct ovs_mutex tx_qid_pool_mutex;
    /* Use measured cycles for rxq to pmd assignment. */
    bool pmd_rxq_assign_cyc;

    /* Protects the access of the 'struct dp_netdev_pmd_thread'
     * instance for non-pmd thread. */
    struct ovs_mutex non_pmd_mutex;

    /* Each pmd thread will store its pointer to
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
    ovsthread_key_t per_pmd_key;

    struct seq *reconfigure_seq;
    uint64_t last_reconfigure_seq;

    /* Cpu mask for pin of pmd threads. */
    char *pmd_cmask;

    uint64_t last_tnl_conf_seq;

    struct conntrack *conntrack;
    struct pmd_auto_lb pmd_alb;

    /* Bonds. */
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
    struct cmap tx_bonds;        /* Contains 'struct tx_bond'. */
};

static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
}

static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
}

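/* Illustrative use of the helpers above (assumes a 'dp' and 'meter_id' in
 * scope): meters are striped over N_METER_LOCKS (64) mutexes keyed by
 * 'meter_id % N_METER_LOCKS', so e.g. meter ids 1 and 65 share a lock.
 *
 *     meter_lock(dp, meter_id);
 *     ... read or update dp->meters[meter_id] ...
 *     meter_unlock(dp, meter_id);
 */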

static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                    odp_port_t)
    OVS_REQUIRES(dp->port_mutex);

enum rxq_cycles_counter_type {
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
                                   processing packets during the current
                                   interval. */
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
                                   during rxq to pmd assignment. */
    RXQ_N_CYCLES
};

enum {
    DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
    DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
    DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
};

struct dp_flow_offload_item {
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev_flow *flow;
    int op;
    struct match match;
    struct nlattr *actions;
    size_t actions_len;

    struct ovs_list node;
};

struct dp_flow_offload {
    struct ovs_mutex mutex;
    struct ovs_list list;
    pthread_cond_t cond;
};

static struct dp_flow_offload dp_flow_offload = {
    .mutex = OVS_MUTEX_INITIALIZER,
    .list  = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
};

static struct ovsthread_once offload_thread_once
    = OVSTHREAD_ONCE_INITIALIZER;

#define XPS_TIMEOUT 500000LL    /* In microseconds. */

/* Contained by struct dp_netdev_port's 'rxqs' member. */
struct dp_netdev_rxq {
    struct dp_netdev_port *port;
    struct netdev_rxq *rx;
    unsigned core_id;                  /* Core to which this queue should be
                                          pinned. OVS_CORE_UNSPEC if the
                                          queue doesn't need to be pinned to a
                                          particular core. */
    unsigned intrvl_idx;               /* Write index for 'cycles_intrvl'. */
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
    bool is_vhost;                     /* Is rxq of a vhost port. */

    /* Counters of cycles spent successfully polling and processing pkts. */
    atomic_ullong cycles[RXQ_N_CYCLES];
    /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
       sum them to yield the cycles used for an rxq. */
    atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
};

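/* With the defaults above this amounts to keeping PMD_RXQ_INTERVAL_MAX (6)
 * history slots of PMD_RXQ_INTERVAL_LEN (10 s) each, i.e. roughly the last
 * minute of per-rxq processing cycles is summed when assigning rxqs to
 * PMDs. */
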
/* A port in a netdev-based datapath. */
struct dp_netdev_port {
    odp_port_t port_no;
    bool dynamic_txqs;          /* If true XPS will be used. */
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
    struct netdev *netdev;
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
    struct netdev_saved_flags *sf;
    struct dp_netdev_rxq *rxqs;
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
    struct ovs_mutex txq_used_mutex;
    bool emc_enabled;           /* If true EMC will be used. */
    char *type;                 /* Port type as requested by user. */
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
};

/* Contained by struct dp_netdev_flow's 'stats' member.  */
struct dp_netdev_flow_stats {
    atomic_llong used;             /* Last used time, in monotonic msecs. */
    atomic_ullong packet_count;    /* Number of packets matched. */
    atomic_ullong byte_count;      /* Number of bytes matched. */
    atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
};

/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 *
 *
 * Thread-safety
 * =============
 *
 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 *
 * Motivation
 * ----------
 *
 * The thread safety rules described here for "struct dp_netdev_flow" are
 * motivated by two goals:
 *
 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 *      reading bad data due to changes by some thread concurrently modifying
 *      those members.
 *
 *    - Prevent two threads making changes to members of a given "struct
 *      dp_netdev_flow" from interfering with each other.
 *
 *
 * Rules
 * -----
 *
 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 * grace period.  Code that needs to hold onto a flow for a while
 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 *
 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 * from modification.
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 */
struct dp_netdev_flow {
    const struct flow flow;      /* Unmasked flow that created this entry. */
    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
                                 /* 'flow_table'. */
    const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
    const ovs_u128 ufid;         /* Unique flow identifier. */
    const ovs_u128 mega_ufid;    /* Unique mega flow identifier. */
    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
                                 /* flow. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    bool dead;
    uint32_t mark;               /* Unique flow mark assigned to a flow */

    /* Statistics. */
    struct dp_netdev_flow_stats stats;

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;

    /* While processing a group of input packets, the datapath uses the next
     * member to store a pointer to the output batch for the flow.  It is
     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
    struct packet_batch_per_flow *batch;

    /* Packet classification. */
    char *dp_extra_info;         /* String to return in a flow dump/get. */
    struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
    /* 'cr' must be the last member. */
};

static void dp_netdev_flow_unref(struct dp_netdev_flow *);
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
                                         struct flow *, bool);

/* A set of datapath actions within a "struct dp_netdev_flow".
 *
 *
 * Thread-safety
 * =============
 *
 * A struct dp_netdev_actions 'actions' is protected with RCU. */
struct dp_netdev_actions {
    /* These members are immutable: they do not change during the struct's
     * lifetime. */
    unsigned int size;          /* Size of 'actions', in bytes. */
    struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
};

struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
                                                   size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
    const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);

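/* A hedged sketch of the reader side implied by the RCU note above: a
 * pointer obtained through dp_netdev_flow_get_actions() remains valid only
 * until the calling thread next quiesces, so it must be used without
 * quiescing in between:
 *
 *     const struct dp_netdev_actions *actions
 *         = dp_netdev_flow_get_actions(flow);
 *     ... use actions->actions / actions->size ...
 */
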
struct polled_queue {
    struct dp_netdev_rxq *rxq;
    odp_port_t port_no;
    bool emc_enabled;
    bool rxq_enabled;
    uint64_t change_seq;
};

/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
struct rxq_poll {
    struct dp_netdev_rxq *rxq;
    struct hmap_node node;
};

/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 * 'tnl_port_cache' or 'tx_ports'. */
struct tx_port {
    struct dp_netdev_port *port;
    int qid;
    long long last_used;
    struct hmap_node node;
    long long flush_time;
    struct dp_packet_batch output_pkts;
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
};

/* Contained by struct tx_bond 'slave_buckets'. */
struct slave_entry {
    odp_port_t slave_id;
    atomic_ullong n_packets;
    atomic_ullong n_bytes;
};

/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
struct tx_bond {
    struct cmap_node node;
    uint32_t bond_id;
    struct slave_entry slave_buckets[BOND_BUCKETS];
};

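/* Sketch of how these structures are consumed (the balance-tcp send path is
 * later in this file; this is a simplified reading): each packet's dp_hash
 * selects one of the BOND_BUCKETS slave_buckets of the 'tx_bond' looked up
 * by bond_id, the packet is sent on that bucket's 'slave_id' port, and
 * 'n_packets'/'n_bytes' are updated so bond rebalancing can redistribute
 * busy buckets. */
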
/* A set of properties for the current processing loop that is not directly
 * associated with the pmd thread itself, but with the packets being
 * processed or the short-term system configuration (for example, time).
 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
struct dp_netdev_pmd_thread_ctx {
    /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
    long long now;
    /* RX queue from which last packet was received. */
    struct dp_netdev_rxq *last_rxq;
    /* EMC insertion probability context for the current processing cycle. */
    uint32_t emc_insert_min;
};

/* PMD: Poll mode drivers.  PMD accesses devices via polling to eliminate
 * the performance overhead of interrupt processing.  Therefore netdev can
 * not implement rx-wait for these devices.  dpif-netdev needs to poll
 * these devices to check for recv buffer.  pmd-thread does polling for
 * devices assigned to itself.
 *
 * DPDK used PMD for accessing NIC.
 *
 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
 * I/O of all non-pmd threads.  There will be no actual thread created
 * for the instance.
 *
 * Each struct has its own flow cache and classifier per managed ingress port.
 * For packets received on an ingress port, a lookup is done in the
 * corresponding PMD thread's flow cache and, in case of a miss, in the
 * classifier for that port.  Packets are executed with the found actions in
 * either case.
 * */
struct dp_netdev_pmd_thread {
    struct dp_netdev *dp;
    struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
    struct cmap_node node;          /* In 'dp->poll_threads'. */

    /* Per thread exact-match cache.  Note, the instance for cpu core
     * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
     * need to be protected by 'non_pmd_mutex'.  Every other instance
     * will only be accessed by its own pmd thread. */
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;

    /* Flow-Table and classifiers
     *
     * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
     * changes to 'classifiers' must be made while still holding the
     * 'flow_mutex'.
     */
    struct ovs_mutex flow_mutex;
    struct cmap flow_table OVS_GUARDED; /* Flow table. */

    /* One classifier per in_port polled by the pmd */
    struct cmap classifiers;
    /* Periodically sort subtable vectors according to hit frequencies */
    long long int next_optimization;
    /* End of the next time interval for which processing cycles
       are stored for each polled rxq. */
    long long int rxq_next_cycle_store;

    /* Last interval timestamp. */
    uint64_t intrvl_tsc_prev;
    /* Last interval cycles. */
    atomic_ullong intrvl_cycles;

    /* Current context of the PMD thread. */
    struct dp_netdev_pmd_thread_ctx ctx;

    struct seq *reload_seq;
    uint64_t last_reload_seq;

    /* These are atomic variables used as synchronization and configuration
     * points for thread reload/exit.
     *
     * The 'reload' atomic is the main one and it's used as a memory
     * synchronization point for all other knobs and data.
     *
     * For a thread that requests PMD reload:
     *
     * * All changes that should be visible to the PMD thread must be made
     *   before setting the 'reload'.  These changes could use any memory
     *   ordering model including 'relaxed'.
     * * Setting the 'reload' atomic should occur in the same thread where
     *   all other PMD configuration options are updated.
     * * Setting the 'reload' atomic should be done with 'release' memory
     *   ordering model or stricter.  This will guarantee that all previous
     *   changes (including non-atomic and 'relaxed') will be visible to
     *   the PMD thread.
     * * To check that the reload is done, the thread should poll the 'reload'
     *   atomic to become 'false'.  Polling should be done with 'acquire'
     *   memory ordering model or stricter.  This ensures that the PMD thread
     *   completed the reload process.
     *
     * For the PMD thread:
     *
     * * The PMD thread should read the 'reload' atomic with 'acquire' memory
     *   ordering model or stricter.  This will guarantee that all changes
     *   made before setting the 'reload' in the requesting thread will be
     *   visible to the PMD thread.
     * * All other configuration data could be read with any memory
     *   ordering model (including non-atomic and 'relaxed') but *only after*
     *   reading the 'reload' atomic set to 'true'.
     * * When the PMD reload is done, the PMD should (optionally) set all the
     *   below knobs except the 'reload' to their default ('false') values and
     *   (mandatory), as the last step, set the 'reload' to 'false' using
     *   'release' memory ordering model or stricter.  This will inform the
     *   requesting thread that the PMD has completed a reload cycle.
     */
    atomic_bool reload;             /* Do we need to reload ports? */
    atomic_bool wait_for_reload;    /* Can we busy wait for the next reload? */
    atomic_bool reload_tx_qid;      /* Do we need to reload static_tx_qid? */
    atomic_bool exit;               /* For terminating the pmd thread. */

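    /* Illustrative sketch of the protocol described above (not the actual
     * reload code): the requesting thread publishes its configuration
     * changes and then does
     *
     *     atomic_store_explicit(&pmd->reload, true, memory_order_release);
     *
     * while the PMD thread polls with
     *
     *     atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
     *
     * and, once reloading is finished, stores 'false' back with release
     * ordering so the requester's acquire-ordered polling observes
     * completion. */
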
    pthread_t thread;
    unsigned core_id;               /* CPU core id of this pmd thread. */
    int numa_id;                    /* numa node id of this pmd thread. */
    bool isolated;

    /* Queue id used by this pmd thread to send packets on all netdevs if
     * XPS disabled for this netdev. All static_tx_qid's are unique and less
     * than 'cmap_count(dp->poll_threads)'. */
    uint32_t static_tx_qid;

    /* Number of filled output batches. */
    int n_output_batches;

    struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
    /* List of rx queues to poll. */
    struct hmap poll_list OVS_GUARDED;
    /* Map of 'tx_port's used for transmission.  Written by the main thread,
     * read by the pmd thread. */
    struct hmap tx_ports OVS_GUARDED;

    struct ovs_mutex bond_mutex;    /* Protects updates of 'tx_bonds'. */
    /* Map of 'tx_bond's used for transmission.  Written by the main thread
     * and read by the pmd thread. */
    struct cmap tx_bonds;

    /* These are thread-local copies of 'tx_ports'.  One contains only tunnel
     * ports (that support push_tunnel/pop_tunnel), the other contains ports
     * with at least one txq (that support send).  A port can be in both.
     *
     * There are two separate maps to make sure that we don't try to execute
     * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
     *
     * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
     * threads, and thusly need to be protected by 'non_pmd_mutex'.  Every
     * other instance will only be accessed by its own pmd thread. */
    struct hmap tnl_port_cache;
    struct hmap send_port_cache;

    /* Keep track of detailed PMD performance statistics. */
    struct pmd_perf_stats perf_stats;

    /* Stats from previous iteration used by automatic pmd
     * load balance logic. */
    uint64_t prev_stats[PMD_N_STATS];
    atomic_count pmd_overloaded;

    /* Set to true if the pmd thread needs to be reloaded. */
    bool need_reload;
};

/* Interface to netdev-based datapath. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;
    uint64_t last_port_seq;
};

static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                              struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
                            struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_free(struct dp_netdev *)
    OVS_REQUIRES(dp_netdev_mutex);
static int do_add_port(struct dp_netdev *dp, const char *devname,
                       const char *type, odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
    OVS_REQUIRES(dp->port_mutex);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                      struct dp_packet_batch *,
                                      bool should_steal,
                                      const struct flow *flow,
                                      const struct nlattr *actions,
                                      size_t actions_len);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                  struct dp_packet_batch *);

static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, unsigned core_id,
                                    int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);

static void *pmd_thread_main(void *);
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                      unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_del_pmd(struct dp_netdev *dp,
                              struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct dp_netdev_port *port)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                           struct tx_port *tx)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                     struct dp_netdev_rxq *rxq)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                       struct rxq_poll *poll)
    OVS_REQUIRES(pmd->port_mutex);
static int
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
                                   bool force);
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct tx_bond *bond, bool update)
    OVS_EXCLUDED(pmd->bond_mutex);
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                           uint32_t bond_id)
    OVS_EXCLUDED(pmd->bond_mutex);

static void reconfigure_datapath(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex);
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                           struct polled_queue *poll_list, int poll_cnt);
static void
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
                         enum rxq_cycles_counter_type type,
                         unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
                         enum rxq_cycles_counter_type type);
static void
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
                                unsigned long long cycles);
static uint64_t
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                      struct tx_port *tx);

static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);
static void smc_clear_entry(struct smc_bucket *b, int idx);

static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
static inline bool
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
                                  struct dp_netdev_flow *flow);

static void
emc_cache_init(struct emc_cache *flow_cache)
{
    int i;

    flow_cache->sweep_idx = 0;
    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        flow_cache->entries[i].flow = NULL;
        flow_cache->entries[i].key.hash = 0;
        flow_cache->entries[i].key.len = sizeof(struct miniflow);
        flowmap_init(&flow_cache->entries[i].key.mf.map);
    }
}

static void
smc_cache_init(struct smc_cache *smc_cache)
{
    int i, j;
    for (i = 0; i < SMC_BUCKET_CNT; i++) {
        for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
            smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
        }
    }
}

static void
dfc_cache_init(struct dfc_cache *flow_cache)
{
    emc_cache_init(&flow_cache->emc_cache);
    smc_cache_init(&flow_cache->smc_cache);
}

static void
emc_cache_uninit(struct emc_cache *flow_cache)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        emc_clear_entry(&flow_cache->entries[i]);
    }
}

static void
smc_cache_uninit(struct smc_cache *smc)
{
    int i, j;

    for (i = 0; i < SMC_BUCKET_CNT; i++) {
        for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
            smc_clear_entry(&(smc->buckets[i]), j);
        }
    }
}

static void
dfc_cache_uninit(struct dfc_cache *flow_cache)
{
    smc_cache_uninit(&flow_cache->smc_cache);
    emc_cache_uninit(&flow_cache->emc_cache);
}

/* Check and clear dead flow references slowly (one entry at each
 * invocation). */
static void
emc_cache_slow_sweep(struct emc_cache *flow_cache)
{
    struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];

    if (!emc_entry_alive(entry)) {
        emc_clear_entry(entry);
    }
    flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
}

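/* Given EM_FLOW_HASH_ENTRIES (1 << 13 = 8192) entries and one entry checked
 * per call, a full sweep of the EMC completes only after 8192 invocations of
 * emc_cache_slow_sweep(), which keeps the per-iteration cost negligible. */
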
/* Updates the time in the PMD thread's context and should be called in three
 * cases:
 *
 * 1. PMD structure initialization:
 *    - dp_netdev_configure_pmd()
 *
 * 2. Before processing of the new packet batch:
 *    - dpif_netdev_execute()
 *    - dp_netdev_process_rxq_port()
 *
 * 3. At least once per polling iteration in main polling threads if no
 *    packets received on current iteration:
 *    - dpif_netdev_run()
 *    - pmd_thread_main()
 *
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
 */
static inline void
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
{
    pmd->ctx.now = time_usec();
}

/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
bool
dpif_is_netdev(const struct dpif *dpif)
{
    return dpif->dpif_class->open == dpif_netdev_open;
}

static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
{
    ovs_assert(dpif_is_netdev(dpif));
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
}

static struct dp_netdev *
get_dp_netdev(const struct dpif *dpif)
{
    return dpif_netdev_cast(dpif)->dp;
}

enum pmd_info_type {
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
    PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
};

static void
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
                ? "main thread" : "pmd thread");
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
    }
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
        ds_put_format(reply, " core_id %u", pmd->core_id);
    }
    ds_put_cstr(reply, ":\n");
}

static void
pmd_info_show_stats(struct ds *reply,
                    struct dp_netdev_pmd_thread *pmd)
{
    uint64_t stats[PMD_N_STATS];
    uint64_t total_cycles, total_packets;
    double passes_per_pkt = 0;
    double lookups_per_hit = 0;
    double packets_per_batch = 0;

    pmd_perf_read_counters(&pmd->perf_stats, stats);
    total_cycles = stats[PMD_CYCLES_ITER_IDLE]
                   + stats[PMD_CYCLES_ITER_BUSY];
    total_packets = stats[PMD_STAT_RECV];

    format_pmd_thread(reply, pmd);

    if (total_packets > 0) {
        passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
                         / (double) total_packets;
    }
    if (stats[PMD_STAT_MASKED_HIT] > 0) {
        lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
                          / (double) stats[PMD_STAT_MASKED_HIT];
    }
    if (stats[PMD_STAT_SENT_BATCHES] > 0) {
        packets_per_batch = stats[PMD_STAT_SENT_PKTS]
                            / (double) stats[PMD_STAT_SENT_BATCHES];
    }

    ds_put_format(reply,
                  "  packets received: %"PRIu64"\n"
                  "  packet recirculations: %"PRIu64"\n"
                  "  avg. datapath passes per packet: %.02f\n"
                  "  emc hits: %"PRIu64"\n"
                  "  smc hits: %"PRIu64"\n"
                  "  megaflow hits: %"PRIu64"\n"
                  "  avg. subtable lookups per megaflow hit: %.02f\n"
                  "  miss with success upcall: %"PRIu64"\n"
                  "  miss with failed upcall: %"PRIu64"\n"
                  "  avg. packets per output batch: %.02f\n",
                  total_packets, stats[PMD_STAT_RECIRC],
                  passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
                  stats[PMD_STAT_SMC_HIT],
                  stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
                  stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
                  packets_per_batch);

    if (total_cycles == 0) {
        return;
    }

    ds_put_format(reply,
                  "  idle cycles: %"PRIu64" (%.02f%%)\n"
                  "  processing cycles: %"PRIu64" (%.02f%%)\n",
                  stats[PMD_CYCLES_ITER_IDLE],
                  stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
                  stats[PMD_CYCLES_ITER_BUSY],
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);

    if (total_packets == 0) {
        return;
    }

    ds_put_format(reply,
                  "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
                  total_cycles / (double) total_packets,
                  total_cycles, total_packets);

    ds_put_format(reply,
                  "  avg processing cycles per packet: "
                  "%.02f (%"PRIu64"/%"PRIu64")\n",
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
}

static void
pmd_info_show_perf(struct ds *reply,
                   struct dp_netdev_pmd_thread *pmd,
                   struct pmd_perf_params *par)
{
    if (pmd->core_id != NON_PMD_CORE_ID) {
        char *time_str =
            xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
        long long now = time_msec();
        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;

        ds_put_cstr(reply, "\n");
        ds_put_format(reply, "Time: %s\n", time_str);
        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
        ds_put_cstr(reply, "\n");
        format_pmd_thread(reply, pmd);
        ds_put_cstr(reply, "\n");
        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
        if (pmd_perf_metrics_enabled(pmd)) {
            /* Prevent parallel clearing of perf metrics. */
            ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
            if (par->histograms) {
                ds_put_cstr(reply, "\n");
                pmd_perf_format_histograms(reply, &pmd->perf_stats);
            }
            if (par->iter_hist_len > 0) {
                ds_put_cstr(reply, "\n");
                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
                                                  par->iter_hist_len);
            }
            if (par->ms_hist_len > 0) {
                ds_put_cstr(reply, "\n");
                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
                                           par->ms_hist_len);
            }
            ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
        }
        free(time_str);
    }
}

static int
compare_poll_list(const void *a_, const void *b_)
{
    const struct rxq_poll *a = a_;
    const struct rxq_poll *b = b_;

    const char *namea = netdev_rxq_get_name(a->rxq->rx);
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);

    int cmp = strcmp(namea, nameb);
    if (!cmp) {
        return netdev_rxq_get_queue_id(a->rxq->rx)
               - netdev_rxq_get_queue_id(b->rxq->rx);
    } else {
        return cmp;
    }
}

static void
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
                 size_t *n)
    OVS_REQUIRES(pmd->port_mutex)
{
    struct rxq_poll *ret, *poll;
    size_t i;

    *n = hmap_count(&pmd->poll_list);
    if (!*n) {
        ret = NULL;
    } else {
        ret = xcalloc(*n, sizeof *ret);
        i = 0;
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
            ret[i] = *poll;
            i++;
        }
        ovs_assert(i == *n);
        qsort(ret, *n, sizeof *ret, compare_poll_list);
    }

    *list = ret;
}

static void
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
    if (pmd->core_id != NON_PMD_CORE_ID) {
        struct rxq_poll *list;
        size_t n_rxq;
        uint64_t total_cycles = 0;

        ds_put_format(reply,
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
                                                  ? "true" : "false");

        ovs_mutex_lock(&pmd->port_mutex);
        sorted_poll_list(pmd, &list, &n_rxq);

        /* Get the total pmd cycles for an interval. */
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
        /* Estimate the cycles to cover all intervals. */
        total_cycles *= PMD_RXQ_INTERVAL_MAX;

        for (int i = 0; i < n_rxq; i++) {
            struct dp_netdev_rxq *rxq = list[i].rxq;
            const char *name = netdev_rxq_get_name(rxq->rx);
            uint64_t proc_cycles = 0;

            for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
            }
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
                                        ? "(enabled) " : "(disabled)");
            ds_put_format(reply, "  pmd usage: ");
            if (total_cycles) {
                ds_put_format(reply, "%2"PRIu64"",
                              proc_cycles * 100 / total_cycles);
                ds_put_cstr(reply, " %");
            } else {
                ds_put_format(reply, "%s", "NOT AVAIL");
            }
            ds_put_cstr(reply, "\n");
        }
        ovs_mutex_unlock(&pmd->port_mutex);
        free(list);
    }
}

static int
compare_poll_thread_list(const void *a_, const void *b_)
{
    const struct dp_netdev_pmd_thread *a, *b;

    a = *(struct dp_netdev_pmd_thread **)a_;
    b = *(struct dp_netdev_pmd_thread **)b_;

    if (a->core_id < b->core_id) {
        return -1;
    }
    if (a->core_id > b->core_id) {
        return 1;
    }
    return 0;
}

/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
 * this list, as long as we do not go to quiescent state. */
static void
sorted_poll_thread_list(struct dp_netdev *dp,
                        struct dp_netdev_pmd_thread ***list,
                        size_t *n)
{
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev_pmd_thread **pmd_list;
    size_t k = 0, n_pmds;

    n_pmds = cmap_count(&dp->poll_threads);
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (k >= n_pmds) {
            break;
        }
        pmd_list[k++] = pmd;
    }

    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);

    *list = pmd_list;
    *n = k;
}

static void
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
                          const char *argv[], void *aux OVS_UNUSED)
{
    struct ds reply = DS_EMPTY_INITIALIZER;
    struct dp_netdev *dp = NULL;

    ovs_mutex_lock(&dp_netdev_mutex);

    if (argc == 2) {
        dp = shash_find_data(&dp_netdevs, argv[1]);
    } else if (shash_count(&dp_netdevs) == 1) {
        /* There's only one datapath */
        dp = shash_first(&dp_netdevs)->data;
    }

    if (!dp) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn,
                                    "please specify an existing datapath");
        return;
    }

    dp_netdev_request_reconfigure(dp);
    ovs_mutex_unlock(&dp_netdev_mutex);
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

static void
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
                     void *aux)
{
    struct ds reply = DS_EMPTY_INITIALIZER;
    struct dp_netdev_pmd_thread **pmd_list;
    struct dp_netdev *dp = NULL;
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
    unsigned int core_id;
    bool filter_on_pmd = false;
    size_t n;

    ovs_mutex_lock(&dp_netdev_mutex);

    while (argc > 1) {
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
            if (str_to_uint(argv[2], 10, &core_id)) {
                filter_on_pmd = true;
            }
            argc -= 2;
            argv += 2;
        } else {
            dp = shash_find_data(&dp_netdevs, argv[1]);
            argc -= 1;
            argv += 1;
        }
    }

    if (!dp) {
        if (shash_count(&dp_netdevs) == 1) {
            /* There's only one datapath */
            dp = shash_first(&dp_netdevs)->data;
        } else {
            ovs_mutex_unlock(&dp_netdev_mutex);
            unixctl_command_reply_error(conn,
                                        "please specify an existing datapath");
            return;
        }
    }

    sorted_poll_thread_list(dp, &pmd_list, &n);
    for (size_t i = 0; i < n; i++) {
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
        if (!pmd) {
            break;
        }
        if (filter_on_pmd && pmd->core_id != core_id) {
            continue;
        }
        if (type == PMD_INFO_SHOW_RXQ) {
            pmd_info_show_rxq(&reply, pmd);
        } else if (type == PMD_INFO_CLEAR_STATS) {
            pmd_perf_stats_clear(&pmd->perf_stats);
        } else if (type == PMD_INFO_SHOW_STATS) {
            pmd_info_show_stats(&reply, pmd);
        } else if (type == PMD_INFO_PERF_SHOW) {
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
        }
    }
    free(pmd_list);

    ovs_mutex_unlock(&dp_netdev_mutex);

    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

static void
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
                  const char *argv[],
                  void *aux OVS_UNUSED)
{
    struct pmd_perf_params par;
    long int it_hist = 0, ms_hist = 0;
    par.histograms = true;

    while (argc > 1) {
        if (!strcmp(argv[1], "-nh")) {
            par.histograms = false;
            argc -= 1;
            argv += 1;
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
            it_hist = strtol(argv[2], NULL, 10);
            if (it_hist < 0) {
                it_hist = 0;
            } else if (it_hist > HISTORY_LEN) {
                it_hist = HISTORY_LEN;
            }
            argc -= 2;
            argv += 2;
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
            ms_hist = strtol(argv[2], NULL, 10);
            if (ms_hist < 0) {
                ms_hist = 0;
            } else if (ms_hist > HISTORY_LEN) {
                ms_hist = HISTORY_LEN;
            }
            argc -= 2;
            argv += 2;
        } else {
            break;
        }
    }
    par.iter_hist_len = it_hist;
    par.ms_hist_len = ms_hist;
    par.command_type = PMD_INFO_PERF_SHOW;
    dpif_netdev_pmd_info(conn, argc, argv, &par);
}

static void
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
                      const char *argv[], void *aux OVS_UNUSED)
{
    struct ds reply = DS_EMPTY_INITIALIZER;
    struct dp_netdev *dp = NULL;

    ovs_mutex_lock(&dp_netdev_mutex);
    if (argc == 2) {
        dp = shash_find_data(&dp_netdevs, argv[1]);
    } else if (shash_count(&dp_netdevs) == 1) {
        /* There's only one datapath. */
        dp = shash_first(&dp_netdevs)->data;
    }
    if (!dp) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn,
                                    "please specify an existing datapath");
        return;
    }

    if (cmap_count(&dp->tx_bonds) > 0) {
        struct tx_bond *dp_bond_entry;
        uint32_t slave_id;

        ds_put_cstr(&reply, "Bonds:\n");
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
                          dp_bond_entry->bond_id);
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
                slave_id =
                    odp_to_u32(dp_bond_entry->slave_buckets[bucket].slave_id);
                ds_put_format(&reply, "    bucket %d - slave %"PRIu32"\n",
                              bucket, slave_id);
            }
        }
    }
    ovs_mutex_unlock(&dp_netdev_mutex);
    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

static int
dpif_netdev_init(void)
{
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
                              clear_aux = PMD_INFO_CLEAR_STATS,
                              poll_aux = PMD_INFO_SHOW_RXQ;

    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
                             0, 3, dpif_netdev_pmd_info,
                             (void *)&show_aux);
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
                             0, 3, dpif_netdev_pmd_info,
                             (void *)&clear_aux);
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
                             0, 3, dpif_netdev_pmd_info,
                             (void *)&poll_aux);
    unixctl_command_register("dpif-netdev/pmd-perf-show",
                             "[-nh] [-it iter-history-len]"
                             " [-ms ms-history-len]"
                             " [-pmd core] [dp]",
                             0, 8, pmd_perf_show_cmd,
                             NULL);
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
                             0, 1, dpif_netdev_pmd_rebalance,
                             NULL);
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
                             "on|off [-b before] [-a after] [-e|-ne] "
                             "[-us usec] [-q qlen]",
                             0, 10, pmd_perf_log_set_cmd,
                             NULL);
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
                             0, 1, dpif_netdev_bond_show,
                             NULL);
    return 0;
}

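/* The commands registered above are reachable through ovs-appctl, e.g.:
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *     ovs-appctl dpif-netdev/bond-show
 */
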
2197d7ab 1511static int
2240af25
DDP
1512dpif_netdev_enumerate(struct sset *all_dps,
1513 const struct dpif_class *dpif_class)
2197d7ab
GL
1514{
1515 struct shash_node *node;
1516
97be1538 1517 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 1518 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
1519 struct dp_netdev *dp = node->data;
1520 if (dpif_class != dp->class) {
1521 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1522 * If the class doesn't match, skip this dpif. */
1523 continue;
1524 }
2197d7ab
GL
1525 sset_add(all_dps, node->name);
1526 }
97be1538 1527 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 1528
2197d7ab
GL
1529 return 0;
1530}
1531
add90f6f
EJ
1532static bool
1533dpif_netdev_class_is_dummy(const struct dpif_class *class)
1534{
1535 return class != &dpif_netdev_class;
1536}
1537
0aeaabc8
JP
1538static const char *
1539dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1540{
1541 return strcmp(type, "internal") ? type
e98d0cb3 1542 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
0aeaabc8
JP
1543 : "tap";
1544}
1545
72865317
BP
1546static struct dpif *
1547create_dpif_netdev(struct dp_netdev *dp)
1548{
462278db 1549 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 1550 struct dpif_netdev *dpif;
72865317 1551
6a8267c5 1552 ovs_refcount_ref(&dp->ref_cnt);
72865317 1553
72865317 1554 dpif = xmalloc(sizeof *dpif);
614c4892 1555 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 1556 dpif->dp = dp;
d33ed218 1557 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
1558
1559 return &dpif->dpif;
1560}
1561
4e022ec0
AW
1562/* Choose an unused, non-zero port number and return it on success.
1563 * Return ODPP_NONE on failure. */
1564static odp_port_t
e44768b7 1565choose_port(struct dp_netdev *dp, const char *name)
59e6d833 1566 OVS_REQUIRES(dp->port_mutex)
e44768b7 1567{
4e022ec0 1568 uint32_t port_no;
e44768b7
JP
1569
1570 if (dp->class != &dpif_netdev_class) {
1571 const char *p;
1572 int start_no = 0;
1573
1574 /* If the port name begins with "br", start the number search at
1575 * 100 to make writing tests easier. */
1576 if (!strncmp(name, "br", 2)) {
1577 start_no = 100;
1578 }
1579
1580 /* If the port name contains a number, try to assign that port number.
1581 * This can make writing unit tests easier because port numbers are
1582 * predictable. */
1583 for (p = name; *p != '\0'; p++) {
1584 if (isdigit((unsigned char) *p)) {
1585 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
1586 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1587 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 1588 return u32_to_odp(port_no);
e44768b7
JP
1589 }
1590 break;
1591 }
1592 }
1593 }
1594
ff073a71
BP
1595 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1596 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 1597 return u32_to_odp(port_no);
e44768b7
JP
1598 }
1599 }
1600
4e022ec0 1601 return ODPP_NONE;
e44768b7
JP
1602}
1603
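/* Illustrative sketch (editorial note, not part of the original source): for a
 * non-"netdev" datapath class such as the dummy one used in tests,
 * choose_port() above first tries a name-derived number, e.g.
 *
 *     choose_port(dp, "br7");    -> requests port 107 (100 + 7)
 *     choose_port(dp, "eth2");   -> requests port 2
 *
 * and only if that number is already taken (or the name contains no digits)
 * does it fall back to scanning for the lowest free port number. */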
72865317 1604static int
614c4892
BP
1605create_dp_netdev(const char *name, const struct dpif_class *class,
1606 struct dp_netdev **dpp)
8a4e3a85 1607 OVS_REQUIRES(dp_netdev_mutex)
72865317 1608{
1276e3db 1609 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
72865317
BP
1610 struct dp_netdev *dp;
1611 int error;
72865317 1612
1276e3db
IM
1613    /* Avoid estimating the TSC frequency for the dummy datapath so as not to
1614     * slow down unit tests. */
1615 if (!dpif_netdev_class_is_dummy(class)
1616 && ovsthread_once_start(&tsc_freq_check)) {
1617 pmd_perf_estimate_tsc_frequency();
1618 ovsthread_once_done(&tsc_freq_check);
1619 }
1620
462278db 1621 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
1622 shash_add(&dp_netdevs, name, dp);
1623
1624 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1625 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 1626 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 1627 atomic_flag_clear(&dp->destroyed);
8a4e3a85 1628
81e89d5c 1629 ovs_mutex_init_recursive(&dp->port_mutex);
e9985d6a 1630 hmap_init(&dp->ports);
d33ed218 1631 dp->port_seq = seq_create();
9df65060
VDA
1632 ovs_mutex_init(&dp->bond_mutex);
1633 cmap_init(&dp->tx_bonds);
1634
6b31e073
RW
1635 fat_rwlock_init(&dp->upcall_rwlock);
1636
a6a426d6
IM
1637 dp->reconfigure_seq = seq_create();
1638 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1639
4b27db64
JR
1640 for (int i = 0; i < N_METER_LOCKS; ++i) {
1641 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1642 }
1643
6b31e073
RW
1644 /* Disable upcalls by default. */
1645 dp_netdev_disable_upcall(dp);
623540e4 1646 dp->upcall_aux = NULL;
6b31e073 1647 dp->upcall_cb = NULL;
e44768b7 1648
57593fd2 1649 dp->conntrack = conntrack_init();
5cf3edb3 1650
4c30b246 1651 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
c71ea3c4 1652 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
4c30b246 1653
65f13b50 1654 cmap_init(&dp->poll_threads);
e77c97b9 1655 dp->pmd_rxq_assign_cyc = true;
140dd699
IM
1656
1657 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1658 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1659 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1660
65f13b50
AW
1661 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1662 ovsthread_key_create(&dp->per_pmd_key, NULL);
1663
e9985d6a 1664 ovs_mutex_lock(&dp->port_mutex);
140dd699
IM
1665    /* The non-PMD thread will be created before all other threads and will
1666     * allocate static_tx_qid = 0. */
f2eee189 1667 dp_netdev_set_nonpmd(dp);
65f13b50 1668
a3e8437a
TLSC
1669 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1670 "internal"),
1671 ODPP_LOCAL);
59e6d833 1672 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
1673 if (error) {
1674 dp_netdev_free(dp);
462278db 1675 return error;
72865317
BP
1676 }
1677
a36de779 1678 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 1679 *dpp = dp;
72865317
BP
1680 return 0;
1681}
1682
a6a426d6
IM
1683static void
1684dp_netdev_request_reconfigure(struct dp_netdev *dp)
1685{
1686 seq_change(dp->reconfigure_seq);
1687}
1688
1689static bool
1690dp_netdev_is_reconf_required(struct dp_netdev *dp)
1691{
1692 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1693}
1694
72865317 1695static int
614c4892 1696dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 1697 bool create, struct dpif **dpifp)
72865317 1698{
462278db 1699 struct dp_netdev *dp;
5279f8fd 1700 int error;
462278db 1701
97be1538 1702 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
1703 dp = shash_find_data(&dp_netdevs, name);
1704 if (!dp) {
5279f8fd 1705 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 1706 } else {
5279f8fd
BP
1707 error = (dp->class != class ? EINVAL
1708 : create ? EEXIST
1709 : 0);
1710 }
1711 if (!error) {
1712 *dpifp = create_dpif_netdev(dp);
72865317 1713 }
97be1538 1714 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 1715
5279f8fd 1716 return error;
72865317
BP
1717}
1718
88ace79b
DDP
1719static void
1720dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1721 OVS_NO_THREAD_SAFETY_ANALYSIS
1722{
1723 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1724 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1725
1726 /* Before freeing a lock we should release it */
1727 fat_rwlock_unlock(&dp->upcall_rwlock);
1728 fat_rwlock_destroy(&dp->upcall_rwlock);
1729}
1730
4b27db64
JR
1731static void
1732dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1733 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1734{
1735 if (dp->meters[meter_id]) {
1736 free(dp->meters[meter_id]);
1737 dp->meters[meter_id] = NULL;
1738 }
1739}
1740
9df65060
VDA
1741static uint32_t
1742hash_bond_id(uint32_t bond_id)
1743{
1744 return hash_int(bond_id, 0);
1745}
1746
8a4e3a85
BP
1747/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1748 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
1749static void
1750dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 1751 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 1752{
e9985d6a 1753 struct dp_netdev_port *port, *next;
9df65060 1754 struct tx_bond *bond;
4ad28026 1755
8a4e3a85
BP
1756 shash_find_and_delete(&dp_netdevs, dp->name);
1757
59e6d833 1758 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 1759 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
c40b890f 1760 do_del_port(dp, port);
1ba530f4 1761 }
59e6d833 1762 ovs_mutex_unlock(&dp->port_mutex);
4b27db64 1763
9df65060
VDA
1764 ovs_mutex_lock(&dp->bond_mutex);
1765 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1766 cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1767 ovsrcu_postpone(free, bond);
1768 }
1769 ovs_mutex_unlock(&dp->bond_mutex);
1770
e32971b8 1771 dp_netdev_destroy_all_pmds(dp, true);
d916785c 1772 cmap_destroy(&dp->poll_threads);
51852a57 1773
140dd699
IM
1774 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1775 id_pool_destroy(dp->tx_qid_pool);
1776
b9584f21
DDP
1777 ovs_mutex_destroy(&dp->non_pmd_mutex);
1778 ovsthread_key_delete(dp->per_pmd_key);
1779
57593fd2 1780 conntrack_destroy(dp->conntrack);
b9584f21
DDP
1781
1782
a6a426d6
IM
1783 seq_destroy(dp->reconfigure_seq);
1784
d33ed218 1785 seq_destroy(dp->port_seq);
e9985d6a 1786 hmap_destroy(&dp->ports);
3186ea46 1787 ovs_mutex_destroy(&dp->port_mutex);
88ace79b 1788
9df65060
VDA
1789 cmap_destroy(&dp->tx_bonds);
1790 ovs_mutex_destroy(&dp->bond_mutex);
1791
88ace79b
DDP
1792 /* Upcalls must be disabled at this point */
1793 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 1794
4b27db64
JR
1795 int i;
1796
1797 for (i = 0; i < MAX_METERS; ++i) {
1798 meter_lock(dp, i);
1799 dp_delete_meter(dp, i);
1800 meter_unlock(dp, i);
1801 }
1802 for (i = 0; i < N_METER_LOCKS; ++i) {
1803 ovs_mutex_destroy(&dp->meter_locks[i]);
1804 }
1805
f2eee189 1806 free(dp->pmd_cmask);
8a4e3a85 1807 free(CONST_CAST(char *, dp->name));
72865317
BP
1808 free(dp);
1809}
1810
8a4e3a85
BP
1811static void
1812dp_netdev_unref(struct dp_netdev *dp)
1813{
1814 if (dp) {
1815 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1816 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1817 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 1818 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
1819 dp_netdev_free(dp);
1820 }
1821 ovs_mutex_unlock(&dp_netdev_mutex);
1822 }
1823}
1824
72865317
BP
1825static void
1826dpif_netdev_close(struct dpif *dpif)
1827{
1828 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1829
8a4e3a85 1830 dp_netdev_unref(dp);
72865317
BP
1831 free(dpif);
1832}
1833
1834static int
7dab847a 1835dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
1836{
1837 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1838
6a8267c5 1839 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 1840 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
1841 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1842 OVS_NOT_REACHED();
1843 }
1844 }
5279f8fd 1845
72865317
BP
1846 return 0;
1847}
1848
eb94da30
DDP
1849/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1850 * load/store semantics. While the increment is not atomic, the load and
1851 * store operations are, making it impossible to read inconsistent values.
1852 *
1853 * This is used to update thread-local stats counters. */
1854static void
1855non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1856{
1857 unsigned long long tmp;
1858
1859 atomic_read_relaxed(var, &tmp);
1860 tmp += n;
1861 atomic_store_relaxed(var, tmp);
1862}
1863
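/* Illustrative usage sketch (editorial note; the counter and packet-count
 * names below are hypothetical, not fields defined in this file). Only the
 * owning thread ever writes the counter, so the non-atomic read-modify-write
 * in non_atomic_ullong_add() cannot race with another writer:
 *
 *     non_atomic_ullong_add(&per_thread_counter, n_packets_in_batch);
 *
 * Readers on other threads may observe a slightly stale value, but never a
 * torn one, because the individual load and store are atomic. */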
72865317 1864static int
a8d9304d 1865dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
1866{
1867 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 1868 struct dp_netdev_pmd_thread *pmd;
82a48ead 1869 uint64_t pmd_stats[PMD_N_STATS];
8a4e3a85 1870
1c1e46ed
AW
1871 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1872 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1873 stats->n_flows += cmap_count(&pmd->flow_table);
82a48ead
JS
1874 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1875 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
60d8ccae 1876 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
82a48ead
JS
1877 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1878 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1879 stats->n_lost += pmd_stats[PMD_STAT_LOST];
51852a57 1880 }
1ce3fa06 1881 stats->n_masks = UINT32_MAX;
847108dc 1882 stats->n_mask_hit = UINT64_MAX;
5279f8fd 1883
72865317
BP
1884 return 0;
1885}
1886
e4cfed38 1887static void
65f13b50 1888dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1889{
accf8626 1890 if (pmd->core_id == NON_PMD_CORE_ID) {
d0cca6c3
DDP
1891 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1892 ovs_mutex_lock(&pmd->port_mutex);
1893 pmd_load_cached_ports(pmd);
1894 ovs_mutex_unlock(&pmd->port_mutex);
1895 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
accf8626
AW
1896 return;
1897 }
1898
2788a1b1 1899 seq_change(pmd->reload_seq);
299c8d61 1900 atomic_store_explicit(&pmd->reload, true, memory_order_release);
65f13b50 1901}
e4cfed38 1902
59e6d833
BP
1903static uint32_t
1904hash_port_no(odp_port_t port_no)
1905{
1906 return hash_int(odp_to_u32(port_no), 0);
1907}
1908
72865317 1909static int
a3e8437a 1910port_create(const char *devname, const char *type,
b8d29252 1911 odp_port_t port_no, struct dp_netdev_port **portp)
72865317 1912{
72865317 1913 struct dp_netdev_port *port;
2499a8ce 1914 enum netdev_flags flags;
b8d29252 1915 struct netdev *netdev;
e32971b8 1916 int error;
72865317 1917
b8d29252 1918 *portp = NULL;
72865317
BP
1919
1920 /* Open and validate network device. */
a3e8437a 1921 error = netdev_open(devname, type, &netdev);
72865317 1922 if (error) {
b8d29252 1923 return error;
72865317 1924 }
72865317
BP
1925 /* XXX reject non-Ethernet devices */
1926
2499a8ce
AC
1927 netdev_get_flags(netdev, &flags);
1928 if (flags & NETDEV_LOOPBACK) {
1929 VLOG_ERR("%s: cannot add a loopback device", devname);
d17f4f08 1930 error = EINVAL;
b8d29252 1931 goto out;
2499a8ce
AC
1932 }
1933
e4cfed38 1934 port = xzalloc(sizeof *port);
35303d71 1935 port->port_no = port_no;
e4cfed38
PS
1936 port->netdev = netdev;
1937 port->type = xstrdup(type);
96e74404 1938 port->sf = NULL;
2fbadeb6 1939 port->emc_enabled = true;
e32971b8
DDP
1940 port->need_reconfigure = true;
1941 ovs_mutex_init(&port->txq_used_mutex);
e4cfed38 1942
b8d29252 1943 *portp = port;
72865317
BP
1944
1945 return 0;
d17f4f08 1946
d17f4f08 1947out:
b8d29252 1948 netdev_close(netdev);
d17f4f08 1949 return error;
72865317
BP
1950}
1951
b8d29252
DDP
1952static int
1953do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1954 odp_port_t port_no)
1955 OVS_REQUIRES(dp->port_mutex)
1956{
96e74404 1957 struct netdev_saved_flags *sf;
b8d29252
DDP
1958 struct dp_netdev_port *port;
1959 int error;
1960
1961 /* Reject devices already in 'dp'. */
1962 if (!get_port_by_name(dp, devname, &port)) {
1963 return EEXIST;
1964 }
1965
a3e8437a 1966 error = port_create(devname, type, port_no, &port);
b8d29252
DDP
1967 if (error) {
1968 return error;
1969 }
1970
e9985d6a 1971 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
b8d29252
DDP
1972 seq_change(dp->port_seq);
1973
e32971b8
DDP
1974 reconfigure_datapath(dp);
1975
3f51ea18 1976 /* Check that port was successfully configured. */
96e74404
IM
1977 if (!dp_netdev_lookup_port(dp, port_no)) {
1978 return EINVAL;
1979 }
1980
1981 /* Updating device flags triggers an if_notifier, which triggers a bridge
1982 * reconfiguration and another attempt to add this port, leading to an
1983 * infinite loop if the device is configured incorrectly and cannot be
1984     * added. Set the promisc mode only after a successful reconfiguration,
1985     * since we already know then that the device is properly configured. */
1986 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1987 if (error) {
1988 VLOG_ERR("%s: cannot set promisc flag", devname);
1989 do_del_port(dp, port);
1990 return error;
1991 }
1992 port->sf = sf;
1993
1994 return 0;
b8d29252
DDP
1995}
1996
247527db
BP
1997static int
1998dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1999 odp_port_t *port_nop)
247527db
BP
2000{
2001 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
2002 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2003 const char *dpif_port;
4e022ec0 2004 odp_port_t port_no;
5279f8fd 2005 int error;
247527db 2006
59e6d833 2007 ovs_mutex_lock(&dp->port_mutex);
3aa30359 2008 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 2009 if (*port_nop != ODPP_NONE) {
ff073a71
BP
2010 port_no = *port_nop;
2011 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 2012 } else {
3aa30359 2013 port_no = choose_port(dp, dpif_port);
5279f8fd 2014 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 2015 }
5279f8fd 2016 if (!error) {
247527db 2017 *port_nop = port_no;
5279f8fd 2018 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 2019 }
59e6d833 2020 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
2021
2022 return error;
72865317
BP
2023}
2024
2025static int
4e022ec0 2026dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
2027{
2028 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
2029 int error;
2030
59e6d833 2031 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
2032 if (port_no == ODPP_LOCAL) {
2033 error = EINVAL;
2034 } else {
2035 struct dp_netdev_port *port;
2036
2037 error = get_port_by_number(dp, port_no, &port);
2038 if (!error) {
2039 do_del_port(dp, port);
2040 }
2041 }
59e6d833 2042 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
2043
2044 return error;
72865317
BP
2045}
2046
2047static bool
4e022ec0 2048is_valid_port_number(odp_port_t port_no)
72865317 2049{
ff073a71
BP
2050 return port_no != ODPP_NONE;
2051}
2052
2053static struct dp_netdev_port *
2054dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
e9985d6a 2055 OVS_REQUIRES(dp->port_mutex)
ff073a71
BP
2056{
2057 struct dp_netdev_port *port;
2058
e9985d6a 2059 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
35303d71 2060 if (port->port_no == port_no) {
ff073a71
BP
2061 return port;
2062 }
2063 }
2064 return NULL;
72865317
BP
2065}
2066
2067static int
2068get_port_by_number(struct dp_netdev *dp,
4e022ec0 2069 odp_port_t port_no, struct dp_netdev_port **portp)
e9985d6a 2070 OVS_REQUIRES(dp->port_mutex)
72865317
BP
2071{
2072 if (!is_valid_port_number(port_no)) {
2073 *portp = NULL;
2074 return EINVAL;
2075 } else {
ff073a71 2076 *portp = dp_netdev_lookup_port(dp, port_no);
0f6a066f 2077 return *portp ? 0 : ENODEV;
72865317
BP
2078 }
2079}
2080
b284085e 2081static void
62453dad 2082port_destroy(struct dp_netdev_port *port)
b284085e 2083{
62453dad
DDP
2084 if (!port) {
2085 return;
b284085e 2086 }
b284085e 2087
62453dad
DDP
2088 netdev_close(port->netdev);
2089 netdev_restore_flags(port->sf);
accf8626 2090
62453dad 2091 for (unsigned i = 0; i < port->n_rxq; i++) {
947dc567 2092 netdev_rxq_close(port->rxqs[i].rx);
b284085e 2093 }
324c8374 2094 ovs_mutex_destroy(&port->txq_used_mutex);
3eb67853 2095 free(port->rxq_affinity_list);
324c8374 2096 free(port->txq_used);
3eb67853 2097 free(port->rxqs);
62453dad
DDP
2098 free(port->type);
2099 free(port);
b284085e
PS
2100}
2101
72865317
BP
2102static int
2103get_port_by_name(struct dp_netdev *dp,
2104 const char *devname, struct dp_netdev_port **portp)
59e6d833 2105 OVS_REQUIRES(dp->port_mutex)
72865317
BP
2106{
2107 struct dp_netdev_port *port;
2108
e9985d6a 2109 HMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 2110 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
2111 *portp = port;
2112 return 0;
2113 }
2114 }
0f6a066f
DDP
2115
2116    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2117     * non-existent port. */
2118 return ENODEV;
72865317
BP
2119}
2120
b9584f21 2121/* Returns 'true' if there is a port with pmd netdev. */
65f13b50 2122static bool
b9584f21 2123has_pmd_port(struct dp_netdev *dp)
e9985d6a 2124 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
2125{
2126 struct dp_netdev_port *port;
2127
e9985d6a 2128 HMAP_FOR_EACH (port, node, &dp->ports) {
5dd57e80 2129 if (netdev_is_pmd(port->netdev)) {
b9584f21 2130 return true;
65f13b50
AW
2131 }
2132 }
2133
2134 return false;
2135}
2136
c40b890f
BP
2137static void
2138do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 2139 OVS_REQUIRES(dp->port_mutex)
72865317 2140{
e9985d6a 2141 hmap_remove(&dp->ports, &port->node);
d33ed218 2142 seq_change(dp->port_seq);
d0cca6c3 2143
e32971b8 2144 reconfigure_datapath(dp);
72865317 2145
62453dad 2146 port_destroy(port);
72865317
BP
2147}
2148
2149static void
4c738a8d
BP
2150answer_port_query(const struct dp_netdev_port *port,
2151 struct dpif_port *dpif_port)
72865317 2152{
3efb6063 2153 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 2154 dpif_port->type = xstrdup(port->type);
35303d71 2155 dpif_port->port_no = port->port_no;
72865317
BP
2156}
2157
2158static int
4e022ec0 2159dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 2160 struct dpif_port *dpif_port)
72865317
BP
2161{
2162 struct dp_netdev *dp = get_dp_netdev(dpif);
2163 struct dp_netdev_port *port;
2164 int error;
2165
e9985d6a 2166 ovs_mutex_lock(&dp->port_mutex);
72865317 2167 error = get_port_by_number(dp, port_no, &port);
4afba28d 2168 if (!error && dpif_port) {
4c738a8d 2169 answer_port_query(port, dpif_port);
72865317 2170 }
e9985d6a 2171 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2172
72865317
BP
2173 return error;
2174}
2175
2176static int
2177dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 2178 struct dpif_port *dpif_port)
72865317
BP
2179{
2180 struct dp_netdev *dp = get_dp_netdev(dpif);
2181 struct dp_netdev_port *port;
2182 int error;
2183
59e6d833 2184 ovs_mutex_lock(&dp->port_mutex);
72865317 2185 error = get_port_by_name(dp, devname, &port);
4afba28d 2186 if (!error && dpif_port) {
4c738a8d 2187 answer_port_query(port, dpif_port);
72865317 2188 }
59e6d833 2189 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2190
72865317
BP
2191 return error;
2192}
2193
61e7deb1
BP
2194static void
2195dp_netdev_flow_free(struct dp_netdev_flow *flow)
2196{
61e7deb1 2197 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
342b8904 2198 free(flow->dp_extra_info);
61e7deb1
BP
2199 free(flow);
2200}
2201
ed79f89a
DDP
2202static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2203{
2204 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2205 ovsrcu_postpone(dp_netdev_flow_free, flow);
2206 }
2207}
2208
70e5ed6f
JS
2209static uint32_t
2210dp_netdev_flow_hash(const ovs_u128 *ufid)
2211{
2212 return ufid->u32[0];
2213}
2214
3453b4d6
JS
2215static inline struct dpcls *
2216dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2217 odp_port_t in_port)
2218{
2219 struct dpcls *cls;
2220 uint32_t hash = hash_port_no(in_port);
2221 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2222 if (cls->in_port == in_port) {
2223 /* Port classifier exists already */
2224 return cls;
2225 }
2226 }
2227 return NULL;
2228}
2229
2230static inline struct dpcls *
2231dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2232 odp_port_t in_port)
2233 OVS_REQUIRES(pmd->flow_mutex)
2234{
2235 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2236 uint32_t hash = hash_port_no(in_port);
2237
2238 if (!cls) {
2239 /* Create new classifier for in_port */
2240 cls = xmalloc(sizeof(*cls));
2241 dpcls_init(cls);
2242 cls->in_port = in_port;
2243 cmap_insert(&pmd->classifiers, &cls->node, hash);
2244 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2245 }
2246 return cls;
2247}
2248
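/* Editorial note (not part of the original source): each PMD thread keeps one
 * dpcls classifier per input port in 'pmd->classifiers', keyed by
 * hash_port_no(in_port). dp_netdev_pmd_lookup_dpcls() returns NULL until a
 * flow for that port has been installed; dp_netdev_pmd_find_dpcls() then
 * creates the classifier lazily under 'pmd->flow_mutex'. */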
241bad15
YL
2249#define MAX_FLOW_MARK (UINT32_MAX - 1)
2250#define INVALID_FLOW_MARK (UINT32_MAX)
2251
2252struct megaflow_to_mark_data {
2253 const struct cmap_node node;
2254 ovs_u128 mega_ufid;
2255 uint32_t mark;
2256};
2257
2258struct flow_mark {
2259 struct cmap megaflow_to_mark;
2260 struct cmap mark_to_flow;
2261 struct id_pool *pool;
241bad15
YL
2262};
2263
2264static struct flow_mark flow_mark = {
2265 .megaflow_to_mark = CMAP_INITIALIZER,
2266 .mark_to_flow = CMAP_INITIALIZER,
241bad15
YL
2267};
2268
2269static uint32_t
2270flow_mark_alloc(void)
2271{
2272 uint32_t mark;
2273
2274 if (!flow_mark.pool) {
2275        /* The pool hasn't been initialized yet, do it here. */
2276 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2277 }
2278
2279 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2280 return mark;
2281 }
2282
2283 return INVALID_FLOW_MARK;
2284}
2285
2286static void
2287flow_mark_free(uint32_t mark)
2288{
2289 id_pool_free_id(flow_mark.pool, mark);
2290}
2291
2292/* associate megaflow with a mark, which is a 1:1 mapping */
2293static void
2294megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2295{
2296 size_t hash = dp_netdev_flow_hash(mega_ufid);
2297 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2298
2299 data->mega_ufid = *mega_ufid;
2300 data->mark = mark;
2301
2302 cmap_insert(&flow_mark.megaflow_to_mark,
2303 CONST_CAST(struct cmap_node *, &data->node), hash);
2304}
2305
2306/* disassociate a megaflow from its mark */
2307static void
2308megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2309{
2310 size_t hash = dp_netdev_flow_hash(mega_ufid);
2311 struct megaflow_to_mark_data *data;
2312
2313 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2314 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2315 cmap_remove(&flow_mark.megaflow_to_mark,
2316 CONST_CAST(struct cmap_node *, &data->node), hash);
5752eae4 2317 ovsrcu_postpone(free, data);
241bad15
YL
2318 return;
2319 }
2320 }
2321
2322 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2323 UUID_ARGS((struct uuid *)mega_ufid));
2324}
2325
2326static inline uint32_t
2327megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2328{
2329 size_t hash = dp_netdev_flow_hash(mega_ufid);
2330 struct megaflow_to_mark_data *data;
2331
2332 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2333 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2334 return data->mark;
2335 }
2336 }
2337
5d1765d3
IM
2338 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2339 UUID_ARGS((struct uuid *)mega_ufid));
241bad15
YL
2340 return INVALID_FLOW_MARK;
2341}
2342
2343/* associate a mark with a flow, which is a 1:N mapping */
2344static void
2345mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2346{
2347 dp_netdev_flow_ref(flow);
2348
2349 cmap_insert(&flow_mark.mark_to_flow,
2350 CONST_CAST(struct cmap_node *, &flow->mark_node),
2351 hash_int(mark, 0));
2352 flow->mark = mark;
2353
2354 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2355}
2356
2357static bool
2358flow_mark_has_no_ref(uint32_t mark)
2359{
2360 struct dp_netdev_flow *flow;
2361
2362 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2363 &flow_mark.mark_to_flow) {
2364 if (flow->mark == mark) {
2365 return false;
2366 }
2367 }
2368
2369 return true;
2370}
2371
2372static int
2373mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2374 struct dp_netdev_flow *flow)
2375{
2376 int ret = 0;
2377 uint32_t mark = flow->mark;
2378 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2379 &flow->mark_node);
2380
2381 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2382 flow->mark = INVALID_FLOW_MARK;
2383
2384 /*
2385 * no flow is referencing the mark any more? If so, let's
2386 * remove the flow from hardware and free the mark.
2387 */
2388 if (flow_mark_has_no_ref(mark)) {
30115809 2389 struct netdev *port;
241bad15
YL
2390 odp_port_t in_port = flow->flow.in_port.odp_port;
2391
1061dc7c 2392 port = netdev_ports_get(in_port, pmd->dp->class);
241bad15 2393 if (port) {
e7cb123f
IM
2394 /* Taking a global 'port_mutex' to fulfill thread safety
2395 * restrictions for the netdev-offload-dpdk module. */
2396 ovs_mutex_lock(&pmd->dp->port_mutex);
30115809 2397 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
e7cb123f 2398 ovs_mutex_unlock(&pmd->dp->port_mutex);
30115809 2399 netdev_close(port);
241bad15 2400 }
241bad15
YL
2401
2402 flow_mark_free(mark);
2403 VLOG_DBG("Freed flow mark %u\n", mark);
2404
2405 megaflow_to_mark_disassociate(&flow->mega_ufid);
2406 }
2407 dp_netdev_flow_unref(flow);
2408
2409 return ret;
2410}
2411
2412static void
2413flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2414{
2415 struct dp_netdev_flow *flow;
2416
2417 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2418 if (flow->pmd_id == pmd->core_id) {
02bb2824 2419 queue_netdev_flow_del(pmd, flow);
241bad15
YL
2420 }
2421 }
2422}
2423
aab96ec4
YL
2424static struct dp_netdev_flow *
2425mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2426 const uint32_t mark)
2427{
2428 struct dp_netdev_flow *flow;
2429
2430 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2431 &flow_mark.mark_to_flow) {
2432 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2433 flow->dead == false) {
2434 return flow;
2435 }
2436 }
2437
2438 return NULL;
2439}
2440
02bb2824
YL
2441static struct dp_flow_offload_item *
2442dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2443 struct dp_netdev_flow *flow,
2444 int op)
2445{
2446 struct dp_flow_offload_item *offload;
2447
2448 offload = xzalloc(sizeof(*offload));
2449 offload->pmd = pmd;
2450 offload->flow = flow;
2451 offload->op = op;
2452
2453 dp_netdev_flow_ref(flow);
2454 dp_netdev_pmd_try_ref(pmd);
2455
2456 return offload;
2457}
2458
2459static void
2460dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2461{
2462 dp_netdev_pmd_unref(offload->pmd);
2463 dp_netdev_flow_unref(offload->flow);
2464
2465 free(offload->actions);
2466 free(offload);
2467}
2468
2469static void
2470dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2471{
2472 ovs_mutex_lock(&dp_flow_offload.mutex);
2473 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2474 xpthread_cond_signal(&dp_flow_offload.cond);
2475 ovs_mutex_unlock(&dp_flow_offload.mutex);
2476}
2477
2478static int
2479dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2480{
2481 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2482}
2483
2484/*
2485 * There are two flow offload operations here: addition and modification.
2486 *
2487 * For flow addition, this function does:
2488 * - allocate a new flow mark id
2489 * - perform hardware flow offload
2490 * - associate the flow mark with flow and mega flow
2491 *
2492 * For flow modification, both the flow mark and the associations are still
2493 * valid, thus only the hardware flow offload step (item 2) is needed.
2494 */
2495static int
2496dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2497{
02bb2824 2498 struct dp_netdev_pmd_thread *pmd = offload->pmd;
319a9bb3 2499 const struct dpif_class *dpif_class = pmd->dp->class;
02bb2824
YL
2500 struct dp_netdev_flow *flow = offload->flow;
2501 odp_port_t in_port = flow->flow.in_port.odp_port;
2502 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2503 struct offload_info info;
30115809 2504 struct netdev *port;
02bb2824
YL
2505 uint32_t mark;
2506 int ret;
2507
2508 if (flow->dead) {
2509 return -1;
2510 }
2511
2512 if (modification) {
2513 mark = flow->mark;
2514 ovs_assert(mark != INVALID_FLOW_MARK);
2515 } else {
2516 /*
2517 * If a mega flow has already been offloaded (from other PMD
2518 * instances), do not offload it again.
2519 */
2520 mark = megaflow_to_mark_find(&flow->mega_ufid);
2521 if (mark != INVALID_FLOW_MARK) {
2522 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2523 if (flow->mark != INVALID_FLOW_MARK) {
2524 ovs_assert(flow->mark == mark);
2525 } else {
2526 mark_to_flow_associate(mark, flow);
2527 }
2528 return 0;
2529 }
2530
2531 mark = flow_mark_alloc();
2532 if (mark == INVALID_FLOW_MARK) {
2533 VLOG_ERR("Failed to allocate flow mark!\n");
2534 }
2535 }
2536 info.flow_mark = mark;
319a9bb3 2537 info.dpif_class = dpif_class;
02bb2824 2538
1061dc7c 2539 port = netdev_ports_get(in_port, pmd->dp->class);
30115809
IM
2540 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2541 netdev_close(port);
0a5cba65 2542 goto err_free;
02bb2824 2543 }
e7cb123f
IM
2544 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2545 * the netdev-offload-dpdk module. */
2546 ovs_mutex_lock(&pmd->dp->port_mutex);
30115809 2547 ret = netdev_flow_put(port, &offload->match,
02bb2824
YL
2548 CONST_CAST(struct nlattr *, offload->actions),
2549 offload->actions_len, &flow->mega_ufid, &info,
2550 NULL);
e7cb123f 2551 ovs_mutex_unlock(&pmd->dp->port_mutex);
30115809 2552 netdev_close(port);
02bb2824
YL
2553
2554 if (ret) {
0a5cba65 2555 goto err_free;
02bb2824
YL
2556 }
2557
2558 if (!modification) {
2559 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2560 mark_to_flow_associate(mark, flow);
2561 }
02bb2824 2562 return 0;
0a5cba65
IM
2563
2564err_free:
2565 if (!modification) {
2566 flow_mark_free(mark);
2567 } else {
2568 mark_to_flow_disassociate(pmd, flow);
2569 }
2570 return -1;
02bb2824
YL
2571}
2572
2573static void *
2574dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2575{
2576 struct dp_flow_offload_item *offload;
2577 struct ovs_list *list;
2578 const char *op;
2579 int ret;
2580
2581 for (;;) {
2582 ovs_mutex_lock(&dp_flow_offload.mutex);
2583 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2584 ovsrcu_quiesce_start();
2585 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2586 &dp_flow_offload.mutex);
6c95dbf9 2587 ovsrcu_quiesce_end();
02bb2824
YL
2588 }
2589 list = ovs_list_pop_front(&dp_flow_offload.list);
2590 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2591 ovs_mutex_unlock(&dp_flow_offload.mutex);
2592
2593 switch (offload->op) {
2594 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2595 op = "add";
2596 ret = dp_netdev_flow_offload_put(offload);
2597 break;
2598 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2599 op = "modify";
2600 ret = dp_netdev_flow_offload_put(offload);
2601 break;
2602 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2603 op = "delete";
2604 ret = dp_netdev_flow_offload_del(offload);
2605 break;
2606 default:
2607 OVS_NOT_REACHED();
2608 }
2609
2610 VLOG_DBG("%s to %s netdev flow\n",
2611 ret == 0 ? "succeed" : "failed", op);
2612 dp_netdev_free_flow_offload(offload);
ef32a1a3 2613 ovsrcu_quiesce();
02bb2824
YL
2614 }
2615
2616 return NULL;
2617}
2618
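/* Editorial note (not part of the original source): flow offload requests are
 * queued under 'dp_flow_offload.mutex' via dp_netdev_append_flow_offload()
 * (see queue_netdev_flow_put()/queue_netdev_flow_del() below) and consumed by
 * the single dedicated "dp_netdev_flow_offload" thread above, which enters an
 * RCU quiescent state while waiting on the condition variable so that
 * postponed frees are not held up by an idle queue. */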
2619static void
2620queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2621 struct dp_netdev_flow *flow)
2622{
2623 struct dp_flow_offload_item *offload;
2624
2625 if (ovsthread_once_start(&offload_thread_once)) {
2626 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2627 ovs_thread_create("dp_netdev_flow_offload",
2628 dp_netdev_flow_offload_main, NULL);
2629 ovsthread_once_done(&offload_thread_once);
2630 }
2631
2632 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2633 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2634 dp_netdev_append_flow_offload(offload);
2635}
2636
2637static void
2638queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2639 struct dp_netdev_flow *flow, struct match *match,
2640 const struct nlattr *actions, size_t actions_len)
2641{
2642 struct dp_flow_offload_item *offload;
2643 int op;
2644
2645 if (!netdev_is_flow_api_enabled()) {
2646 return;
2647 }
2648
2649 if (ovsthread_once_start(&offload_thread_once)) {
2650 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2651 ovs_thread_create("dp_netdev_flow_offload",
2652 dp_netdev_flow_offload_main, NULL);
2653 ovsthread_once_done(&offload_thread_once);
2654 }
2655
2656 if (flow->mark != INVALID_FLOW_MARK) {
2657 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2658 } else {
2659 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2660 }
2661 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2662 offload->match = *match;
2663 offload->actions = xmalloc(actions_len);
2664 memcpy(offload->actions, actions, actions_len);
2665 offload->actions_len = actions_len;
2666
2667 dp_netdev_append_flow_offload(offload);
2668}
2669
72865317 2670static void
1c1e46ed
AW
2671dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2672 struct dp_netdev_flow *flow)
2673 OVS_REQUIRES(pmd->flow_mutex)
72865317 2674{
9f361d6b 2675 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
3453b4d6
JS
2676 struct dpcls *cls;
2677 odp_port_t in_port = flow->flow.in_port.odp_port;
2c0ea78f 2678
3453b4d6
JS
2679 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2680 ovs_assert(cls != NULL);
2681 dpcls_remove(cls, &flow->cr);
1c1e46ed 2682 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
241bad15 2683 if (flow->mark != INVALID_FLOW_MARK) {
02bb2824 2684 queue_netdev_flow_del(pmd, flow);
241bad15 2685 }
9bbf1c3d 2686 flow->dead = true;
ed79f89a
DDP
2687
2688 dp_netdev_flow_unref(flow);
72865317
BP
2689}
2690
2691static void
1c1e46ed 2692dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 2693{
78c8df12 2694 struct dp_netdev_flow *netdev_flow;
72865317 2695
1c1e46ed
AW
2696 ovs_mutex_lock(&pmd->flow_mutex);
2697 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2698 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2699 }
1c1e46ed 2700 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
2701}
2702
2703static int
2704dpif_netdev_flow_flush(struct dpif *dpif)
2705{
2706 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
2707 struct dp_netdev_pmd_thread *pmd;
2708
2709 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2710 dp_netdev_pmd_flow_flush(pmd);
2711 }
5279f8fd 2712
72865317
BP
2713 return 0;
2714}
2715
b0ec0f27 2716struct dp_netdev_port_state {
e9985d6a 2717 struct hmap_position position;
4c738a8d 2718 char *name;
b0ec0f27
BP
2719};
2720
2721static int
2722dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2723{
2724 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2725 return 0;
2726}
2727
72865317 2728static int
b0ec0f27 2729dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 2730 struct dpif_port *dpif_port)
72865317 2731{
b0ec0f27 2732 struct dp_netdev_port_state *state = state_;
72865317 2733 struct dp_netdev *dp = get_dp_netdev(dpif);
e9985d6a 2734 struct hmap_node *node;
ff073a71 2735 int retval;
72865317 2736
e9985d6a
DDP
2737 ovs_mutex_lock(&dp->port_mutex);
2738 node = hmap_at_position(&dp->ports, &state->position);
ff073a71
BP
2739 if (node) {
2740 struct dp_netdev_port *port;
5279f8fd 2741
ff073a71
BP
2742 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2743
2744 free(state->name);
2745 state->name = xstrdup(netdev_get_name(port->netdev));
2746 dpif_port->name = state->name;
2747 dpif_port->type = port->type;
35303d71 2748 dpif_port->port_no = port->port_no;
ff073a71
BP
2749
2750 retval = 0;
2751 } else {
2752 retval = EOF;
72865317 2753 }
e9985d6a 2754 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2755
ff073a71 2756 return retval;
b0ec0f27
BP
2757}
2758
2759static int
4c738a8d 2760dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 2761{
4c738a8d
BP
2762 struct dp_netdev_port_state *state = state_;
2763 free(state->name);
b0ec0f27
BP
2764 free(state);
2765 return 0;
72865317
BP
2766}
2767
2768static int
67a4917b 2769dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
2770{
2771 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 2772 uint64_t new_port_seq;
5279f8fd
BP
2773 int error;
2774
d33ed218
BP
2775 new_port_seq = seq_read(dpif->dp->port_seq);
2776 if (dpif->last_port_seq != new_port_seq) {
2777 dpif->last_port_seq = new_port_seq;
5279f8fd 2778 error = ENOBUFS;
72865317 2779 } else {
5279f8fd 2780 error = EAGAIN;
72865317 2781 }
5279f8fd
BP
2782
2783 return error;
72865317
BP
2784}
2785
2786static void
2787dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2788{
2789 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 2790
d33ed218 2791 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
2792}
2793
2794static struct dp_netdev_flow *
0de8783a 2795dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
2796{
2797 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
2798}
2799
9bbf1c3d
DDP
2800static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2801{
2802 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2803}
2804
79df317f
DDP
2805/* netdev_flow_key utilities.
2806 *
2807 * netdev_flow_key is basically a miniflow. We use these functions
2808 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2809 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2810 *
2811 * - Since we are dealing exclusively with miniflows created by
2812 * miniflow_extract(), if the map is different the miniflow is different.
2813 * Therefore we can be faster by comparing the map and the miniflow in a
2814 * single memcmp().
5fcff47b 2815 * - These functions can be inlined by the compiler. */
79df317f 2816
361d808d 2817/* Given the number of bits set in miniflow's maps, returns the size of the
caeb4906 2818 * 'netdev_flow_key.mf' */
361d808d
JR
2819static inline size_t
2820netdev_flow_key_size(size_t flow_u64s)
79df317f 2821{
361d808d 2822 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
79df317f
DDP
2823}
2824
79df317f
DDP
2825static inline bool
2826netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
2827 const struct netdev_flow_key *b)
2828{
caeb4906
JR
2829    /* 'b->len' may not be set yet. */
2830 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
2831}
2832
2833/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
d79a39fe 2834 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
0de8783a
JR
2835 * generated by miniflow_extract. */
2836static inline bool
2837netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2838 const struct miniflow *mf)
79df317f 2839{
caeb4906 2840 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
2841}
2842
2843static inline void
2844netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
2845 const struct netdev_flow_key *src)
2846{
caeb4906
JR
2847 memcpy(dst, src,
2848 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
2849}
2850
0de8783a
JR
2851/* Initialize a netdev_flow_key 'mask' from 'match'. */
2852static inline void
2853netdev_flow_mask_init(struct netdev_flow_key *mask,
2854 const struct match *match)
2855{
09b0fa9c 2856 uint64_t *dst = miniflow_values(&mask->mf);
5fcff47b 2857 struct flowmap fmap;
0de8783a 2858 uint32_t hash = 0;
5fcff47b 2859 size_t idx;
0de8783a
JR
2860
2861 /* Only check masks that make sense for the flow. */
5fcff47b
JR
2862 flow_wc_map(&match->flow, &fmap);
2863 flowmap_init(&mask->mf.map);
0de8783a 2864
5fcff47b
JR
2865 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2866 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
0de8783a 2867
5fcff47b
JR
2868 if (mask_u64) {
2869 flowmap_set(&mask->mf.map, idx, 1);
2870 *dst++ = mask_u64;
2871 hash = hash_add64(hash, mask_u64);
0de8783a 2872 }
0de8783a
JR
2873 }
2874
5fcff47b 2875 map_t map;
0de8783a 2876
5fcff47b
JR
2877 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2878 hash = hash_add64(hash, map);
2879 }
0de8783a 2880
5fcff47b 2881 size_t n = dst - miniflow_get_values(&mask->mf);
0de8783a 2882
d70e8c28 2883 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
2884 mask->len = netdev_flow_key_size(n);
2885}
2886
361d808d 2887/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
0de8783a
JR
2888static inline void
2889netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2890 const struct flow *flow,
2891 const struct netdev_flow_key *mask)
79df317f 2892{
09b0fa9c
JR
2893 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2894 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
0de8783a 2895 uint32_t hash = 0;
d70e8c28 2896 uint64_t value;
0de8783a
JR
2897
2898 dst->len = mask->len;
361d808d 2899 dst->mf = mask->mf; /* Copy maps. */
0de8783a 2900
5fcff47b 2901 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
d70e8c28
JR
2902 *dst_u64 = value & *mask_u64++;
2903 hash = hash_add64(hash, *dst_u64++);
0de8783a 2904 }
09b0fa9c
JR
2905 dst->hash = hash_finish(hash,
2906 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
0de8783a
JR
2907}
2908
9bbf1c3d
DDP
2909static inline bool
2910emc_entry_alive(struct emc_entry *ce)
2911{
2912 return ce->flow && !ce->flow->dead;
2913}
2914
2915static void
2916emc_clear_entry(struct emc_entry *ce)
2917{
2918 if (ce->flow) {
2919 dp_netdev_flow_unref(ce->flow);
2920 ce->flow = NULL;
2921 }
2922}
2923
2924static inline void
2925emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 2926 const struct netdev_flow_key *key)
9bbf1c3d
DDP
2927{
2928 if (ce->flow != flow) {
2929 if (ce->flow) {
2930 dp_netdev_flow_unref(ce->flow);
2931 }
2932
2933 if (dp_netdev_flow_ref(flow)) {
2934 ce->flow = flow;
2935 } else {
2936 ce->flow = NULL;
2937 }
2938 }
0de8783a
JR
2939 if (key) {
2940 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
2941 }
2942}
2943
2944static inline void
0de8783a 2945emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
2946 struct dp_netdev_flow *flow)
2947{
2948 struct emc_entry *to_be_replaced = NULL;
2949 struct emc_entry *current_entry;
2950
0de8783a
JR
2951 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2952 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 2953 /* We found the entry with the 'mf' miniflow */
0de8783a 2954 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
2955 return;
2956 }
2957
2958 /* Replacement policy: put the flow in an empty (not alive) entry, or
2959 * in the first entry where it can be */
2960 if (!to_be_replaced
2961 || (emc_entry_alive(to_be_replaced)
2962 && !emc_entry_alive(current_entry))
0de8783a 2963 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
2964 to_be_replaced = current_entry;
2965 }
2966 }
2967 /* We didn't find the miniflow in the cache.
2968 * The 'to_be_replaced' entry is where the new flow will be stored */
2969
0de8783a 2970 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
2971}
2972
4c30b246
CL
2973static inline void
2974emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2975 const struct netdev_flow_key *key,
2976 struct dp_netdev_flow *flow)
2977{
2978 /* Insert an entry into the EMC based on probability value 'min'. By
2979 * default the value is UINT32_MAX / 100 which yields an insertion
2980     * probability of 1/100, i.e. 1%. */
2981
2fbadeb6 2982 uint32_t min = pmd->ctx.emc_insert_min;
4c30b246 2983
656238ee 2984 if (min && random_uint32() <= min) {
60d8ccae 2985 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
4c30b246
CL
2986 }
2987}
2988
9bbf1c3d 2989static inline struct dp_netdev_flow *
0de8783a 2990emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
2991{
2992 struct emc_entry *current_entry;
2993
0de8783a
JR
2994 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2995 if (current_entry->key.hash == key->hash
2996 && emc_entry_alive(current_entry)
2997 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 2998
0de8783a 2999 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
3000 return current_entry->flow;
3001 }
3002 }
3003
3004 return NULL;
3005}
3006
60d8ccae
YW
3007static inline const struct cmap_node *
3008smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3009{
3010 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3011 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3012 uint16_t sig = hash >> 16;
3013 uint16_t index = UINT16_MAX;
3014
3015 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3016 if (bucket->sig[i] == sig) {
3017 index = bucket->flow_idx[i];
3018 break;
3019 }
3020 }
3021 if (index != UINT16_MAX) {
3022 return cmap_find_by_index(&pmd->flow_table, index);
3023 }
3024 return NULL;
3025}
3026
3027static void
3028smc_clear_entry(struct smc_bucket *b, int idx)
3029{
3030 b->flow_idx[idx] = UINT16_MAX;
3031}
3032
3033/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
3034 * turned off, or 2) the flow_table index is larger than a uint16_t can hold.
3035 * If an SMC entry with the same signature already exists, its index will be
3036 * updated. If there is no such entry but an empty entry is available, the
3037 * empty entry will be taken. If there is neither an empty entry nor one with
3038 * the same signature, a random entry from the hashed bucket will be picked. */
3039static inline void
3040smc_insert(struct dp_netdev_pmd_thread *pmd,
3041 const struct netdev_flow_key *key,
3042 uint32_t hash)
3043{
3044 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3045 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3046 uint16_t index;
3047 uint32_t cmap_index;
3048 bool smc_enable_db;
3049 int i;
3050
3051 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
3052 if (!smc_enable_db) {
3053 return;
3054 }
3055
3056 cmap_index = cmap_find_index(&pmd->flow_table, hash);
3057 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3058
3059 /* If the index is larger than SMC can handle (uint16_t), we don't
3060 * insert */
3061 if (index == UINT16_MAX) {
3062 return;
3063 }
3064
3065    /* If an entry with the same signature already exists, update the index. */
3066 uint16_t sig = key->hash >> 16;
3067 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3068 if (bucket->sig[i] == sig) {
3069 bucket->flow_idx[i] = index;
3070 return;
3071 }
3072 }
3073 /* If there is an empty entry, occupy it. */
3074 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3075 if (bucket->flow_idx[i] == UINT16_MAX) {
3076 bucket->sig[i] = sig;
3077 bucket->flow_idx[i] = index;
3078 return;
3079 }
3080 }
3081 /* Otherwise, pick a random entry. */
3082 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3083 bucket->sig[i] = sig;
3084 bucket->flow_idx[i] = index;
3085}
3086
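/* Worked example (editorial note, not part of the original source): for a
 * packet whose flow key hash is 0xABCD1234, smc_insert() above uses bucket
 * buckets[0xABCD1234 & SMC_MASK] and signature 0xABCD (the top 16 bits of the
 * hash), and stores the flow's cmap index, which must fit in a uint16_t or
 * the insertion is skipped. smc_entry_get() recomputes the same bucket and
 * signature and resolves the stored index back to a flow with
 * cmap_find_by_index(). */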
72865317 3087static struct dp_netdev_flow *
3453b4d6
JS
3088dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3089 const struct netdev_flow_key *key,
3090 int *lookup_num_p)
2c0ea78f 3091{
3453b4d6 3092 struct dpcls *cls;
0de8783a 3093 struct dpcls_rule *rule;
f825fdd4
BP
3094 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3095 in_port.odp_port));
3453b4d6 3096 struct dp_netdev_flow *netdev_flow = NULL;
2c0ea78f 3097
3453b4d6
JS
3098 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3099 if (OVS_LIKELY(cls)) {
60d8ccae 3100 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3453b4d6
JS
3101 netdev_flow = dp_netdev_flow_cast(rule);
3102 }
8a4e3a85 3103 return netdev_flow;
2c0ea78f
GS
3104}
3105
3106static struct dp_netdev_flow *
1c1e46ed
AW
3107dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3108 const ovs_u128 *ufidp, const struct nlattr *key,
3109 size_t key_len)
72865317 3110{
1763b4b8 3111 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
3112 struct flow flow;
3113 ovs_u128 ufid;
3114
3115 /* If a UFID is not provided, determine one based on the key. */
3116 if (!ufidp && key && key_len
f0fb825a 3117 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
7a5e0ee7 3118 odp_flow_key_hash(&flow, sizeof flow, &ufid);
70e5ed6f
JS
3119 ufidp = &ufid;
3120 }
72865317 3121
70e5ed6f
JS
3122 if (ufidp) {
3123 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 3124 &pmd->flow_table) {
2ff8484b 3125 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
70e5ed6f
JS
3126 return netdev_flow;
3127 }
72865317
BP
3128 }
3129 }
8a4e3a85 3130
72865317
BP
3131 return NULL;
3132}
3133
a309e4f5
OM
3134static bool
3135dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3136 const struct dp_netdev_flow *netdev_flow,
3137 struct dpif_flow_stats *stats,
3138 struct dpif_flow_attrs *attrs)
3139{
3140 uint64_t act_buf[1024 / 8];
3141 struct nlattr *actions;
3142 struct netdev *netdev;
3143 struct match match;
3144 struct ofpbuf buf;
3145
3146 int ret = 0;
3147
3148 if (!netdev_is_flow_api_enabled()) {
3149 return false;
3150 }
3151
3152 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port, dp->class);
3153 if (!netdev) {
3154 return false;
3155 }
3156 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3157 /* Taking a global 'port_mutex' to fulfill thread safety
3158 * restrictions for the netdev-offload-dpdk module. */
3159 ovs_mutex_lock(&dp->port_mutex);
3160 ret = netdev_flow_get(netdev, &match, &actions, &netdev_flow->mega_ufid,
3161 stats, attrs, &buf);
3162 ovs_mutex_unlock(&dp->port_mutex);
3163 netdev_close(netdev);
3164 if (ret) {
3165 return false;
3166 }
3167
3168 return true;
3169}
3170
72865317 3171static void
a309e4f5
OM
3172get_dpif_flow_status(const struct dp_netdev *dp,
3173 const struct dp_netdev_flow *netdev_flow_,
3174 struct dpif_flow_stats *stats,
3175 struct dpif_flow_attrs *attrs)
feebdea2 3176{
a309e4f5
OM
3177 struct dpif_flow_stats offload_stats;
3178 struct dpif_flow_attrs offload_attrs;
eb94da30
DDP
3179 struct dp_netdev_flow *netdev_flow;
3180 unsigned long long n;
3181 long long used;
3182 uint16_t flags;
3183
3184 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3185
3186 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3187 stats->n_packets = n;
3188 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3189 stats->n_bytes = n;
3190 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3191 stats->used = used;
3192 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3193 stats->tcp_flags = flags;
a309e4f5
OM
3194
3195 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3196 &offload_stats, &offload_attrs)) {
3197 stats->n_packets += offload_stats.n_packets;
3198 stats->n_bytes += offload_stats.n_bytes;
3199 stats->used = MAX(stats->used, offload_stats.used);
3200 stats->tcp_flags |= offload_stats.tcp_flags;
3201 if (attrs) {
3202 attrs->offloaded = offload_attrs.offloaded;
3203 attrs->dp_layer = offload_attrs.dp_layer;
3204 }
3205 } else if (attrs) {
3206 attrs->offloaded = false;
3207 attrs->dp_layer = "ovs";
3208 }
72865317
BP
3209}
3210
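/* Editorial note (not part of the original source): for a flow that is also
 * offloaded to hardware, the statistics reported above are the sum of the
 * datapath counters and the counters returned by the netdev offload provider,
 * with 'used' taken as the most recent of the two timestamps and the TCP
 * flags OR'ed together. */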
7af12bd7
JS
3211/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3212 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3213 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3214 * protect them. */
6fe09f8c 3215static void
a309e4f5
OM
3216dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3217 const struct dp_netdev_flow *netdev_flow,
7af12bd7 3218 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 3219 struct dpif_flow *flow, bool terse)
6fe09f8c 3220{
64bb477f
JS
3221 if (terse) {
3222 memset(flow, 0, sizeof *flow);
3223 } else {
3224 struct flow_wildcards wc;
3225 struct dp_netdev_actions *actions;
3226 size_t offset;
5262eea1
JG
3227 struct odp_flow_key_parms odp_parms = {
3228 .flow = &netdev_flow->flow,
3229 .mask = &wc.masks,
2494ccd7 3230 .support = dp_netdev_support,
5262eea1 3231 };
64bb477f
JS
3232
3233 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
f4b835bb
JR
3234        /* in_port is exact-matched, but we have left it out of the mask for
3235         * optimization reasons. Add in_port back to the mask. */
3236 wc.masks.in_port.odp_port = ODPP_NONE;
64bb477f
JS
3237
3238 /* Key */
6fd6ed71 3239 offset = key_buf->size;
64bb477f 3240 flow->key = ofpbuf_tail(key_buf);
5262eea1 3241 odp_flow_key_from_flow(&odp_parms, key_buf);
6fd6ed71 3242 flow->key_len = key_buf->size - offset;
64bb477f
JS
3243
3244 /* Mask */
6fd6ed71 3245 offset = mask_buf->size;
64bb477f 3246 flow->mask = ofpbuf_tail(mask_buf);
ec1f6f32 3247 odp_parms.key_buf = key_buf;
5262eea1 3248 odp_flow_key_from_mask(&odp_parms, mask_buf);
6fd6ed71 3249 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
3250
3251 /* Actions */
3252 actions = dp_netdev_flow_get_actions(netdev_flow);
3253 flow->actions = actions->actions;
3254 flow->actions_len = actions->size;
3255 }
6fe09f8c 3256
70e5ed6f
JS
3257 flow->ufid = netdev_flow->ufid;
3258 flow->ufid_present = true;
1c1e46ed 3259 flow->pmd_id = netdev_flow->pmd_id;
0d6b401c 3260
a309e4f5 3261 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
342b8904 3262 flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
6fe09f8c
JS
3263}
3264
36956a7d 3265static int
8c301900
JR
3266dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3267 const struct nlattr *mask_key,
3268 uint32_t mask_key_len, const struct flow *flow,
f0fb825a 3269 struct flow_wildcards *wc, bool probe)
8c301900 3270{
ca8d3442
DDP
3271 enum odp_key_fitness fitness;
3272
d40533fc 3273 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
ca8d3442 3274 if (fitness) {
f0fb825a
EG
3275 if (!probe) {
3276 /* This should not happen: it indicates that
3277 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3278 * disagree on the acceptable form of a mask. Log the problem
3279 * as an error, with enough details to enable debugging. */
3280 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3281
3282 if (!VLOG_DROP_ERR(&rl)) {
3283 struct ds s;
3284
3285 ds_init(&s);
3286 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3287 true);
3288 VLOG_ERR("internal error parsing flow mask %s (%s)",
3289 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3290 ds_destroy(&s);
3291 }
8c301900 3292 }
ca8d3442
DDP
3293
3294 return EINVAL;
8c301900
JR
3295 }
3296
3297 return 0;
3298}
3299
3300static int
3301dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
f0fb825a 3302 struct flow *flow, bool probe)
36956a7d 3303{
d40533fc 3304 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
f0fb825a
EG
3305 if (!probe) {
3306 /* This should not happen: it indicates that
3307 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3308 * the acceptable form of a flow. Log the problem as an error,
3309 * with enough details to enable debugging. */
3310 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3311
3312 if (!VLOG_DROP_ERR(&rl)) {
3313 struct ds s;
3314
3315 ds_init(&s);
3316 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3317 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3318 ds_destroy(&s);
3319 }
36956a7d
BP
3320 }
3321
3322 return EINVAL;
3323 }
3324
5cf3edb3 3325 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
07659514
JS
3326 return EINVAL;
3327 }
3328
36956a7d
BP
3329 return 0;
3330}
3331
72865317 3332static int
6fe09f8c 3333dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
3334{
3335 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 3336 struct dp_netdev_flow *netdev_flow;
1c1e46ed 3337 struct dp_netdev_pmd_thread *pmd;
c673049c
IM
3338 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3339 struct hmapx_node *node;
3340 int error = EINVAL;
3341
3342 if (get->pmd_id == PMD_ID_NULL) {
3343 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3344 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3345 dp_netdev_pmd_unref(pmd);
3346 }
3347 }
3348 } else {
3349 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3350 if (!pmd) {
3351 goto out;
3352 }
3353 hmapx_add(&to_find, pmd);
1c1e46ed
AW
3354 }
3355
c673049c
IM
3356 if (!hmapx_count(&to_find)) {
3357 goto out;
72865317 3358 }
1c1e46ed 3359
c673049c
IM
3360 HMAPX_FOR_EACH (node, &to_find) {
3361 pmd = (struct dp_netdev_pmd_thread *) node->data;
3362 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3363 get->key_len);
3364 if (netdev_flow) {
a309e4f5
OM
3365 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3366 get->buffer, get->flow, false);
c673049c
IM
3367 error = 0;
3368 break;
3369 } else {
3370 error = ENOENT;
3371 }
3372 }
bc4a05c6 3373
c673049c
IM
3374 HMAPX_FOR_EACH (node, &to_find) {
3375 pmd = (struct dp_netdev_pmd_thread *) node->data;
3376 dp_netdev_pmd_unref(pmd);
3377 }
3378out:
3379 hmapx_destroy(&to_find);
5279f8fd 3380 return error;
72865317
BP
3381}
3382
241bad15
YL
3383static void
3384dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3385{
3386 struct flow masked_flow;
3387 size_t i;
3388
3389 for (i = 0; i < sizeof(struct flow); i++) {
3390 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3391 ((uint8_t *)&match->wc)[i];
3392 }
7a5e0ee7 3393 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
241bad15
YL
3394}
3395
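/* Illustrative aside, not part of this file: dp_netdev_get_mega_ufid() above
 * derives a "mega" UFID by AND-ing the flow with its wildcard mask byte by
 * byte and hashing the result, so flows that differ only in wildcarded bits
 * share one mega UFID.  The masking step in isolation (hypothetical names,
 * plain arrays instead of struct flow / struct match): */
#include <stddef.h>
#include <stdint.h>

static void
mask_flow_bytes(const uint8_t *flow, const uint8_t *mask,
                uint8_t *masked, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        masked[i] = flow[i] & mask[i];  /* Wildcarded (zero-mask) bytes drop out. */
    }
}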
0de8783a 3396static struct dp_netdev_flow *
1c1e46ed
AW
3397dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3398 struct match *match, const ovs_u128 *ufid,
ae2ceebd 3399 const struct nlattr *actions, size_t actions_len)
1c1e46ed 3400 OVS_REQUIRES(pmd->flow_mutex)
72865317 3401{
342b8904 3402 struct ds extra_info = DS_EMPTY_INITIALIZER;
0de8783a
JR
3403 struct dp_netdev_flow *flow;
3404 struct netdev_flow_key mask;
3453b4d6 3405 struct dpcls *cls;
342b8904 3406 size_t unit;
f4b835bb
JR
3407
3408 /* Make sure in_port is exact matched before we read it. */
3409 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3453b4d6 3410 odp_port_t in_port = match->flow.in_port.odp_port;
ed79f89a 3411
f4b835bb
JR
3412 /* As we select the dpcls based on the port number, each netdev flow
3413 * belonging to the same dpcls will have the same odp_port value.
3414 * For performance reasons we wildcard odp_port here in the mask. In the
3415 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3416 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3417 * will not be part of the subtable mask.
3418 * This will speed up the hash computation during dpcls_lookup() because
3419 * there is one less call to hash_add64() in this case. */
3420 match->wc.masks.in_port.odp_port = 0;
0de8783a 3421 netdev_flow_mask_init(&mask, match);
f4b835bb
JR
3422 match->wc.masks.in_port.odp_port = ODPP_NONE;
3423
0de8783a 3424 /* Make sure wc does not have metadata. */
5fcff47b
JR
3425 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3426 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
679ba04c 3427
0de8783a 3428 /* Do not allocate extra space. */
caeb4906 3429 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 3430 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 3431 flow->dead = false;
11e5cf1f 3432 flow->batch = NULL;
241bad15 3433 flow->mark = INVALID_FLOW_MARK;
bd5131ba 3434 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 3435 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 3436 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 3437 ovs_refcount_init(&flow->ref_cnt);
0de8783a 3438 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 3439
241bad15 3440 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
0de8783a 3441 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3453b4d6 3442
f4b835bb 3443 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3453b4d6
JS
3444 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3445 dpcls_insert(cls, &flow->cr, &mask);
72865317 3446
342b8904
IM
3447 ds_put_cstr(&extra_info, "miniflow_bits(");
3448 FLOWMAP_FOR_EACH_UNIT (unit) {
3449 if (unit) {
3450 ds_put_char(&extra_info, ',');
3451 }
3452 ds_put_format(&extra_info, "%d",
3453 count_1bits(flow->cr.mask->mf.map.bits[unit]));
3454 }
3455 ds_put_char(&extra_info, ')');
3456 flow->dp_extra_info = ds_steal_cstr(&extra_info);
3457 ds_destroy(&extra_info);
3458
4c75aaab
EJ
3459 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3460 dp_netdev_flow_hash(&flow->ufid));
3461
02bb2824 3462 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
241bad15 3463
beb75a40 3464 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
623540e4 3465 struct ds ds = DS_EMPTY_INITIALIZER;
9044f2c1
JG
3466 struct ofpbuf key_buf, mask_buf;
3467 struct odp_flow_key_parms odp_parms = {
3468 .flow = &match->flow,
3469 .mask = &match->wc.masks,
3470 .support = dp_netdev_support,
3471 };
3472
3473 ofpbuf_init(&key_buf, 0);
3474 ofpbuf_init(&mask_buf, 0);
623540e4 3475
9044f2c1
JG
3476 odp_flow_key_from_flow(&odp_parms, &key_buf);
3477 odp_parms.key_buf = &key_buf;
3478 odp_flow_key_from_mask(&odp_parms, &mask_buf);
0de8783a 3479
623540e4 3480 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
3481 odp_format_ufid(ufid, &ds);
3482 ds_put_cstr(&ds, " ");
9044f2c1
JG
3483 odp_flow_format(key_buf.data, key_buf.size,
3484 mask_buf.data, mask_buf.size,
3485 NULL, &ds, false);
623540e4 3486 ds_put_cstr(&ds, ", actions:");
0722f341 3487 format_odp_actions(&ds, actions, actions_len, NULL);
623540e4 3488
beb75a40 3489 VLOG_DBG("%s", ds_cstr(&ds));
623540e4 3490
9044f2c1
JG
3491 ofpbuf_uninit(&key_buf);
3492 ofpbuf_uninit(&mask_buf);
beb75a40
JS
3493
3494 /* Add a printout of the actual match installed. */
3495 struct match m;
3496 ds_clear(&ds);
3497 ds_put_cstr(&ds, "flow match: ");
3498 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3499 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
b2f4b622 3500 memset(&m.tun_md, 0, sizeof m.tun_md);
beb75a40
JS
3501 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3502
3503 VLOG_DBG("%s", ds_cstr(&ds));
3504
623540e4
EJ
3505 ds_destroy(&ds);
3506 }
3507
0de8783a 3508 return flow;
72865317
BP
3509}
3510
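/* Illustrative note, not part of this file: the dp_extra_info string built in
 * dp_netdev_flow_add() above ends up looking like "miniflow_bits(4,1)", where
 * each number is the count of 1-bits in one FLOWMAP unit of the subtable
 * mask; the actual values depend on the installed flow's mask. */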
72865317 3511static int
f5d317a1
DDP
3512flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3513 struct netdev_flow_key *key,
3514 struct match *match,
3515 ovs_u128 *ufid,
3516 const struct dpif_flow_put *put,
3517 struct dpif_flow_stats *stats)
72865317 3518{
1763b4b8 3519 struct dp_netdev_flow *netdev_flow;
f5d317a1 3520 int error = 0;
72865317 3521
f5d317a1
DDP
3522 if (stats) {
3523 memset(stats, 0, sizeof *stats);
70e5ed6f
JS
3524 }
3525
1c1e46ed 3526 ovs_mutex_lock(&pmd->flow_mutex);
f5d317a1 3527 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
1763b4b8 3528 if (!netdev_flow) {
89625d1e 3529 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 3530 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
f5d317a1 3531 dp_netdev_flow_add(pmd, match, ufid, put->actions,
70e5ed6f 3532 put->actions_len);
0de8783a 3533 error = 0;
72865317 3534 } else {
5279f8fd 3535 error = EFBIG;
72865317
BP
3536 }
3537 } else {
5279f8fd 3538 error = ENOENT;
72865317
BP
3539 }
3540 } else {
beb75a40 3541 if (put->flags & DPIF_FP_MODIFY) {
8a4e3a85
BP
3542 struct dp_netdev_actions *new_actions;
3543 struct dp_netdev_actions *old_actions;
3544
3545 new_actions = dp_netdev_actions_create(put->actions,
3546 put->actions_len);
3547
61e7deb1
BP
3548 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3549 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 3550
02bb2824
YL
3551 queue_netdev_flow_put(pmd, netdev_flow, match,
3552 put->actions, put->actions_len);
241bad15 3553
f5d317a1 3554 if (stats) {
a309e4f5 3555 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
a84cb64a
BP
3556 }
3557 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
3558 /* XXX: The userspace datapath uses thread-local statistics
3559 * (for flows), which should be updated only by the owning
3560 * thread. Since we cannot write to the stats memory here,
3561 * we choose not to support this flag. Please note:
3562 * - This feature is currently used only by dpctl commands with
3563 * option --clear.
3564 * - Should the need arise, this operation can be implemented
3565 * by keeping a base value (to be updated here) for each
3566 * counter and subtracting it before outputting the stats. */
3567 error = EOPNOTSUPP;
72865317 3568 }
8a4e3a85 3569
61e7deb1 3570 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 3571 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 3572 error = EEXIST;
2c0ea78f
GS
3573 } else {
3574 /* Overlapping flow. */
3575 error = EINVAL;
72865317
BP
3576 }
3577 }
1c1e46ed 3578 ovs_mutex_unlock(&pmd->flow_mutex);
5279f8fd 3579 return error;
72865317
BP
3580}
3581
72865317 3582static int
f5d317a1 3583dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
3584{
3585 struct dp_netdev *dp = get_dp_netdev(dpif);
beb75a40 3586 struct netdev_flow_key key, mask;
1c1e46ed 3587 struct dp_netdev_pmd_thread *pmd;
f5d317a1
DDP
3588 struct match match;
3589 ovs_u128 ufid;
3590 int error;
f0fb825a 3591 bool probe = put->flags & DPIF_FP_PROBE;
72865317 3592
f5d317a1
DDP
3593 if (put->stats) {
3594 memset(put->stats, 0, sizeof *put->stats);
3595 }
f0fb825a
EG
3596 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3597 probe);
f5d317a1
DDP
3598 if (error) {
3599 return error;
3600 }
3601 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3602 put->mask, put->mask_len,
f0fb825a 3603 &match.flow, &match.wc, probe);
f5d317a1
DDP
3604 if (error) {
3605 return error;
1c1e46ed
AW
3606 }
3607
f5d317a1
DDP
3608 if (put->ufid) {
3609 ufid = *put->ufid;
3610 } else {
7a5e0ee7 3611 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
f5d317a1
DDP
3612 }
3613
35fe9efb
IM
3614 /* The Netlink encoding of datapath flow keys cannot express
3615 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3616 * tag is interpreted as exact match on the fact that there is no
3617 * VLAN. Unless we refactor a lot of code that translates between
3618 * Netlink and struct flow representations, we have to do the same
3619 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3620 if (!match.wc.masks.vlans[0].tci) {
3621 match.wc.masks.vlans[0].tci = htons(0xffff);
3622 }
3623
f5d317a1 3624 /* Must produce a netdev_flow_key for lookup.
beb75a40
JS
3625 * Use the same method as employed to create the key when adding
3626 * the flow to the dpcls to make sure they match. */
3627 netdev_flow_mask_init(&mask, &match);
3628 netdev_flow_key_init_masked(&key, &match.flow, &mask);
f5d317a1
DDP
3629
3630 if (put->pmd_id == PMD_ID_NULL) {
3631 if (cmap_count(&dp->poll_threads) == 0) {
3632 return EINVAL;
3633 }
3634 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3635 struct dpif_flow_stats pmd_stats;
3636 int pmd_error;
3637
3638 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3639 &pmd_stats);
3640 if (pmd_error) {
3641 error = pmd_error;
3642 } else if (put->stats) {
3643 put->stats->n_packets += pmd_stats.n_packets;
3644 put->stats->n_bytes += pmd_stats.n_bytes;
3645 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3646 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3647 }
3648 }
3649 } else {
3650 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3651 if (!pmd) {
3652 return EINVAL;
3653 }
3654 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3655 dp_netdev_pmd_unref(pmd);
3656 }
3657
3658 return error;
3659}
3660
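/* Illustrative aside, not part of this file: when put->pmd_id is PMD_ID_NULL,
 * dpif_netdev_flow_put() above applies the operation on every PMD thread and
 * folds the per-thread statistics together -- counters are summed, 'used'
 * keeps the most recent timestamp, and TCP flags are OR-ed.  The merge rule
 * on its own (hypothetical struct; assumes <stdint.h>): */
struct pmd_stats_sketch {
    uint64_t n_packets;
    uint64_t n_bytes;
    long long used;          /* Most recent use time, in milliseconds. */
    uint16_t tcp_flags;
};

static void
pmd_stats_sketch_merge(struct pmd_stats_sketch *dst,
                       const struct pmd_stats_sketch *src)
{
    dst->n_packets += src->n_packets;
    dst->n_bytes += src->n_bytes;
    dst->used = dst->used >= src->used ? dst->used : src->used;
    dst->tcp_flags |= src->tcp_flags;
}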
3661static int
3662flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3663 struct dpif_flow_stats *stats,
3664 const struct dpif_flow_del *del)
3665{
3666 struct dp_netdev_flow *netdev_flow;
3667 int error = 0;
3668
1c1e46ed
AW
3669 ovs_mutex_lock(&pmd->flow_mutex);
3670 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3671 del->key_len);
1763b4b8 3672 if (netdev_flow) {
f5d317a1 3673 if (stats) {
a309e4f5 3674 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
feebdea2 3675 }
1c1e46ed 3676 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 3677 } else {
5279f8fd 3678 error = ENOENT;
72865317 3679 }
1c1e46ed 3680 ovs_mutex_unlock(&pmd->flow_mutex);
f5d317a1
DDP
3681
3682 return error;
3683}
3684
3685static int
3686dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3687{
3688 struct dp_netdev *dp = get_dp_netdev(dpif);
3689 struct dp_netdev_pmd_thread *pmd;
3690 int error = 0;
3691
3692 if (del->stats) {
3693 memset(del->stats, 0, sizeof *del->stats);
3694 }
3695
3696 if (del->pmd_id == PMD_ID_NULL) {
3697 if (cmap_count(&dp->poll_threads) == 0) {
3698 return EINVAL;
3699 }
3700 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3701 struct dpif_flow_stats pmd_stats;
3702 int pmd_error;
3703
3704 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3705 if (pmd_error) {
3706 error = pmd_error;
3707 } else if (del->stats) {
3708 del->stats->n_packets += pmd_stats.n_packets;
3709 del->stats->n_bytes += pmd_stats.n_bytes;
3710 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3711 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3712 }
3713 }
3714 } else {
3715 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3716 if (!pmd) {
3717 return EINVAL;
3718 }
3719 error = flow_del_on_pmd(pmd, del->stats, del);
3720 dp_netdev_pmd_unref(pmd);
3721 }
3722
5279f8fd
BP
3723
3724 return error;
72865317
BP
3725}
3726
ac64794a
BP
3727struct dpif_netdev_flow_dump {
3728 struct dpif_flow_dump up;
1c1e46ed
AW
3729 struct cmap_position poll_thread_pos;
3730 struct cmap_position flow_pos;
3731 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
3732 int status;
3733 struct ovs_mutex mutex;
e723fd32
JS
3734};
3735
ac64794a
BP
3736static struct dpif_netdev_flow_dump *
3737dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 3738{
ac64794a 3739 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
3740}
3741
ac64794a 3742static struct dpif_flow_dump *
7e8b7199 3743dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
a692410a 3744 struct dpif_flow_dump_types *types OVS_UNUSED)
e723fd32 3745{
ac64794a 3746 struct dpif_netdev_flow_dump *dump;
e723fd32 3747
1c1e46ed 3748 dump = xzalloc(sizeof *dump);
ac64794a 3749 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 3750 dump->up.terse = terse;
ac64794a
BP
3751 ovs_mutex_init(&dump->mutex);
3752
3753 return &dump->up;
e723fd32
JS
3754}
3755
3756static int
ac64794a 3757dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 3758{
ac64794a 3759 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 3760
ac64794a
BP
3761 ovs_mutex_destroy(&dump->mutex);
3762 free(dump);
704a1e09
BP
3763 return 0;
3764}
3765
ac64794a
BP
3766struct dpif_netdev_flow_dump_thread {
3767 struct dpif_flow_dump_thread up;
3768 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
3769 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3770 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
3771};
3772
3773static struct dpif_netdev_flow_dump_thread *
3774dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3775{
3776 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3777}
3778
3779static struct dpif_flow_dump_thread *
3780dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3781{
3782 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3783 struct dpif_netdev_flow_dump_thread *thread;
3784
3785 thread = xmalloc(sizeof *thread);
3786 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3787 thread->dump = dump;
3788 return &thread->up;
3789}
3790
3791static void
3792dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3793{
3794 struct dpif_netdev_flow_dump_thread *thread
3795 = dpif_netdev_flow_dump_thread_cast(thread_);
3796
3797 free(thread);
3798}
3799
704a1e09 3800static int
ac64794a 3801dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 3802 struct dpif_flow *flows, int max_flows)
ac64794a
BP
3803{
3804 struct dpif_netdev_flow_dump_thread *thread
3805 = dpif_netdev_flow_dump_thread_cast(thread_);
3806 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 3807 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
a309e4f5
OM
3808 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3809 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
8bb113da
RW
3810 int n_flows = 0;
3811 int i;
14608a15 3812
ac64794a 3813 ovs_mutex_lock(&dump->mutex);
8bb113da 3814 if (!dump->status) {
1c1e46ed
AW
3815 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3816 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3817
3818 /* First call to dump_next(), extracts the first pmd thread.
3819 * If there is no pmd thread, returns immediately. */
3820 if (!pmd) {
3821 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3822 if (!pmd) {
3823 ovs_mutex_unlock(&dump->mutex);
3824 return n_flows;
8bb113da 3825
8bb113da 3826 }
d2ad7ef1 3827 }
1c1e46ed
AW
3828
3829 do {
3830 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3831 struct cmap_node *node;
3832
3833 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3834 if (!node) {
3835 break;
3836 }
3837 netdev_flows[n_flows] = CONTAINER_OF(node,
3838 struct dp_netdev_flow,
3839 node);
3840 }
3841 /* When finishing dumping the current pmd thread, moves to
3842 * the next. */
3843 if (n_flows < flow_limit) {
3844 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3845 dp_netdev_pmd_unref(pmd);
3846 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3847 if (!pmd) {
3848 dump->status = EOF;
3849 break;
3850 }
3851 }
3852 /* Keeps the reference to next caller. */
3853 dump->cur_pmd = pmd;
3854
3855 /* If the current dump is empty, do not exit the loop, since the
3856 * remaining pmds could have flows to be dumped. Just dumps again
3857 * on the new 'pmd'. */
3858 } while (!n_flows);
8a4e3a85 3859 }
ac64794a 3860 ovs_mutex_unlock(&dump->mutex);
ac64794a 3861
8bb113da
RW
3862 for (i = 0; i < n_flows; i++) {
3863 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3864 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3865 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3866 struct dpif_flow *f = &flows[i];
7af12bd7 3867 struct ofpbuf key, mask;
8bb113da 3868
7af12bd7
JS
3869 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3870 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
a309e4f5 3871 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
64bb477f 3872 dump->up.terse);
8bb113da 3873 }
feebdea2 3874
8bb113da 3875 return n_flows;
72865317
BP
3876}
3877
3878static int
758c456d 3879dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 3880 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
3881{
3882 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 3883 struct dp_netdev_pmd_thread *pmd;
1895cc8d 3884 struct dp_packet_batch pp;
72865317 3885
cf62fa4c
PS
3886 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3887 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
3888 return EINVAL;
3889 }
3890
65f13b50
AW
3891 /* Tries finding the 'pmd'. If NULL is returned, that means
3892 * the current thread is a non-pmd thread and should use
b19befae 3893 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
3894 pmd = ovsthread_getspecific(dp->per_pmd_key);
3895 if (!pmd) {
b19befae 3896 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
546e57d4
DDP
3897 if (!pmd) {
3898 return EBUSY;
3899 }
65f13b50
AW
3900 }
3901
05267613
AZ
3902 if (execute->probe) {
3903 /* If this is part of a probe, drop the packet, since executing
3904 * the action may actually cause spurious packets to be sent into
3905 * the network. */
d1ce9c20
YS
3906 if (pmd->core_id == NON_PMD_CORE_ID) {
3907 dp_netdev_pmd_unref(pmd);
3908 }
05267613
AZ
3909 return 0;
3910 }
3911
65f13b50
AW
3912 /* If the current thread is non-pmd thread, acquires
3913 * the 'non_pmd_mutex'. */
3914 if (pmd->core_id == NON_PMD_CORE_ID) {
3915 ovs_mutex_lock(&dp->non_pmd_mutex);
3916 }
1c1e46ed 3917
2fbadeb6
IM
3918 /* Update current time in PMD context. We don't care about EMC insertion
3919 * probability, because we are on a slow path. */
b010be17
IM
3920 pmd_thread_ctx_time_update(pmd);
3921
36d8de17
DDP
3922 /* The action processing expects the RSS hash to be valid, because
3923 * it's always initialized at the beginning of datapath processing.
3924 * In this case, though, 'execute->packet' may not have gone through
3925 * the datapath at all, it may have been generated by the upper layer
3926 * (OpenFlow packet-out, BFD frame, ...). */
3927 if (!dp_packet_rss_valid(execute->packet)) {
3928 dp_packet_set_rss_hash(execute->packet,
3929 flow_hash_5tuple(execute->flow, 0));
3930 }
3931
72c84bc2 3932 dp_packet_batch_init_packet(&pp, execute->packet);
9f17f104 3933 pp.do_not_steal = true;
66e4ad8a 3934 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
b010be17 3935 execute->actions, execute->actions_len);
c71ea3c4 3936 dp_netdev_pmd_flush_output_packets(pmd, true);
36d8de17 3937
65f13b50
AW
3938 if (pmd->core_id == NON_PMD_CORE_ID) {
3939 ovs_mutex_unlock(&dp->non_pmd_mutex);
e9985d6a 3940 dp_netdev_pmd_unref(pmd);
65f13b50 3941 }
8a4e3a85 3942
758c456d 3943 return 0;
72865317
BP
3944}
3945
1a0c894a 3946static void
57924fc9
SB
3947dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3948 enum dpif_offload_type offload_type OVS_UNUSED)
1a0c894a
BP
3949{
3950 size_t i;
3951
3952 for (i = 0; i < n_ops; i++) {
3953 struct dpif_op *op = ops[i];
3954
3955 switch (op->type) {
3956 case DPIF_OP_FLOW_PUT:
fa37affa 3957 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
1a0c894a
BP
3958 break;
3959
3960 case DPIF_OP_FLOW_DEL:
fa37affa 3961 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
1a0c894a
BP
3962 break;
3963
3964 case DPIF_OP_EXECUTE:
fa37affa 3965 op->error = dpif_netdev_execute(dpif, &op->execute);
1a0c894a 3966 break;
6fe09f8c
JS
3967
3968 case DPIF_OP_FLOW_GET:
fa37affa 3969 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
6fe09f8c 3970 break;
1a0c894a
BP
3971 }
3972 }
3973}
3974
5bf84282
NK
3975/* Enable or disable PMD auto load balancing. */
3976static void
3977set_pmd_auto_lb(struct dp_netdev *dp)
3978{
3979 unsigned int cnt = 0;
3980 struct dp_netdev_pmd_thread *pmd;
3981 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3982
3983 bool enable_alb = false;
3984 bool multi_rxq = false;
3985 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3986
3987 /* Ensure that there are at least 2 non-isolated PMDs and
3988 * one of them is polling more than one rxq. */
3989 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3990 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3991 continue;
3992 }
3993
3994 if (hmap_count(&pmd->poll_list) > 1) {
3995 multi_rxq = true;
3996 }
3997 if (cnt && multi_rxq) {
3998 enable_alb = true;
3999 break;
4000 }
4001 cnt++;
4002 }
4003
4004 /* Enable auto LB if it is requested and cycle based assignment is true. */
4005 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
4006 pmd_alb->auto_lb_requested;
4007
4008 if (pmd_alb->is_enabled != enable_alb) {
4009 pmd_alb->is_enabled = enable_alb;
4010 if (pmd_alb->is_enabled) {
4011 VLOG_INFO("PMD auto load balance is enabled "
4012 "(with rebalance interval:%"PRIu64" msec)",
4013 pmd_alb->rebalance_intvl);
4014 } else {
4015 pmd_alb->rebalance_poll_timer = 0;
4016 VLOG_INFO("PMD auto load balance is disabled");
4017 }
4018 }
4019
4020}
4021
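/* Usage note (a restatement of the checks above, not new behavior): PMD auto
 * load balancing stays enabled only while all of the following hold:
 *
 *   - other_config:pmd-auto-lb=true was requested,
 *   - other_config:pmd-rxq-assign is "cycles" (the default),
 *   - there are at least two non-isolated PMD threads, and
 *   - at least one of them polls more than one rx queue.
 */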
d4f6865c
DDP
4022/* Applies datapath configuration from the database. Some of the changes are
4023 * actually applied in dpif_netdev_run(). */
f2eee189 4024static int
d4f6865c 4025dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
f2eee189
AW
4026{
4027 struct dp_netdev *dp = get_dp_netdev(dpif);
d4f6865c 4028 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
e77c97b9
KT
4029 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4030 "cycles");
4c30b246
CL
4031 unsigned long long insert_prob =
4032 smap_get_ullong(other_config, "emc-insert-inv-prob",
4033 DEFAULT_EM_FLOW_INSERT_INV_PROB);
4034 uint32_t insert_min, cur_min;
c71ea3c4 4035 uint32_t tx_flush_interval, cur_tx_flush_interval;
5bf84282 4036 uint64_t rebalance_intvl;
c71ea3c4
IM
4037
4038 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4039 DEFAULT_TX_FLUSH_INTERVAL);
4040 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4041 if (tx_flush_interval != cur_tx_flush_interval) {
4042 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4043 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4044 tx_flush_interval);
4045 }
f2eee189 4046
a6a426d6
IM
4047 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4048 free(dp->pmd_cmask);
4049 dp->pmd_cmask = nullable_xstrdup(cmask);
4050 dp_netdev_request_reconfigure(dp);
f2eee189
AW
4051 }
4052
4c30b246
CL
4053 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4054 if (insert_prob <= UINT32_MAX) {
4055 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4056 } else {
4057 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4058 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4059 }
4060
4061 if (insert_min != cur_min) {
4062 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4063 if (insert_min == 0) {
2fbadeb6 4064 VLOG_INFO("EMC insertion probability changed to zero");
4c30b246
CL
4065 } else {
4066 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4067 insert_prob, (100 / (float)insert_prob));
4068 }
4069 }
4070
79f36875
JS
4071 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4072 bool cur_perf_enabled;
4073 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4074 if (perf_enabled != cur_perf_enabled) {
4075 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4076 if (perf_enabled) {
4077 VLOG_INFO("PMD performance metrics collection enabled");
4078 } else {
4079 VLOG_INFO("PMD performance metrics collection disabled");
4080 }
4081 }
4082
60d8ccae
YW
4083 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4084 bool cur_smc;
4085 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4086 if (smc_enable != cur_smc) {
4087 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4088 if (smc_enable) {
4089 VLOG_INFO("SMC cache is enabled");
4090 } else {
4091 VLOG_INFO("SMC cache is disabled");
4092 }
4093 }
e77c97b9
KT
4094
4095 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
4096 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
4097 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
4098 "Defaulting to 'cycles'.");
4099 pmd_rxq_assign_cyc = true;
4100 pmd_rxq_assign = "cycles";
4101 }
4102 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
4103 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
4104 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4105 pmd_rxq_assign);
4106 dp_netdev_request_reconfigure(dp);
4107 }
5bf84282
NK
4108
4109 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4110 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
4111 false);
4112
4113 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
4114 ALB_PMD_REBALANCE_POLL_INTERVAL);
4115
4116 /* Input is in minutes; convert it to msec. */
4117 rebalance_intvl =
4118 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4119
4120 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4121 pmd_alb->rebalance_intvl = rebalance_intvl;
4122 }
4123
4124 set_pmd_auto_lb(dp);
f2eee189
AW
4125 return 0;
4126}
4127
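/* Illustrative aside, not part of this file: "emc-insert-inv-prob" above is an
 * inverse probability N stored as the threshold UINT32_MAX / N, so roughly one
 * in N uniformly distributed 32-bit values falls at or below it (N = 100 ->
 * ~1%; N = 0, i.e. a zero threshold, disables insertion).  A sketch of the
 * threshold test in isolation (hypothetical helper; assumes <stdbool.h> and
 * <stdint.h>): */
static bool
passes_insert_threshold(uint32_t sample, unsigned long long inv_prob)
{
    uint32_t min = inv_prob == 0 ? 0 : UINT32_MAX / inv_prob;

    return min != 0 && sample <= min;
}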
3eb67853
IM
4128/* Parses affinity list and returns result in 'core_ids'. */
4129static int
4130parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4131{
4132 unsigned i;
4133 char *list, *copy, *key, *value;
4134 int error = 0;
4135
4136 for (i = 0; i < n_rxq; i++) {
51c37a56 4137 core_ids[i] = OVS_CORE_UNSPEC;
3eb67853
IM
4138 }
4139
4140 if (!affinity_list) {
4141 return 0;
4142 }
4143
4144 list = copy = xstrdup(affinity_list);
4145
4146 while (ofputil_parse_key_value(&list, &key, &value)) {
4147 int rxq_id, core_id;
4148
4149 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4150 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4151 error = EINVAL;
4152 break;
4153 }
4154
4155 if (rxq_id < n_rxq) {
4156 core_ids[rxq_id] = core_id;
4157 }
4158 }
4159
4160 free(copy);
4161 return error;
4162}
4163
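/* Usage note (illustrative, not new behavior): the affinity list parsed above
 * is a comma-separated set of <rxq-id>:<core-id> pairs, e.g. for a port with
 * three rx queues:
 *
 *     unsigned core_ids[3];
 *     parse_affinity_list("0:3,1:7,2:7", core_ids, 3);
 *     // core_ids becomes {3, 7, 7}; unlisted queues stay OVS_CORE_UNSPEC.
 */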
4164/* Parses 'affinity_list' and applies configuration if it is valid. */
4165static int
4166dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4167 const char *affinity_list)
4168{
4169 unsigned *core_ids, i;
4170 int error = 0;
4171
4172 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4173 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4174 error = EINVAL;
4175 goto exit;
4176 }
4177
4178 for (i = 0; i < port->n_rxq; i++) {
4179 port->rxqs[i].core_id = core_ids[i];
4180 }
4181
4182exit:
4183 free(core_ids);
4184 return error;
4185}
4186
2fbadeb6
IM
4187/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4188 * of given PMD thread. */
4189static bool
4190dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4191 struct dp_netdev_port *port)
4192 OVS_EXCLUDED(pmd->port_mutex)
4193{
4194 struct rxq_poll *poll;
4195 bool found = false;
4196
4197 ovs_mutex_lock(&pmd->port_mutex);
4198 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4199 if (port == poll->rxq->port) {
4200 found = true;
4201 break;
4202 }
4203 }
4204 ovs_mutex_unlock(&pmd->port_mutex);
4205 return found;
4206}
4207
4208/* Updates port configuration from the database. The changes are actually
4209 * applied in dpif_netdev_run(). */
3eb67853
IM
4210static int
4211dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4212 const struct smap *cfg)
4213{
4214 struct dp_netdev *dp = get_dp_netdev(dpif);
4215 struct dp_netdev_port *port;
4216 int error = 0;
4217 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
2fbadeb6 4218 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
3eb67853
IM
4219
4220 ovs_mutex_lock(&dp->port_mutex);
4221 error = get_port_by_number(dp, port_no, &port);
2fbadeb6
IM
4222 if (error) {
4223 goto unlock;
4224 }
4225
4226 if (emc_enabled != port->emc_enabled) {
4227 struct dp_netdev_pmd_thread *pmd;
4228 struct ds ds = DS_EMPTY_INITIALIZER;
4229 uint32_t cur_min, insert_prob;
4230
4231 port->emc_enabled = emc_enabled;
4232 /* Mark for reload all the threads that polls this port and request
4233 * for reconfiguration for the actual reloading of threads. */
4234 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4235 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4236 pmd->need_reload = true;
4237 }
4238 }
4239 dp_netdev_request_reconfigure(dp);
4240
4241 ds_put_format(&ds, "%s: EMC has been %s.",
4242 netdev_get_name(port->netdev),
4243 (emc_enabled) ? "enabled" : "disabled");
4244 if (emc_enabled) {
4245 ds_put_cstr(&ds, " Current insertion probability is ");
4246 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4247 if (!cur_min) {
4248 ds_put_cstr(&ds, "zero.");
4249 } else {
4250 insert_prob = UINT32_MAX / cur_min;
4251 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4252 insert_prob, 100 / (float) insert_prob);
4253 }
4254 }
4255 VLOG_INFO("%s", ds_cstr(&ds));
4256 ds_destroy(&ds);
4257 }
4258
4259 /* Checking for RXq affinity changes. */
4260 if (!netdev_is_pmd(port->netdev)
3eb67853
IM
4261 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4262 goto unlock;
4263 }
4264
4265 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4266 if (error) {
4267 goto unlock;
4268 }
4269 free(port->rxq_affinity_list);
4270 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4271
4272 dp_netdev_request_reconfigure(dp);
4273unlock:
4274 ovs_mutex_unlock(&dp->port_mutex);
4275 return error;
4276}
4277
5bf93d67
EJ
4278static int
4279dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4280 uint32_t queue_id, uint32_t *priority)
4281{
4282 *priority = queue_id;
4283 return 0;
4284}
4285
72865317 4286\f
9ff55ae2 4287/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
1401f6de 4288 * a copy of the 'size' bytes of the 'actions' input parameter. */
a84cb64a
BP
4289struct dp_netdev_actions *
4290dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4291{
4292 struct dp_netdev_actions *netdev_actions;
4293
9ff55ae2
DDP
4294 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4295 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
4296 netdev_actions->size = size;
4297
4298 return netdev_actions;
4299}
4300
a84cb64a 4301struct dp_netdev_actions *
61e7deb1 4302dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 4303{
61e7deb1 4304 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
4305}
4306
61e7deb1
BP
4307static void
4308dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 4309{
61e7deb1 4310 free(actions);
a84cb64a
BP
4311}
4312\f
a19896ab
JS
4313static void
4314dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4315 enum rxq_cycles_counter_type type,
4316 unsigned long long cycles)
a2ac666d 4317{
a19896ab 4318 atomic_store_relaxed(&rx->cycles[type], cycles);
a2ac666d
CL
4319}
4320
4809891b 4321static void
a19896ab 4322dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4809891b
KT
4323 enum rxq_cycles_counter_type type,
4324 unsigned long long cycles)
4325{
a19896ab 4326 non_atomic_ullong_add(&rx->cycles[type], cycles);
4809891b
KT
4327}
4328
4329static uint64_t
4330dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4331 enum rxq_cycles_counter_type type)
4332{
4333 unsigned long long processing_cycles;
4334 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4335 return processing_cycles;
4336}
4337
4338static void
4339dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4340 unsigned long long cycles)
4341{
4ee87ad3
BP
4342 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4343 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4809891b
KT
4344}
4345
655856ef
KT
4346static uint64_t
4347dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4348{
4349 unsigned long long processing_cycles;
4350 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4351 return processing_cycles;
4352}
4353
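/* Illustrative aside, not part of this file: the intrvl_cycles accessors above
 * treat 'cycles_intrvl' as a fixed-size ring indexed by an ever-increasing
 * counter modulo the ring size, so the newest sample overwrites the oldest and
 * the scheduler can later sum the ring for a recent-history total.  The write
 * pattern in isolation (hypothetical names; ring size chosen arbitrarily;
 * assumes <stdint.h>): */
enum { INTRVL_RING_SIZE = 6 };

static void
intrvl_ring_store(uint64_t ring[INTRVL_RING_SIZE], unsigned *next_idx,
                  uint64_t sample)
{
    ring[(*next_idx)++ % INTRVL_RING_SIZE] = sample;
}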
79f36875
JS
4354#if ATOMIC_ALWAYS_LOCK_FREE_8B
4355static inline bool
4356pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4357{
4358 bool pmd_perf_enabled;
4359 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4360 return pmd_perf_enabled;
4361}
4362#else
4363/* If stores and reads of 64-bit integers are not atomic, the full PMD
4364 * performance metrics are not available, as locked access to 64-bit
4365 * integers would be prohibitively expensive. */
4366static inline bool
4367pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4368{
4369 return false;
4370}
4371#endif
4372
c71ea3c4 4373static int
009e0033
IM
4374dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4375 struct tx_port *p)
4376{
58ed6df0 4377 int i;
009e0033 4378 int tx_qid;
cc4891f3 4379 int output_cnt;
009e0033 4380 bool dynamic_txqs;
58ed6df0
IM
4381 struct cycle_timer timer;
4382 uint64_t cycles;
c71ea3c4 4383 uint32_t tx_flush_interval;
58ed6df0
IM
4384
4385 cycle_timer_start(&pmd->perf_stats, &timer);
009e0033
IM
4386
4387 dynamic_txqs = p->port->dynamic_txqs;
4388 if (dynamic_txqs) {
4389 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4390 } else {
4391 tx_qid = pmd->static_tx_qid;
4392 }
4393
cc4891f3 4394 output_cnt = dp_packet_batch_size(&p->output_pkts);
58ed6df0 4395 ovs_assert(output_cnt > 0);
cc4891f3 4396
b30896c9 4397 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
009e0033 4398 dp_packet_batch_init(&p->output_pkts);
cc4891f3 4399
c71ea3c4
IM
4400 /* Update time of the next flush. */
4401 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4402 p->flush_time = pmd->ctx.now + tx_flush_interval;
4403
4404 ovs_assert(pmd->n_output_batches > 0);
4405 pmd->n_output_batches--;
4406
82a48ead
JS
4407 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4408 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
58ed6df0
IM
4409
4410 /* Distribute send cycles evenly among transmitted packets and assign to
4411 * their respective rx queues. */
4412 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4413 for (i = 0; i < output_cnt; i++) {
4414 if (p->output_pkts_rxqs[i]) {
4415 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4416 RXQ_CYCLES_PROC_CURR, cycles);
4417 }
4418 }
c71ea3c4
IM
4419
4420 return output_cnt;
009e0033
IM
4421}
4422
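/* Illustrative note, not part of this file: the cycle accounting above charges
 * the cost of one netdev_send() evenly to the rx queues that produced the
 * flushed packets, e.g. a 9000-cycle send of a 3-packet batch adds 3000
 * cycles to each packet's source rx queue. */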
c71ea3c4
IM
4423static int
4424dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4425 bool force)
009e0033
IM
4426{
4427 struct tx_port *p;
c71ea3c4
IM
4428 int output_cnt = 0;
4429
4430 if (!pmd->n_output_batches) {
4431 return 0;
4432 }
009e0033
IM
4433
4434 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
c71ea3c4
IM
4435 if (!dp_packet_batch_is_empty(&p->output_pkts)
4436 && (force || pmd->ctx.now >= p->flush_time)) {
4437 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
009e0033
IM
4438 }
4439 }
c71ea3c4 4440 return output_cnt;
009e0033
IM
4441}
4442
a2ac666d 4443static int
65f13b50 4444dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
a19896ab 4445 struct dp_netdev_rxq *rxq,
947dc567 4446 odp_port_t port_no)
e4cfed38 4447{
79f36875 4448 struct pmd_perf_stats *s = &pmd->perf_stats;
1895cc8d 4449 struct dp_packet_batch batch;
a19896ab 4450 struct cycle_timer timer;
1895cc8d 4451 int error;
79f36875
JS
4452 int batch_cnt = 0;
4453 int rem_qlen = 0, *qlen_p = NULL;
58ed6df0 4454 uint64_t cycles;
e4cfed38 4455
a19896ab
JS
4456 /* Measure duration for polling and processing rx burst. */
4457 cycle_timer_start(&pmd->perf_stats, &timer);
58ed6df0
IM
4458
4459 pmd->ctx.last_rxq = rxq;
1895cc8d 4460 dp_packet_batch_init(&batch);
58ed6df0 4461
79f36875
JS
4462 /* Fetch the rx queue length only for vhostuser ports. */
4463 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4464 qlen_p = &rem_qlen;
4465 }
4466
4467 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
e4cfed38 4468 if (!error) {
a19896ab 4469 /* At least one packet received. */
3c33f0ff 4470 *recirc_depth_get() = 0;
009e0033 4471 pmd_thread_ctx_time_update(pmd);
940ac2ce 4472 batch_cnt = dp_packet_batch_size(&batch);
79f36875
JS
4473 if (pmd_perf_metrics_enabled(pmd)) {
4474 /* Update batch histogram. */
4475 s->current.batches++;
4476 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4477 /* Update the maximum vhost rx queue fill level. */
4478 if (rxq->is_vhost && rem_qlen >= 0) {
4479 uint32_t qfill = batch_cnt + rem_qlen;
4480 if (qfill > s->current.max_vhost_qfill) {
4481 s->current.max_vhost_qfill = qfill;
4482 }
4483 }
4484 }
4485 /* Process packet batch. */
947dc567 4486 dp_netdev_input(pmd, &batch, port_no);
e4cfed38 4487
a19896ab 4488 /* Assign processing cycles to rx queue. */
58ed6df0 4489 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
a19896ab
JS
4490 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4491
79f36875 4492 dp_netdev_pmd_flush_output_packets(pmd, false);
a19896ab
JS
4493 } else {
4494 /* Discard cycles. */
4495 cycle_timer_stop(&pmd->perf_stats, &timer);
4496 if (error != EAGAIN && error != EOPNOTSUPP) {
4497 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4498
4499 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4500 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4501 }
e4cfed38 4502 }
a2ac666d 4503
58ed6df0
IM
4504 pmd->ctx.last_rxq = NULL;
4505
79f36875 4506 return batch_cnt;
e4cfed38
PS
4507}
4508
e32971b8
DDP
4509static struct tx_port *
4510tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4511{
4512 struct tx_port *tx;
4513
4514 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4515 if (tx->port->port_no == port_no) {
4516 return tx;
4517 }
4518 }
4519
4520 return NULL;
4521}
4522
9df65060
VDA
4523static struct tx_bond *
4524tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4525{
4526 uint32_t hash = hash_bond_id(bond_id);
4527 struct tx_bond *tx;
4528
4529 CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4530 if (tx->bond_id == bond_id) {
4531 return tx;
4532 }
4533 }
4534 return NULL;
4535}
4536
dc36593c
DDP
4537static int
4538port_reconfigure(struct dp_netdev_port *port)
4539{
4540 struct netdev *netdev = port->netdev;
dc36593c
DDP
4541 int i, err;
4542
dc36593c
DDP
4543 /* Closes the existing 'rxq's. */
4544 for (i = 0; i < port->n_rxq; i++) {
947dc567
DDP
4545 netdev_rxq_close(port->rxqs[i].rx);
4546 port->rxqs[i].rx = NULL;
dc36593c 4547 }
4809891b 4548 unsigned last_nrxq = port->n_rxq;
dc36593c
DDP
4549 port->n_rxq = 0;
4550
050c60bf 4551 /* Allows 'netdev' to apply the pending configuration changes. */
606f6650 4552 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
e32971b8
DDP
4553 err = netdev_reconfigure(netdev);
4554 if (err && (err != EOPNOTSUPP)) {
4555 VLOG_ERR("Failed to set interface %s new configuration",
4556 netdev_get_name(netdev));
4557 return err;
4558 }
dc36593c 4559 }
050c60bf 4560 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3eb67853
IM
4561 port->rxqs = xrealloc(port->rxqs,
4562 sizeof *port->rxqs * netdev_n_rxq(netdev));
324c8374
IM
4563 /* Realloc 'used' counters for tx queues. */
4564 free(port->txq_used);
4565 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4566
dc36593c 4567 for (i = 0; i < netdev_n_rxq(netdev); i++) {
38259bd7
BP
4568 bool new_queue = i >= last_nrxq;
4569 if (new_queue) {
4570 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4571 }
4572
947dc567 4573 port->rxqs[i].port = port;
79f36875 4574 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
38259bd7 4575
947dc567 4576 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
dc36593c
DDP
4577 if (err) {
4578 return err;
4579 }
4580 port->n_rxq++;
4581 }
4582
3eb67853
IM
4583 /* Parse affinity list to apply configuration for new queues. */
4584 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4585
606f6650
EC
4586 /* If reconfiguration was successful, mark it as such so we can use it. */
4587 port->need_reconfigure = false;
4588
dc36593c
DDP
4589 return 0;
4590}
4591
e32971b8
DDP
4592struct rr_numa_list {
4593 struct hmap numas; /* Contains 'struct rr_numa' */
4594};
4595
4596struct rr_numa {
4597 struct hmap_node node;
4598
4599 int numa_id;
4600
4601 /* Non isolated pmds on numa node 'numa_id' */
4602 struct dp_netdev_pmd_thread **pmds;
4603 int n_pmds;
4604
4605 int cur_index;
79da1e41 4606 bool idx_inc;
e32971b8
DDP
4607};
4608
4609static struct rr_numa *
4610rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4611{
4612 struct rr_numa *numa;
4613
4614 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4615 if (numa->numa_id == numa_id) {
4616 return numa;
4617 }
4618 }
4619
4620 return NULL;
4621}
4622
c37813fd
BM
4623/* Returns the next node in numa list following 'numa' in round-robin fashion.
4624 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4625 * Returns NULL if 'rr' numa list is empty. */
4626static struct rr_numa *
4627rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4628{
4629 struct hmap_node *node = NULL;
4630
4631 if (numa) {
4632 node = hmap_next(&rr->numas, &numa->node);
4633 }
4634 if (!node) {
4635 node = hmap_first(&rr->numas);
4636 }
4637
4638 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4639}
4640
e32971b8
DDP
4641static void
4642rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4643{
4644 struct dp_netdev_pmd_thread *pmd;
4645 struct rr_numa *numa;
4646
4647 hmap_init(&rr->numas);
4648
4649 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4650 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4651 continue;
4652 }
4653
4654 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4655 if (!numa) {
4656 numa = xzalloc(sizeof *numa);
4657 numa->numa_id = pmd->numa_id;
4658 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4659 }
4660 numa->n_pmds++;
4661 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4662 numa->pmds[numa->n_pmds - 1] = pmd;
79da1e41
KT
4663 /* At least one pmd, so initialise 'cur_index' and 'idx_inc'. */
4664 numa->cur_index = 0;
4665 numa->idx_inc = true;
e32971b8
DDP
4666 }
4667}
4668
e77c97b9
KT
4669/*
4670 * Returns the next pmd from the numa node.
4671 *
4672 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4673 * either an up or down walk, switching between up/down when the first or last
4674 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4675 *
4676 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4677 * core reached. e.g. 1,2,3,1,2,3,1,2...
4678 */
e32971b8 4679static struct dp_netdev_pmd_thread *
e77c97b9 4680rr_numa_get_pmd(struct rr_numa *numa, bool updown)
e32971b8 4681{
79da1e41
KT
4682 int numa_idx = numa->cur_index;
4683
4684 if (numa->idx_inc == true) {
4685 /* Incrementing through list of pmds. */
4686 if (numa->cur_index == numa->n_pmds-1) {
4687 /* Reached the last pmd. */
e77c97b9
KT
4688 if (updown) {
4689 numa->idx_inc = false;
4690 } else {
4691 numa->cur_index = 0;
4692 }
79da1e41
KT
4693 } else {
4694 numa->cur_index++;
4695 }
4696 } else {
4697 /* Decrementing through list of pmds. */
4698 if (numa->cur_index == 0) {
4699 /* Reached the first pmd. */
4700 numa->idx_inc = true;
4701 } else {
4702 numa->cur_index--;
4703 }
4704 }
4705 return numa->pmds[numa_idx];
e32971b8
DDP
4706}
4707
4708static void
4709rr_numa_list_destroy(struct rr_numa_list *rr)
4710{
4711 struct rr_numa *numa;
4712
4713 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4714 free(numa->pmds);
4715 free(numa);
4716 }
4717 hmap_destroy(&rr->numas);
4718}
4719
655856ef
KT
4720/* Sort Rx Queues by the processing cycles they are consuming. */
4721static int
cc131ac1 4722compare_rxq_cycles(const void *a, const void *b)
655856ef 4723{
28080276
KT
4724 struct dp_netdev_rxq *qa;
4725 struct dp_netdev_rxq *qb;
8368866e 4726 uint64_t cycles_qa, cycles_qb;
655856ef
KT
4727
4728 qa = *(struct dp_netdev_rxq **) a;
4729 qb = *(struct dp_netdev_rxq **) b;
4730
8368866e
KT
4731 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4732 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
655856ef 4733
8368866e
KT
4734 if (cycles_qa != cycles_qb) {
4735 return (cycles_qa < cycles_qb) ? 1 : -1;
a130f1a8
KT
4736 } else {
4737 /* Cycles are the same so tiebreak on port/queue id.
4738 * Tiebreaking (as opposed to return 0) ensures consistent
4739 * sort results across multiple OS's. */
f0aa3801
BP
4740 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4741 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4742 if (port_qa != port_qb) {
4743 return port_qa > port_qb ? 1 : -1;
a130f1a8
KT
4744 } else {
4745 return netdev_rxq_get_queue_id(qa->rx)
4746 - netdev_rxq_get_queue_id(qb->rx);
4747 }
655856ef 4748 }
655856ef
KT
4749}
4750
e32971b8
DDP
4751/* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4752 * queues and mark the pmds as isolated. Otherwise, assign non-isolated
4753 * pmds to unpinned queues.
4754 *
4755 * The function doesn't touch the pmd threads; it just stores the assignment
4756 * in the 'pmd' member of each rxq. */
4757static void
4758rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4759{
4760 struct dp_netdev_port *port;
4761 struct rr_numa_list rr;
c37813fd 4762 struct rr_numa *non_local_numa = NULL;
655856ef 4763 struct dp_netdev_rxq ** rxqs = NULL;
97bf8f47 4764 int n_rxqs = 0;
655856ef
KT
4765 struct rr_numa *numa = NULL;
4766 int numa_id;
e77c97b9 4767 bool assign_cyc = dp->pmd_rxq_assign_cyc;
e32971b8
DDP
4768
4769 HMAP_FOR_EACH (port, node, &dp->ports) {
e32971b8
DDP
4770 if (!netdev_is_pmd(port->netdev)) {
4771 continue;
4772 }
4773
e32971b8
DDP
4774 for (int qid = 0; qid < port->n_rxq; qid++) {
4775 struct dp_netdev_rxq *q = &port->rxqs[qid];
4776
4777 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4778 struct dp_netdev_pmd_thread *pmd;
4779
4780 pmd = dp_netdev_get_pmd(dp, q->core_id);
4781 if (!pmd) {
4782 VLOG_WARN("There is no PMD thread on core %d. Queue "
4783 "%d on port \'%s\' will not be polled.",
4784 q->core_id, qid, netdev_get_name(port->netdev));
4785 } else {
4786 q->pmd = pmd;
4787 pmd->isolated = true;
433a3fa5
GM
4788 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4789 "rx queue %d.", pmd->core_id, pmd->numa_id,
4790 netdev_rxq_get_name(q->rx),
4791 netdev_rxq_get_queue_id(q->rx));
e32971b8
DDP
4792 dp_netdev_pmd_unref(pmd);
4793 }
4794 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
8368866e
KT
4795 uint64_t cycle_hist = 0;
4796
655856ef
KT
4797 if (n_rxqs == 0) {
4798 rxqs = xmalloc(sizeof *rxqs);
e32971b8 4799 } else {
655856ef 4800 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
e32971b8 4801 }
8368866e 4802
e77c97b9
KT
4803 if (assign_cyc) {
4804 /* Sum the queue intervals and store the cycle history. */
4805 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4806 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4807 }
4808 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4809 cycle_hist);
4810 }
655856ef
KT
4811 /* Store the queue. */
4812 rxqs[n_rxqs++] = q;
e32971b8
DDP
4813 }
4814 }
4815 }
4816
e77c97b9 4817 if (n_rxqs > 1 && assign_cyc) {
655856ef
KT
4818 /* Sort the queues in order of the processing cycles
4819 * they consumed during their last pmd interval. */
cc131ac1 4820 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
655856ef
KT
4821 }
4822
4823 rr_numa_list_populate(dp, &rr);
4824 /* Assign the sorted queues to pmds in round robin. */
97bf8f47 4825 for (int i = 0; i < n_rxqs; i++) {
655856ef
KT
4826 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4827 numa = rr_numa_list_lookup(&rr, numa_id);
4828 if (!numa) {
4829 /* There are no pmds on the queue's local NUMA node.
4830 * Round robin on the NUMA nodes that do have pmds. */
4831 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4832 if (!non_local_numa) {
4833 VLOG_ERR("There is no available (non-isolated) pmd "
4834 "thread for port \'%s\' queue %d. This queue "
4835 "will not be polled. Is pmd-cpu-mask set to "
4836 "zero? Or are all PMDs isolated to other "
4837 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4838 netdev_rxq_get_queue_id(rxqs[i]->rx));
4839 continue;
4840 }
e77c97b9 4841 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
655856ef
KT
4842 VLOG_WARN("There's no available (non-isolated) pmd thread "
4843 "on numa node %d. Queue %d on port \'%s\' will "
4844 "be assigned to the pmd on core %d "
4845 "(numa node %d). Expect reduced performance.",
4846 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4847 netdev_rxq_get_name(rxqs[i]->rx),
4848 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4849 } else {
e77c97b9
KT
4850 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4851 if (assign_cyc) {
4852 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4853 "rx queue %d "
4854 "(measured processing cycles %"PRIu64").",
4855 rxqs[i]->pmd->core_id, numa_id,
4856 netdev_rxq_get_name(rxqs[i]->rx),
4857 netdev_rxq_get_queue_id(rxqs[i]->rx),
4858 dp_netdev_rxq_get_cycles(rxqs[i],
4859 RXQ_CYCLES_PROC_HIST));
4860 } else {
4861 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4862 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4863 netdev_rxq_get_name(rxqs[i]->rx),
4864 netdev_rxq_get_queue_id(rxqs[i]->rx));
4865 }
655856ef
KT
4866 }
4867 }
4868
e32971b8 4869 rr_numa_list_destroy(&rr);
655856ef 4870 free(rxqs);
e32971b8
DDP
4871}
4872
140dd699
IM
4873static void
4874reload_affected_pmds(struct dp_netdev *dp)
4875{
4876 struct dp_netdev_pmd_thread *pmd;
4877
4878 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4879 if (pmd->need_reload) {
241bad15 4880 flow_mark_flush(pmd);
140dd699 4881 dp_netdev_reload_pmd__(pmd);
8f077b31
DM
4882 }
4883 }
4884
4885 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4886 if (pmd->need_reload) {
4887 if (pmd->core_id != NON_PMD_CORE_ID) {
4888 bool reload;
4889
4890 do {
4891 atomic_read_explicit(&pmd->reload, &reload,
4892 memory_order_acquire);
4893 } while (reload);
4894 }
140dd699
IM
4895 pmd->need_reload = false;
4896 }
4897 }
4898}
4899
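/* Illustrative aside, not part of this file: the wait loop in
 * reload_affected_pmds() above keeps re-reading 'reload' with acquire
 * semantics until the PMD thread clears it.  In general, an acquire load
 * pairs with a release store by whoever clears the flag, making that thread's
 * prior writes visible here.  The same handshake in miniature with C11
 * atomics (hypothetical names): */
#include <stdatomic.h>
#include <stdbool.h>

static void
wait_until_cleared(atomic_bool *flag)
{
    bool set;

    do {
        set = atomic_load_explicit(flag, memory_order_acquire);
    } while (set);
}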
6e3c6fa4
DDP
4900static void
4901reconfigure_pmd_threads(struct dp_netdev *dp)
4902 OVS_REQUIRES(dp->port_mutex)
4903{
e32971b8
DDP
4904 struct dp_netdev_pmd_thread *pmd;
4905 struct ovs_numa_dump *pmd_cores;
140dd699
IM
4906 struct ovs_numa_info_core *core;
4907 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4908 struct hmapx_node *node;
e32971b8 4909 bool changed = false;
140dd699 4910 bool need_to_adjust_static_tx_qids = false;
e32971b8
DDP
4911
4912 /* The pmd threads should be started only if there's a pmd port in the
4913 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4914 * NR_PMD_THREADS per numa node. */
4915 if (!has_pmd_port(dp)) {
4916 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4917 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4918 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4919 } else {
4920 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4921 }
4922
140dd699
IM
4923 /* We need to adjust 'static_tx_qid's only if we're reducing the number of
4924 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4925 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4926 /* Adjustment is required to keep 'static_tx_qid's sequential and
4927 * avoid possible issues, for example, imbalanced tx queue usage
4928 * and unnecessary locking caused by remapping on netdev level. */
4929 need_to_adjust_static_tx_qids = true;
4930 }
4931
4932 /* Check for unwanted pmd threads */
4933 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4934 if (pmd->core_id == NON_PMD_CORE_ID) {
4935 continue;
4936 }
4937 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4938 pmd->core_id)) {
4939 hmapx_add(&to_delete, pmd);
4940 } else if (need_to_adjust_static_tx_qids) {
e2cafa86 4941 atomic_store_relaxed(&pmd->reload_tx_qid, true);
140dd699 4942 pmd->need_reload = true;
e32971b8
DDP
4943 }
4944 }
4945
140dd699
IM
4946 HMAPX_FOR_EACH (node, &to_delete) {
4947 pmd = (struct dp_netdev_pmd_thread *) node->data;
4948 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4949 pmd->numa_id, pmd->core_id);
4950 dp_netdev_del_pmd(dp, pmd);
4951 }
4952 changed = !hmapx_is_empty(&to_delete);
4953 hmapx_destroy(&to_delete);
e32971b8 4954
140dd699
IM
4955 if (need_to_adjust_static_tx_qids) {
4956 /* 'static_tx_qid's are not sequential now.
4957 * Reload remaining threads to fix this. */
4958 reload_affected_pmds(dp);
4959 }
e32971b8 4960
140dd699
IM
4961 /* Check for required new pmd threads */
4962 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4963 pmd = dp_netdev_get_pmd(dp, core->core_id);
4964 if (!pmd) {
8afbf2fa
IM
4965 struct ds name = DS_EMPTY_INITIALIZER;
4966
140dd699 4967 pmd = xzalloc(sizeof *pmd);
e32971b8 4968 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
8afbf2fa
IM
4969
4970 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
4971 pmd->thread = ovs_thread_create(ds_cstr(&name),
4972 pmd_thread_main, pmd);
4973 ds_destroy(&name);
4974
140dd699
IM
4975 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4976 pmd->numa_id, pmd->core_id);
4977 changed = true;
4978 } else {
4979 dp_netdev_pmd_unref(pmd);
e32971b8 4980 }
140dd699
IM
4981 }
4982
4983 if (changed) {
4984 struct ovs_numa_info_numa *numa;
e32971b8
DDP
4985
4986 /* Log the number of pmd threads per numa node. */
4987 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
140dd699 4988 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
e32971b8
DDP
4989 numa->n_cores, numa->numa_id);
4990 }
4991 }
4992
4993 ovs_numa_dump_destroy(pmd_cores);
4994}
4995
e32971b8
DDP
4996static void
4997pmd_remove_stale_ports(struct dp_netdev *dp,
4998 struct dp_netdev_pmd_thread *pmd)
4999 OVS_EXCLUDED(pmd->port_mutex)
5000 OVS_REQUIRES(dp->port_mutex)
5001{
5002 struct rxq_poll *poll, *poll_next;
5003 struct tx_port *tx, *tx_next;
5004
5005 ovs_mutex_lock(&pmd->port_mutex);
5006 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5007 struct dp_netdev_port *port = poll->rxq->port;
5008
5009 if (port->need_reconfigure
5010 || !hmap_contains(&dp->ports, &port->node)) {
5011 dp_netdev_del_rxq_from_pmd(pmd, poll);
5012 }
5013 }
5014 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
5015 struct dp_netdev_port *port = tx->port;
5016
5017 if (port->need_reconfigure
5018 || !hmap_contains(&dp->ports, &port->node)) {
5019 dp_netdev_del_port_tx_from_pmd(pmd, tx);
5020 }
5021 }
5022 ovs_mutex_unlock(&pmd->port_mutex);
5023}
5024
5025/* Must be called each time a port is added/removed or the cmask changes.
5026 * This creates and destroys pmd threads, reconfigures ports, opens their
5027 * rxqs and assigns all rxqs/txqs to pmd threads. */
5028static void
5029reconfigure_datapath(struct dp_netdev *dp)
5030 OVS_REQUIRES(dp->port_mutex)
5031{
6d9fead1 5032 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
e32971b8
DDP
5033 struct dp_netdev_pmd_thread *pmd;
5034 struct dp_netdev_port *port;
5035 int wanted_txqs;
6e3c6fa4 5036
a6a426d6
IM
5037 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5038
e32971b8
DDP
5039 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5040 * on the system and the user configuration. */
5041 reconfigure_pmd_threads(dp);
6e3c6fa4 5042
e32971b8 5043 wanted_txqs = cmap_count(&dp->poll_threads);
324c8374 5044
e32971b8
DDP
5045 /* The number of pmd threads might have changed, or a port can be new:
5046 * adjust the txqs. */
5047 HMAP_FOR_EACH (port, node, &dp->ports) {
5048 netdev_set_tx_multiq(port->netdev, wanted_txqs);
324c8374
IM
5049 }
5050
e32971b8
DDP
5051 /* Step 2: Remove from the pmd threads ports that have been removed or
5052 * need reconfiguration. */
5053
5054 /* Check for all the ports that need reconfiguration. We cache this in
85a4f238 5055 * 'port->need_reconfigure', because netdev_is_reconf_required() can
f598f462
IM
5056 * change at any time.
5057 * Also mark for reconfiguration all ports which will likely change their
5058 * 'dynamic_txqs' parameter. It's required to stop using them before
5059 * changing this setting and it's simpler to mark ports here and allow
5060 * 'pmd_remove_stale_ports' to remove them from threads. There will be
5061 * no actual reconfiguration in 'port_reconfigure' because it's
5062 * unnecessary. */
e32971b8 5063 HMAP_FOR_EACH (port, node, &dp->ports) {
f598f462
IM
5064 if (netdev_is_reconf_required(port->netdev)
5065 || (port->dynamic_txqs
5066 != (netdev_n_txq(port->netdev) < wanted_txqs))) {
e32971b8
DDP
5067 port->need_reconfigure = true;
5068 }
5069 }
5070
5071 /* Remove from the pmd threads all the ports that have been deleted or
5072 * need reconfiguration. */
5073 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5074 pmd_remove_stale_ports(dp, pmd);
5075 }
5076
5077 /* Reload affected pmd threads. We must wait for the pmd threads before
5078 * reconfiguring the ports, because a port cannot be reconfigured while
5079 * it's being used. */
5080 reload_affected_pmds(dp);
5081
5082 /* Step 3: Reconfigure ports. */
5083
5084 /* We only reconfigure the ports that we determined above, because they're
5085 * not being used by any pmd thread at the moment. If a port fails to
5086 * reconfigure we remove it from the datapath. */
f582b6df
BP
5087 struct dp_netdev_port *next_port;
5088 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
dc36593c 5089 int err;
6e3c6fa4 5090
e32971b8
DDP
5091 if (!port->need_reconfigure) {
5092 continue;
5093 }
5094
dc36593c
DDP
5095 err = port_reconfigure(port);
5096 if (err) {
5097 hmap_remove(&dp->ports, &port->node);
5098 seq_change(dp->port_seq);
5099 port_destroy(port);
324c8374 5100 } else {
e32971b8 5101 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
6e3c6fa4
DDP
5102 }
5103 }
e32971b8
DDP
5104
5105 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
5106 * for now, we just update the 'pmd' pointer in each rxq to point to the
5107 * wanted thread according to the scheduling policy. */
5108
5109 /* Reset all the pmd threads to non isolated. */
5110 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5111 pmd->isolated = false;
5112 }
5113
5114 /* Reset all the queues to unassigned */
5115 HMAP_FOR_EACH (port, node, &dp->ports) {
5116 for (int i = 0; i < port->n_rxq; i++) {
5117 port->rxqs[i].pmd = NULL;
5118 }
5119 }
5120
5121 /* Add pinned queues and mark pmd threads isolated. */
5122 rxq_scheduling(dp, true);
5123
5124 /* Add non-pinned queues. */
5125 rxq_scheduling(dp, false);
5126
5127 /* Step 5: Remove queues not compliant with new scheduling. */
6d9fead1
DM
5128
5129 /* Count all the threads that will have at least one queue to poll. */
5130 HMAP_FOR_EACH (port, node, &dp->ports) {
5131 for (int qid = 0; qid < port->n_rxq; qid++) {
5132 struct dp_netdev_rxq *q = &port->rxqs[qid];
5133
5134 if (q->pmd) {
5135 hmapx_add(&busy_threads, q->pmd);
5136 }
5137 }
5138 }
5139
e32971b8
DDP
5140 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5141 struct rxq_poll *poll, *poll_next;
5142
5143 ovs_mutex_lock(&pmd->port_mutex);
5144 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5145 if (poll->rxq->pmd != pmd) {
5146 dp_netdev_del_rxq_from_pmd(pmd, poll);
6d9fead1
DM
5147
5148 /* This pmd might sleep after this step if it has no rxq
5149 * remaining. Tell it to busy wait for new assignment if it
5150 * has at least one scheduled queue. */
5151 if (hmap_count(&pmd->poll_list) == 0 &&
5152 hmapx_contains(&busy_threads, pmd)) {
5153 atomic_store_relaxed(&pmd->wait_for_reload, true);
5154 }
e32971b8
DDP
5155 }
5156 }
5157 ovs_mutex_unlock(&pmd->port_mutex);
5158 }
5159
6d9fead1
DM
5160 hmapx_destroy(&busy_threads);
5161
e32971b8
DDP
5162 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5163 * the old queues before readding them, otherwise a queue can be polled by
5164 * two threads at the same time. */
5165 reload_affected_pmds(dp);
5166
5167 /* Step 6: Add queues from scheduling, if they're not there already. */
5168 HMAP_FOR_EACH (port, node, &dp->ports) {
5169 if (!netdev_is_pmd(port->netdev)) {
5170 continue;
5171 }
5172
5173 for (int qid = 0; qid < port->n_rxq; qid++) {
5174 struct dp_netdev_rxq *q = &port->rxqs[qid];
5175
5176 if (q->pmd) {
5177 ovs_mutex_lock(&q->pmd->port_mutex);
5178 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5179 ovs_mutex_unlock(&q->pmd->port_mutex);
5180 }
5181 }
5182 }
5183
9df65060
VDA
5184 /* Add every port and bond to the tx port and bond caches of
5185 * every pmd thread, if it's not there already and if this pmd
5186 * has at least one rxq to poll.
5187 */
e32971b8
DDP
5188 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5189 ovs_mutex_lock(&pmd->port_mutex);
5190 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
9df65060
VDA
5191 struct tx_bond *bond;
5192
e32971b8
DDP
5193 HMAP_FOR_EACH (port, node, &dp->ports) {
5194 dp_netdev_add_port_tx_to_pmd(pmd, port);
5195 }
9df65060
VDA
5196
5197 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5198 dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5199 }
e32971b8
DDP
5200 }
5201 ovs_mutex_unlock(&pmd->port_mutex);
5202 }
5203
5204 /* Reload affected pmd threads. */
5205 reload_affected_pmds(dp);
5bf84282
NK
5206
5207 /* Check if PMD Auto LB is to be enabled */
5208 set_pmd_auto_lb(dp);
6e3c6fa4
DDP
5209}
5210
050c60bf
DDP
5211/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5212static bool
5213ports_require_restart(const struct dp_netdev *dp)
5214 OVS_REQUIRES(dp->port_mutex)
5215{
5216 struct dp_netdev_port *port;
5217
5218 HMAP_FOR_EACH (port, node, &dp->ports) {
5219 if (netdev_is_reconf_required(port->netdev)) {
5220 return true;
5221 }
5222 }
5223
5224 return false;
5225}
5226
5bf84282
NK
 5227/* Calculates variance in the values stored in array 'a'. 'n' is the number
 5228 * of elements in the array to be considered for calculating variance.
 5229 * Usage example: data array 'a' contains the processing load of each pmd and
 5230 * 'n' is the number of PMDs. It returns the variance in the processing load
 5231 * of the PMDs. */
5232static uint64_t
5233variance(uint64_t a[], int n)
5234{
5235 /* Compute mean (average of elements). */
5236 uint64_t sum = 0;
5237 uint64_t mean = 0;
5238 uint64_t sqDiff = 0;
5239
5240 if (!n) {
5241 return 0;
5242 }
5243
5244 for (int i = 0; i < n; i++) {
5245 sum += a[i];
5246 }
5247
5248 if (sum) {
5249 mean = sum / n;
5250
5251 /* Compute sum squared differences with mean. */
5252 for (int i = 0; i < n; i++) {
5253 sqDiff += (a[i] - mean)*(a[i] - mean);
5254 }
5255 }
5256 return (sqDiff ? (sqDiff / n) : 0);
5257}
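/* Illustrative example ('variance_example' is a hypothetical helper, not an
 * OVS function): for three PMDs loaded at 90%, 50% and 10% the mean is 50,
 * the squared differences are 1600, 0 and 1600, and variance() returns
 * (1600 + 0 + 1600) / 3 = 1066 with integer division.  A perfectly balanced
 * set such as {50, 50, 50} yields 0. */
static void
variance_example(void)
{
    uint64_t balanced[] = { 50, 50, 50 };
    uint64_t skewed[] = { 90, 50, 10 };

    ovs_assert(variance(balanced, 3) == 0);
    ovs_assert(variance(skewed, 3) == 1066);
}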
5258
5259
 5260/* Returns the variance in the PMDs' usage as part of a dry run of rxq
 5261 * assignment to PMDs. */
5262static bool
5263get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5264 uint32_t num_pmds, uint64_t *predicted_variance)
5265 OVS_REQUIRES(dp->port_mutex)
5266{
5267 struct dp_netdev_port *port;
5268 struct dp_netdev_pmd_thread *pmd;
5269 struct dp_netdev_rxq **rxqs = NULL;
5270 struct rr_numa *numa = NULL;
5271 struct rr_numa_list rr;
5272 int n_rxqs = 0;
5273 bool ret = false;
5274 uint64_t *pmd_usage;
5275
5276 if (!predicted_variance) {
5277 return ret;
5278 }
5279
5280 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5281
5282 HMAP_FOR_EACH (port, node, &dp->ports) {
5283 if (!netdev_is_pmd(port->netdev)) {
5284 continue;
5285 }
5286
5287 for (int qid = 0; qid < port->n_rxq; qid++) {
5288 struct dp_netdev_rxq *q = &port->rxqs[qid];
5289 uint64_t cycle_hist = 0;
5290
5291 if (q->pmd->isolated) {
5292 continue;
5293 }
5294
5295 if (n_rxqs == 0) {
5296 rxqs = xmalloc(sizeof *rxqs);
5297 } else {
5298 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5299 }
5300
5301 /* Sum the queue intervals and store the cycle history. */
5302 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5303 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5304 }
5305 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5306 cycle_hist);
5307 /* Store the queue. */
5308 rxqs[n_rxqs++] = q;
5309 }
5310 }
5311 if (n_rxqs > 1) {
5312 /* Sort the queues in order of the processing cycles
5313 * they consumed during their last pmd interval. */
5314 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5315 }
5316 rr_numa_list_populate(dp, &rr);
5317
5318 for (int i = 0; i < n_rxqs; i++) {
5319 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5320 numa = rr_numa_list_lookup(&rr, numa_id);
5321 if (!numa) {
5322 /* Abort if cross NUMA polling. */
5323 VLOG_DBG("PMD auto lb dry run."
5324 " Aborting due to cross-numa polling.");
5325 goto cleanup;
5326 }
5327
5328 pmd = rr_numa_get_pmd(numa, true);
5329 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5330 "to be assigned port \'%s\' rx queue %d "
5331 "(measured processing cycles %"PRIu64").",
5332 pmd->core_id, numa_id,
5333 netdev_rxq_get_name(rxqs[i]->rx),
5334 netdev_rxq_get_queue_id(rxqs[i]->rx),
5335 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5336
5337 for (int id = 0; id < num_pmds; id++) {
5338 if (pmd->core_id == core_list[id]) {
5339 /* Add the processing cycles of rxq to pmd polling it. */
5340 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5341 RXQ_CYCLES_PROC_HIST);
5342 }
5343 }
5344 }
5345
5346 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5347 uint64_t total_cycles = 0;
5348
5349 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5350 continue;
5351 }
5352
5353 /* Get the total pmd cycles for an interval. */
5354 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5355 /* Estimate the cycles to cover all intervals. */
5356 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5357 for (int id = 0; id < num_pmds; id++) {
5358 if (pmd->core_id == core_list[id]) {
5359 if (pmd_usage[id]) {
5360 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5361 }
5362 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5363 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5364 }
5365 }
5366 }
5367 *predicted_variance = variance(pmd_usage, num_pmds);
5368 ret = true;
5369
5370cleanup:
5371 rr_numa_list_destroy(&rr);
5372 free(rxqs);
5373 free(pmd_usage);
5374 return ret;
5375}
5376
 5377/* Does a dry run of rxq assignment to PMDs and returns true if it gives
 5378 * a better distribution of load across the PMDs. */
5379static bool
5380pmd_rebalance_dry_run(struct dp_netdev *dp)
5381 OVS_REQUIRES(dp->port_mutex)
5382{
5383 struct dp_netdev_pmd_thread *pmd;
5384 uint64_t *curr_pmd_usage;
5385
5386 uint64_t curr_variance;
5387 uint64_t new_variance;
5388 uint64_t improvement = 0;
5389 uint32_t num_pmds;
5390 uint32_t *pmd_corelist;
eef85380 5391 struct rxq_poll *poll;
5bf84282
NK
5392 bool ret;
5393
5394 num_pmds = cmap_count(&dp->poll_threads);
5395
5396 if (num_pmds > 1) {
5397 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5398 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5399 } else {
5400 return false;
5401 }
5402
5403 num_pmds = 0;
5404 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5405 uint64_t total_cycles = 0;
5406 uint64_t total_proc = 0;
5407
5408 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5409 continue;
5410 }
5411
5412 /* Get the total pmd cycles for an interval. */
5413 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5414 /* Estimate the cycles to cover all intervals. */
5415 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5416
eef85380
IM
5417 ovs_mutex_lock(&pmd->port_mutex);
5418 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5bf84282 5419 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
eef85380 5420 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5bf84282 5421 }
5bf84282 5422 }
eef85380
IM
5423 ovs_mutex_unlock(&pmd->port_mutex);
5424
5bf84282
NK
5425 if (total_proc) {
5426 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5427 }
5428
5429 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5430 pmd->core_id, curr_pmd_usage[num_pmds]);
5431
5432 if (atomic_count_get(&pmd->pmd_overloaded)) {
5433 atomic_count_set(&pmd->pmd_overloaded, 0);
5434 }
5435
5436 pmd_corelist[num_pmds] = pmd->core_id;
5437 num_pmds++;
5438 }
5439
5440 curr_variance = variance(curr_pmd_usage, num_pmds);
5441 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5442
5443 if (ret) {
5444 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5445 " Predicted PMD variance: %"PRIu64"",
5446 curr_variance, new_variance);
5447
5448 if (new_variance < curr_variance) {
5449 improvement =
5450 ((curr_variance - new_variance) * 100) / curr_variance;
5451 }
5452 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5453 ret = false;
5454 }
5455 }
5456
5457 free(curr_pmd_usage);
5458 free(pmd_corelist);
5459 return ret;
5460}
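/* Illustrative sketch ('alb_improvement_accepted' is a hypothetical helper,
 * not an OVS function): the acceptance test above requires the dry run to
 * cut the variance by at least ALB_ACCEPTABLE_IMPROVEMENT percent.  For
 * example, 400 -> 280 is a 30% improvement and triggers a rebalance, while
 * 400 -> 320 is only 20% and is ignored. */
static bool
alb_improvement_accepted(uint64_t curr_variance, uint64_t new_variance)
{
    uint64_t improvement = 0;

    if (new_variance < curr_variance) {
        improvement = ((curr_variance - new_variance) * 100) / curr_variance;
    }
    return improvement >= ALB_ACCEPTABLE_IMPROVEMENT;
}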
5461
5462
a36de779
PS
5463/* Return true if needs to revalidate datapath flows. */
5464static bool
e4cfed38
PS
5465dpif_netdev_run(struct dpif *dpif)
5466{
5467 struct dp_netdev_port *port;
5468 struct dp_netdev *dp = get_dp_netdev(dpif);
546e57d4 5469 struct dp_netdev_pmd_thread *non_pmd;
a36de779 5470 uint64_t new_tnl_seq;
c71ea3c4 5471 bool need_to_flush = true;
5bf84282
NK
5472 bool pmd_rebalance = false;
5473 long long int now = time_msec();
5474 struct dp_netdev_pmd_thread *pmd;
e4cfed38 5475
e9985d6a 5476 ovs_mutex_lock(&dp->port_mutex);
546e57d4
DDP
5477 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5478 if (non_pmd) {
5479 ovs_mutex_lock(&dp->non_pmd_mutex);
5480 HMAP_FOR_EACH (port, node, &dp->ports) {
5481 if (!netdev_is_pmd(port->netdev)) {
5482 int i;
55c955bd 5483
2fbadeb6
IM
5484 if (port->emc_enabled) {
5485 atomic_read_relaxed(&dp->emc_insert_min,
5486 &non_pmd->ctx.emc_insert_min);
5487 } else {
5488 non_pmd->ctx.emc_insert_min = 0;
5489 }
5490
546e57d4 5491 for (i = 0; i < port->n_rxq; i++) {
35c91567
DM
5492
5493 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5494 continue;
5495 }
5496
c71ea3c4
IM
5497 if (dp_netdev_process_rxq_port(non_pmd,
5498 &port->rxqs[i],
5499 port->port_no)) {
5500 need_to_flush = false;
5501 }
546e57d4 5502 }
55c955bd 5503 }
e4cfed38 5504 }
c71ea3c4
IM
5505 if (need_to_flush) {
5506 /* We didn't receive anything in the process loop.
5507 * Check if we need to send something.
 5508 * There were no time updates on the current iteration. */
5509 pmd_thread_ctx_time_update(non_pmd);
5510 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5511 }
5512
b010be17 5513 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
546e57d4 5514 ovs_mutex_unlock(&dp->non_pmd_mutex);
6e3c6fa4 5515
546e57d4
DDP
5516 dp_netdev_pmd_unref(non_pmd);
5517 }
1c1e46ed 5518
5bf84282
NK
5519 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5520 if (pmd_alb->is_enabled) {
5521 if (!pmd_alb->rebalance_poll_timer) {
5522 pmd_alb->rebalance_poll_timer = now;
5523 } else if ((pmd_alb->rebalance_poll_timer +
5524 pmd_alb->rebalance_intvl) < now) {
5525 pmd_alb->rebalance_poll_timer = now;
5526 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5527 if (atomic_count_get(&pmd->pmd_overloaded) >=
5528 PMD_RXQ_INTERVAL_MAX) {
5529 pmd_rebalance = true;
5530 break;
5531 }
5532 }
5533
5534 if (pmd_rebalance &&
5535 !dp_netdev_is_reconf_required(dp) &&
5536 !ports_require_restart(dp) &&
5537 pmd_rebalance_dry_run(dp)) {
5538 VLOG_INFO("PMD auto lb dry run."
5539 " requesting datapath reconfigure.");
5540 dp_netdev_request_reconfigure(dp);
5541 }
5542 }
5543 }
5544
a6a426d6 5545 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
e32971b8 5546 reconfigure_datapath(dp);
6e3c6fa4
DDP
5547 }
5548 ovs_mutex_unlock(&dp->port_mutex);
5549
53902038 5550 tnl_neigh_cache_run();
7f9b8504 5551 tnl_port_map_run();
a36de779
PS
5552 new_tnl_seq = seq_read(tnl_conf_seq);
5553
5554 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5555 dp->last_tnl_conf_seq = new_tnl_seq;
5556 return true;
5557 }
5558 return false;
e4cfed38
PS
5559}
5560
5561static void
5562dpif_netdev_wait(struct dpif *dpif)
5563{
5564 struct dp_netdev_port *port;
5565 struct dp_netdev *dp = get_dp_netdev(dpif);
5566
59e6d833 5567 ovs_mutex_lock(&dp_netdev_mutex);
e9985d6a
DDP
5568 ovs_mutex_lock(&dp->port_mutex);
5569 HMAP_FOR_EACH (port, node, &dp->ports) {
050c60bf 5570 netdev_wait_reconf_required(port->netdev);
55c955bd
PS
5571 if (!netdev_is_pmd(port->netdev)) {
5572 int i;
5573
490e82af 5574 for (i = 0; i < port->n_rxq; i++) {
947dc567 5575 netdev_rxq_wait(port->rxqs[i].rx);
55c955bd 5576 }
e4cfed38
PS
5577 }
5578 }
e9985d6a 5579 ovs_mutex_unlock(&dp->port_mutex);
59e6d833 5580 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 5581 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
5582}
5583
d0cca6c3
DDP
5584static void
5585pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5586{
5587 struct tx_port *tx_port_cached;
5588
c71ea3c4
IM
5589 /* Flush all the queued packets. */
5590 dp_netdev_pmd_flush_output_packets(pmd, true);
324c8374 5591 /* Free all used tx queue ids. */
b010be17 5592 dpif_netdev_xps_revalidate_pmd(pmd, true);
324c8374 5593
57eebbb4
DDP
5594 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5595 free(tx_port_cached);
5596 }
5597 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
d0cca6c3
DDP
5598 free(tx_port_cached);
5599 }
5600}
5601
5602/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
899363ed
BB
 5603 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
 5604 * device, and to 'pmd->send_port_cache' if the port has at least
 5605 * one txq. */
d0cca6c3
DDP
5606static void
5607pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5608 OVS_REQUIRES(pmd->port_mutex)
5609{
5610 struct tx_port *tx_port, *tx_port_cached;
5611
5612 pmd_free_cached_ports(pmd);
57eebbb4
DDP
5613 hmap_shrink(&pmd->send_port_cache);
5614 hmap_shrink(&pmd->tnl_port_cache);
d0cca6c3
DDP
5615
5616 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
57eebbb4
DDP
5617 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5618 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5619 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5620 hash_port_no(tx_port_cached->port->port_no));
5621 }
5622
5623 if (netdev_n_txq(tx_port->port->netdev)) {
5624 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5625 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5626 hash_port_no(tx_port_cached->port->port_no));
5627 }
d0cca6c3
DDP
5628 }
5629}
5630
140dd699
IM
5631static void
5632pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5633{
5634 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5635 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5636 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5637 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5638 }
5639 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5640
5641 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5642 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5643}
5644
5645static void
5646pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5647{
5648 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5649 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5650 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5651}
5652
e4cfed38 5653static int
d0cca6c3 5654pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
947dc567 5655 struct polled_queue **ppoll_list)
e4cfed38 5656{
947dc567 5657 struct polled_queue *poll_list = *ppoll_list;
ae7ad0a1
IM
5658 struct rxq_poll *poll;
5659 int i;
e4cfed38 5660
d0cca6c3 5661 ovs_mutex_lock(&pmd->port_mutex);
947dc567
DDP
5662 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5663 * sizeof *poll_list);
a1fdee13 5664
ae7ad0a1 5665 i = 0;
947dc567 5666 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
922b28d4 5667 poll_list[i].rxq = poll->rxq;
947dc567 5668 poll_list[i].port_no = poll->rxq->port->port_no;
2fbadeb6 5669 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
35c91567
DM
5670 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5671 poll_list[i].change_seq =
5672 netdev_get_change_seq(poll->rxq->port->netdev);
947dc567 5673 i++;
e4cfed38 5674 }
d0cca6c3
DDP
5675
5676 pmd_load_cached_ports(pmd);
5677
5678 ovs_mutex_unlock(&pmd->port_mutex);
e4cfed38 5679
e4cfed38 5680 *ppoll_list = poll_list;
d42f9307 5681 return i;
e4cfed38
PS
5682}
5683
6c3eee82 5684static void *
e4cfed38 5685pmd_thread_main(void *f_)
6c3eee82 5686{
65f13b50 5687 struct dp_netdev_pmd_thread *pmd = f_;
82a48ead 5688 struct pmd_perf_stats *s = &pmd->perf_stats;
e4cfed38 5689 unsigned int lc = 0;
947dc567 5690 struct polled_queue *poll_list;
6d9fead1 5691 bool wait_for_reload = false;
e2cafa86 5692 bool reload_tx_qid;
d42f9307 5693 bool exiting;
6d9fead1 5694 bool reload;
e4cfed38
PS
5695 int poll_cnt;
5696 int i;
a2ac666d 5697 int process_packets = 0;
6c3eee82 5698
e4cfed38
PS
5699 poll_list = NULL;
5700
65f13b50
AW
5701 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5702 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6930c7e0
DDP
5703 ovs_numa_thread_setaffinity_core(pmd->core_id);
5704 dpdk_set_lcore_id(pmd->core_id);
d0cca6c3 5705 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
60d8ccae 5706 dfc_cache_init(&pmd->flow_cache);
140dd699 5707 pmd_alloc_static_tx_qid(pmd);
ae7ad0a1 5708
e2cafa86 5709reload:
5bf84282
NK
5710 atomic_count_init(&pmd->pmd_overloaded, 0);
5711
7dd671f0
MK
5712 /* List port/core affinity */
5713 for (i = 0; i < poll_cnt; i++) {
ce179f11 5714 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
922b28d4
KT
5715 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5716 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
4f5d13e2
KT
5717 /* Reset the rxq current cycles counter. */
5718 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
7dd671f0
MK
5719 }
5720
2788a1b1 5721 if (!poll_cnt) {
6d9fead1
DM
5722 if (wait_for_reload) {
5723 /* Don't sleep, control thread will ask for a reload shortly. */
5724 do {
5725 atomic_read_explicit(&pmd->reload, &reload,
5726 memory_order_acquire);
5727 } while (!reload);
5728 } else {
5729 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5730 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5731 poll_block();
5732 }
2788a1b1 5733 }
2788a1b1
DDP
5734 }
5735
2a2c67b4
KT
5736 pmd->intrvl_tsc_prev = 0;
5737 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
a19896ab 5738 cycles_counter_update(s);
79f36875
JS
5739 /* Protect pmd stats from external clearing while polling. */
5740 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
e4cfed38 5741 for (;;) {
79f36875 5742 uint64_t rx_packets = 0, tx_packets = 0;
c71ea3c4 5743
a19896ab 5744 pmd_perf_start_iteration(s);
79f36875 5745
e4cfed38 5746 for (i = 0; i < poll_cnt; i++) {
2fbadeb6 5747
35c91567
DM
5748 if (!poll_list[i].rxq_enabled) {
5749 continue;
5750 }
5751
2fbadeb6
IM
5752 if (poll_list[i].emc_enabled) {
5753 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5754 &pmd->ctx.emc_insert_min);
5755 } else {
5756 pmd->ctx.emc_insert_min = 0;
5757 }
5758
a2ac666d 5759 process_packets =
a19896ab 5760 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
a2ac666d 5761 poll_list[i].port_no);
79f36875 5762 rx_packets += process_packets;
e4cfed38
PS
5763 }
5764
79f36875 5765 if (!rx_packets) {
c71ea3c4
IM
5766 /* We didn't receive anything in the process loop.
5767 * Check if we need to send something.
 5768 * There were no time updates on the current iteration. */
5769 pmd_thread_ctx_time_update(pmd);
79f36875 5770 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
c71ea3c4
IM
5771 }
5772
e4cfed38 5773 if (lc++ > 1024) {
e4cfed38 5774 lc = 0;
84067a4c 5775
fbe0962b 5776 coverage_try_clear();
4809891b 5777 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
9dede5cf 5778 if (!ovsrcu_try_quiesce()) {
60d8ccae 5779 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
9dede5cf 5780 }
84067a4c 5781
35c91567
DM
5782 for (i = 0; i < poll_cnt; i++) {
5783 uint64_t current_seq =
5784 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5785 if (poll_list[i].change_seq != current_seq) {
5786 poll_list[i].change_seq = current_seq;
5787 poll_list[i].rxq_enabled =
5788 netdev_rxq_enabled(poll_list[i].rxq->rx);
5789 }
5790 }
6c3eee82 5791 }
68a0625b
DM
5792
5793 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5794 if (OVS_UNLIKELY(reload)) {
5795 break;
5796 }
5797
79f36875
JS
5798 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5799 pmd_perf_metrics_enabled(pmd));
e4cfed38 5800 }
79f36875 5801 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6c3eee82 5802
d0cca6c3 5803 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6d9fead1 5804 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
e2cafa86 5805 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
299c8d61 5806 atomic_read_relaxed(&pmd->exit, &exiting);
d42f9307
DDP
5807 /* Signal here to make sure the pmd finishes
5808 * reloading the updated configuration. */
5809 dp_netdev_pmd_reload_done(pmd);
5810
e2cafa86
DM
5811 if (reload_tx_qid) {
5812 pmd_free_static_tx_qid(pmd);
5813 pmd_alloc_static_tx_qid(pmd);
5814 }
9bbf1c3d 5815
d42f9307 5816 if (!exiting) {
e4cfed38
PS
5817 goto reload;
5818 }
6c3eee82 5819
e2cafa86 5820 pmd_free_static_tx_qid(pmd);
60d8ccae 5821 dfc_cache_uninit(&pmd->flow_cache);
e4cfed38 5822 free(poll_list);
d0cca6c3 5823 pmd_free_cached_ports(pmd);
6c3eee82
BP
5824 return NULL;
5825}
5826
6b31e073
RW
5827static void
5828dp_netdev_disable_upcall(struct dp_netdev *dp)
5829 OVS_ACQUIRES(dp->upcall_rwlock)
5830{
5831 fat_rwlock_wrlock(&dp->upcall_rwlock);
5832}
5833
5dddf960
JR
5834\f
5835/* Meters */
5836static void
5837dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5838 struct ofputil_meter_features *features)
5839{
4b27db64
JR
5840 features->max_meters = MAX_METERS;
5841 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5842 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5843 features->max_bands = MAX_BANDS;
5dddf960
JR
5844 features->max_color = 0;
5845}
5846
425a7b9e
JP
5847/* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5848 * that exceed a band are dropped in-place. */
4b27db64
JR
5849static void
5850dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5851 uint32_t meter_id, long long int now)
5852{
5853 struct dp_meter *meter;
5854 struct dp_meter_band *band;
79c81260 5855 struct dp_packet *packet;
4b27db64
JR
5856 long long int long_delta_t; /* msec */
5857 uint32_t delta_t; /* msec */
5c41c31e 5858 uint32_t delta_in_us; /* usec */
79c81260 5859 const size_t cnt = dp_packet_batch_size(packets_);
4b27db64
JR
5860 uint32_t bytes, volume;
5861 int exceeded_band[NETDEV_MAX_BURST];
5862 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5863 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5864
5865 if (meter_id >= MAX_METERS) {
5866 return;
5867 }
5868
5869 meter_lock(dp, meter_id);
5870 meter = dp->meters[meter_id];
5871 if (!meter) {
5872 goto out;
5873 }
5874
5875 /* Initialize as negative values. */
5876 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5877 /* Initialize as zeroes. */
5878 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5879
5880 /* All packets will hit the meter at the same time. */
42697ca7 5881 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
4b27db64 5882
acc5df0e
IM
5883 if (long_delta_t < 0) {
 5884 /* This condition means that several threads are fighting for the
 5885 meter lock, and the one that received its packets a bit later wins.
 5886 Assume that all racing threads received their packets at the same
 5887 time to avoid overflow. */
5888 long_delta_t = 0;
5c41c31e
JL
5889 delta_in_us = 0;
5890 } else {
5891 delta_in_us = (now - meter->used) % 1000;
acc5df0e
IM
5892 }
5893
4b27db64
JR
5894 /* Make sure delta_t will not be too large, so that bucket will not
5895 * wrap around below. */
5896 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5897 ? meter->max_delta_t : (uint32_t)long_delta_t;
5898
5899 /* Update meter stats. */
5900 meter->used = now;
5901 meter->packet_count += cnt;
5902 bytes = 0;
e883448e 5903 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
79c81260 5904 bytes += dp_packet_size(packet);
4b27db64
JR
5905 }
5906 meter->byte_count += bytes;
5907
5908 /* Meters can operate in terms of packets per second or kilobits per
5909 * second. */
5910 if (meter->flags & OFPMF13_PKTPS) {
5911 /* Rate in packets/second, bucket 1/1000 packets. */
5912 /* msec * packets/sec = 1/1000 packets. */
5913 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5914 } else {
5915 /* Rate in kbps, bucket in bits. */
5916 /* msec * kbps = bits */
5917 volume = bytes * 8;
5918 }
5919
5920 /* Update all bands and find the one hit with the highest rate for each
5921 * packet (if any). */
5922 for (int m = 0; m < meter->n_bands; ++m) {
5923 band = &meter->bands[m];
5924
5925 /* Update band's bucket. */
5926 band->bucket += delta_t * band->up.rate;
5c41c31e 5927 band->bucket += delta_in_us * band->up.rate / 1000;
4b27db64
JR
5928 if (band->bucket > band->up.burst_size) {
5929 band->bucket = band->up.burst_size;
5930 }
5931
5932 /* Drain the bucket for all the packets, if possible. */
5933 if (band->bucket >= volume) {
5934 band->bucket -= volume;
5935 } else {
5936 int band_exceeded_pkt;
5937
5938 /* Band limit hit, must process packet-by-packet. */
5939 if (meter->flags & OFPMF13_PKTPS) {
5940 band_exceeded_pkt = band->bucket / 1000;
5941 band->bucket %= 1000; /* Remainder stays in bucket. */
5942
5943 /* Update the exceeding band for each exceeding packet.
5944 * (Only one band will be fired by a packet, and that
5945 * can be different for each packet.) */
e883448e 5946 for (int i = band_exceeded_pkt; i < cnt; i++) {
4b27db64
JR
5947 if (band->up.rate > exceeded_rate[i]) {
5948 exceeded_rate[i] = band->up.rate;
5949 exceeded_band[i] = m;
5950 }
5951 }
5952 } else {
5953 /* Packet sizes differ, must process one-by-one. */
5954 band_exceeded_pkt = cnt;
e883448e 5955 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
79c81260 5956 uint32_t bits = dp_packet_size(packet) * 8;
4b27db64
JR
5957
5958 if (band->bucket >= bits) {
5959 band->bucket -= bits;
5960 } else {
5961 if (i < band_exceeded_pkt) {
5962 band_exceeded_pkt = i;
5963 }
5964 /* Update the exceeding band for the exceeding packet.
5965 * (Only one band will be fired by a packet, and that
5966 * can be different for each packet.) */
5967 if (band->up.rate > exceeded_rate[i]) {
5968 exceeded_rate[i] = band->up.rate;
5969 exceeded_band[i] = m;
5970 }
5971 }
5972 }
5973 }
5974 /* Remember the first exceeding packet. */
5975 if (exceeded_pkt > band_exceeded_pkt) {
5976 exceeded_pkt = band_exceeded_pkt;
5977 }
5978 }
5979 }
5980
425a7b9e
JP
5981 /* Fire the highest rate band exceeded by each packet, and drop
5982 * packets if needed. */
4b27db64 5983 size_t j;
79c81260 5984 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
4b27db64
JR
5985 if (exceeded_band[j] >= 0) {
5986 /* Meter drop packet. */
5987 band = &meter->bands[exceeded_band[j]];
5988 band->packet_count += 1;
5989 band->byte_count += dp_packet_size(packet);
a13a0209 5990 COVERAGE_INC(datapath_drop_meter);
4b27db64
JR
5991 dp_packet_delete(packet);
5992 } else {
5993 /* Meter accepts packet. */
5994 dp_packet_batch_refill(packets_, packet, j);
5995 }
5996 }
5997 out:
5998 meter_unlock(dp, meter_id);
5999}
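/* Worked example (illustrative, ignoring the sub-millisecond remainder
 * handled via 'delta_in_us'): for a kbps band with rate 1000 and a
 * configured burst_size of 1000, dpif_netdev_meter_set() converts the burst
 * size to bucket units (kilobits => bits), so the bucket is capped at
 * 1,000,000 bits.  If 10 ms elapsed since the meter was last used, the band
 * gains delta_t * rate = 10 * 1000 = 10,000 bits; a batch of four 1500-byte
 * packets needs 4 * 1500 * 8 = 48,000 bits, so with a nearly empty bucket
 * the band limit is hit, the batch is processed packet by packet, and the
 * excess packets are dropped. */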
6000
6001/* Meter set/get/del processing is still single-threaded. */
5dddf960 6002static int
8101f03f 6003dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
4b27db64 6004 struct ofputil_meter_config *config)
5dddf960 6005{
4b27db64 6006 struct dp_netdev *dp = get_dp_netdev(dpif);
8101f03f 6007 uint32_t mid = meter_id.uint32;
4b27db64
JR
6008 struct dp_meter *meter;
6009 int i;
6010
4b27db64
JR
6011 if (mid >= MAX_METERS) {
6012 return EFBIG; /* Meter_id out of range. */
6013 }
6014
6508c845 6015 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
4b27db64
JR
6016 return EBADF; /* Unsupported flags set */
6017 }
2029ce9a 6018
6508c845
JP
6019 if (config->n_bands > MAX_BANDS) {
6020 return EINVAL;
2029ce9a
AVA
6021 }
6022
4b27db64
JR
6023 for (i = 0; i < config->n_bands; ++i) {
6024 switch (config->bands[i].type) {
6025 case OFPMBT13_DROP:
6026 break;
6027 default:
6028 return ENODEV; /* Unsupported band type */
6029 }
6030 }
6031
6032 /* Allocate meter */
6033 meter = xzalloc(sizeof *meter
6034 + config->n_bands * sizeof(struct dp_meter_band));
4b27db64 6035
d0db81ea
JP
6036 meter->flags = config->flags;
6037 meter->n_bands = config->n_bands;
6038 meter->max_delta_t = 0;
6039 meter->used = time_usec();
4b27db64 6040
d0db81ea
JP
6041 /* set up bands */
6042 for (i = 0; i < config->n_bands; ++i) {
6043 uint32_t band_max_delta_t;
4b27db64 6044
d0db81ea
JP
6045 /* Set burst size to a workable value if none specified. */
6046 if (config->bands[i].burst_size == 0) {
6047 config->bands[i].burst_size = config->bands[i].rate;
6048 }
6049
6050 meter->bands[i].up = config->bands[i];
6051 /* Convert burst size to the bucket units: */
6052 /* pkts => 1/1000 packets, kilobits => bits. */
6053 meter->bands[i].up.burst_size *= 1000;
6054 /* Initialize bucket to empty. */
6055 meter->bands[i].bucket = 0;
6056
6057 /* Figure out max delta_t that is enough to fill any bucket. */
6058 band_max_delta_t
6059 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
6060 if (band_max_delta_t > meter->max_delta_t) {
6061 meter->max_delta_t = band_max_delta_t;
6062 }
4b27db64 6063 }
d0db81ea
JP
6064
6065 meter_lock(dp, mid);
6066 dp_delete_meter(dp, mid); /* Free existing meter, if any */
6067 dp->meters[mid] = meter;
6068 meter_unlock(dp, mid);
6069
6070 return 0;
5dddf960
JR
6071}
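/* Worked example (illustrative): with a band rate of 1000 kbps and a
 * configured burst_size of 1000, the burst size becomes 1,000,000 bits after
 * the bucket-unit conversion above, so
 * band_max_delta_t = 1,000,000 / 1000 = 1000 ms.  'max_delta_t' therefore
 * caps the elapsed time used in dp_netdev_run_meter() at about one second,
 * which is enough to fill any bucket from empty. */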
6072
6073static int
4b27db64
JR
6074dpif_netdev_meter_get(const struct dpif *dpif,
6075 ofproto_meter_id meter_id_,
6076 struct ofputil_meter_stats *stats, uint16_t n_bands)
5dddf960 6077{
4b27db64 6078 const struct dp_netdev *dp = get_dp_netdev(dpif);
4b27db64 6079 uint32_t meter_id = meter_id_.uint32;
866bc756 6080 int retval = 0;
4b27db64
JR
6081
6082 if (meter_id >= MAX_METERS) {
6083 return EFBIG;
6084 }
866bc756
JP
6085
6086 meter_lock(dp, meter_id);
6087 const struct dp_meter *meter = dp->meters[meter_id];
4b27db64 6088 if (!meter) {
866bc756
JP
6089 retval = ENOENT;
6090 goto done;
4b27db64
JR
6091 }
6092 if (stats) {
6093 int i = 0;
6094
4b27db64
JR
6095 stats->packet_in_count = meter->packet_count;
6096 stats->byte_in_count = meter->byte_count;
6097
6098 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6099 stats->bands[i].packet_count = meter->bands[i].packet_count;
6100 stats->bands[i].byte_count = meter->bands[i].byte_count;
6101 }
4b27db64
JR
6102
6103 stats->n_bands = i;
6104 }
866bc756
JP
6105
6106done:
6107 meter_unlock(dp, meter_id);
6108 return retval;
5dddf960
JR
6109}
6110
6111static int
4b27db64
JR
6112dpif_netdev_meter_del(struct dpif *dpif,
6113 ofproto_meter_id meter_id_,
6114 struct ofputil_meter_stats *stats, uint16_t n_bands)
5dddf960 6115{
4b27db64
JR
6116 struct dp_netdev *dp = get_dp_netdev(dpif);
6117 int error;
6118
6119 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6120 if (!error) {
6121 uint32_t meter_id = meter_id_.uint32;
6122
6123 meter_lock(dp, meter_id);
6124 dp_delete_meter(dp, meter_id);
6125 meter_unlock(dp, meter_id);
4b27db64
JR
6126 }
6127 return error;
5dddf960
JR
6128}
6129
6130\f
6b31e073
RW
6131static void
6132dpif_netdev_disable_upcall(struct dpif *dpif)
6133 OVS_NO_THREAD_SAFETY_ANALYSIS
6134{
6135 struct dp_netdev *dp = get_dp_netdev(dpif);
6136 dp_netdev_disable_upcall(dp);
6137}
6138
6139static void
6140dp_netdev_enable_upcall(struct dp_netdev *dp)
6141 OVS_RELEASES(dp->upcall_rwlock)
6142{
6143 fat_rwlock_unlock(&dp->upcall_rwlock);
6144}
6145
6146static void
6147dpif_netdev_enable_upcall(struct dpif *dpif)
6148 OVS_NO_THREAD_SAFETY_ANALYSIS
6149{
6150 struct dp_netdev *dp = get_dp_netdev(dpif);
6151 dp_netdev_enable_upcall(dp);
6152}
6153
ae7ad0a1 6154static void
accf8626
AW
6155dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6156{
6d9fead1 6157 atomic_store_relaxed(&pmd->wait_for_reload, false);
e2cafa86 6158 atomic_store_relaxed(&pmd->reload_tx_qid, false);
2788a1b1 6159 pmd->last_reload_seq = seq_read(pmd->reload_seq);
8f077b31 6160 atomic_store_explicit(&pmd->reload, false, memory_order_release);
accf8626
AW
6161}
6162
1c1e46ed 6163/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
546e57d4
DDP
 6164 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
6165 * 'core_id' is NON_PMD_CORE_ID).
1c1e46ed
AW
6166 *
 6167 * The caller must unref the returned reference. */
65f13b50 6168static struct dp_netdev_pmd_thread *
bd5131ba 6169dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
6170{
6171 struct dp_netdev_pmd_thread *pmd;
55847abe 6172 const struct cmap_node *pnode;
65f13b50 6173
b19befae 6174 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
6175 if (!pnode) {
6176 return NULL;
6177 }
65f13b50
AW
6178 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6179
1c1e46ed 6180 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
6181}
6182
f2eee189
AW
6183/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6184static void
6185dp_netdev_set_nonpmd(struct dp_netdev *dp)
e9985d6a 6186 OVS_REQUIRES(dp->port_mutex)
f2eee189
AW
6187{
6188 struct dp_netdev_pmd_thread *non_pmd;
6189
6190 non_pmd = xzalloc(sizeof *non_pmd);
00873463 6191 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
f2eee189
AW
6192}
6193
1c1e46ed
AW
6194/* Caller must have valid pointer to 'pmd'. */
6195static bool
6196dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6197{
6198 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6199}
6200
6201static void
6202dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6203{
6204 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6205 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6206 }
6207}
6208
6209/* Given cmap position 'pos', tries to ref the next node. If try_ref()
6210 * fails, keeps checking for next node until reaching the end of cmap.
6211 *
 6212 * The caller must unref the returned reference. */
6213static struct dp_netdev_pmd_thread *
6214dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6215{
6216 struct dp_netdev_pmd_thread *next;
6217
6218 do {
6219 struct cmap_node *node;
6220
6221 node = cmap_next_position(&dp->poll_threads, pos);
6222 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6223 : NULL;
6224 } while (next && !dp_netdev_pmd_try_ref(next));
6225
6226 return next;
6227}
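/* Usage sketch (illustrative; 'example_walk_pmds' is a hypothetical helper,
 * not an OVS function): iterating every PMD thread with the helper above.
 * Each returned thread carries a reference that the caller must drop with
 * dp_netdev_pmd_unref() when done with it. */
static void
example_walk_pmds(struct dp_netdev *dp)
{
    struct cmap_position pos;
    struct dp_netdev_pmd_thread *pmd;

    memset(&pos, 0, sizeof pos);
    while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
        VLOG_DBG("visiting pmd thread on core %d", pmd->core_id);
        dp_netdev_pmd_unref(pmd);
    }
}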
6228
65f13b50 6229/* Configures the 'pmd' based on the input argument. */
6c3eee82 6230static void
65f13b50 6231dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
00873463 6232 unsigned core_id, int numa_id)
65f13b50
AW
6233{
6234 pmd->dp = dp;
65f13b50
AW
6235 pmd->core_id = core_id;
6236 pmd->numa_id = numa_id;
e32971b8 6237 pmd->need_reload = false;
c71ea3c4 6238 pmd->n_output_batches = 0;
1c1e46ed
AW
6239
6240 ovs_refcount_init(&pmd->ref_cnt);
299c8d61 6241 atomic_init(&pmd->exit, false);
2788a1b1
DDP
6242 pmd->reload_seq = seq_create();
6243 pmd->last_reload_seq = seq_read(pmd->reload_seq);
14e3e12a 6244 atomic_init(&pmd->reload, false);
1c1e46ed 6245 ovs_mutex_init(&pmd->flow_mutex);
d0cca6c3 6246 ovs_mutex_init(&pmd->port_mutex);
9df65060 6247 ovs_mutex_init(&pmd->bond_mutex);
1c1e46ed 6248 cmap_init(&pmd->flow_table);
3453b4d6 6249 cmap_init(&pmd->classifiers);
58ed6df0 6250 pmd->ctx.last_rxq = NULL;
b010be17
IM
6251 pmd_thread_ctx_time_update(pmd);
6252 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6253 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
947dc567 6254 hmap_init(&pmd->poll_list);
d0cca6c3 6255 hmap_init(&pmd->tx_ports);
57eebbb4
DDP
6256 hmap_init(&pmd->tnl_port_cache);
6257 hmap_init(&pmd->send_port_cache);
9df65060 6258 cmap_init(&pmd->tx_bonds);
65f13b50
AW
6259 /* init the 'flow_cache' since there is no
6260 * actual thread created for NON_PMD_CORE_ID. */
6261 if (core_id == NON_PMD_CORE_ID) {
60d8ccae 6262 dfc_cache_init(&pmd->flow_cache);
140dd699 6263 pmd_alloc_static_tx_qid(pmd);
65f13b50 6264 }
82a48ead 6265 pmd_perf_stats_init(&pmd->perf_stats);
65f13b50
AW
6266 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6267 hash_int(core_id, 0));
6268}
6269
1c1e46ed
AW
6270static void
6271dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6272{
3453b4d6
JS
6273 struct dpcls *cls;
6274
1c1e46ed 6275 dp_netdev_pmd_flow_flush(pmd);
57eebbb4
DDP
6276 hmap_destroy(&pmd->send_port_cache);
6277 hmap_destroy(&pmd->tnl_port_cache);
d0cca6c3 6278 hmap_destroy(&pmd->tx_ports);
9df65060 6279 cmap_destroy(&pmd->tx_bonds);
947dc567 6280 hmap_destroy(&pmd->poll_list);
3453b4d6
JS
6281 /* All flows (including their dpcls_rules) have been deleted already */
6282 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6283 dpcls_destroy(cls);
7c269972 6284 ovsrcu_postpone(free, cls);
3453b4d6
JS
6285 }
6286 cmap_destroy(&pmd->classifiers);
1c1e46ed
AW
6287 cmap_destroy(&pmd->flow_table);
6288 ovs_mutex_destroy(&pmd->flow_mutex);
2788a1b1 6289 seq_destroy(pmd->reload_seq);
d0cca6c3 6290 ovs_mutex_destroy(&pmd->port_mutex);
9df65060 6291 ovs_mutex_destroy(&pmd->bond_mutex);
1c1e46ed
AW
6292 free(pmd);
6293}
6294
6295/* Stops the pmd thread, removes it from the 'dp->poll_threads',
6296 * and unrefs the struct. */
65f13b50 6297static void
e4e74c3a 6298dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6c3eee82 6299{
d0cca6c3
DDP
6300 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6301 * but extra cleanup is necessary */
65f13b50 6302 if (pmd->core_id == NON_PMD_CORE_ID) {
febf4a7a 6303 ovs_mutex_lock(&dp->non_pmd_mutex);
60d8ccae 6304 dfc_cache_uninit(&pmd->flow_cache);
d0cca6c3 6305 pmd_free_cached_ports(pmd);
140dd699 6306 pmd_free_static_tx_qid(pmd);
febf4a7a 6307 ovs_mutex_unlock(&dp->non_pmd_mutex);
65f13b50 6308 } else {
299c8d61 6309 atomic_store_relaxed(&pmd->exit, true);
65f13b50 6310 dp_netdev_reload_pmd__(pmd);
65f13b50
AW
6311 xpthread_join(pmd->thread, NULL);
6312 }
ae7ad0a1 6313
d0cca6c3 6314 dp_netdev_pmd_clear_ports(pmd);
ae7ad0a1 6315
e4e74c3a
AW
6316 /* Purges the 'pmd''s flows after stopping the thread, but before
6317 * destroying the flows, so that the flow stats can be collected. */
6318 if (dp->dp_purge_cb) {
6319 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6320 }
65f13b50 6321 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 6322 dp_netdev_pmd_unref(pmd);
65f13b50 6323}
6c3eee82 6324
e32971b8
DDP
 6325/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non-pmd
6326 * thread. */
65f13b50 6327static void
e32971b8 6328dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
65f13b50
AW
6329{
6330 struct dp_netdev_pmd_thread *pmd;
d916785c
DDP
6331 struct dp_netdev_pmd_thread **pmd_list;
6332 size_t k = 0, n_pmds;
6333
e32971b8 6334 n_pmds = cmap_count(&dp->poll_threads);
d916785c 6335 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
65f13b50
AW
6336
6337 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
e32971b8 6338 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
b9584f21
DDP
6339 continue;
6340 }
d916785c
DDP
6341 /* We cannot call dp_netdev_del_pmd(), since it alters
6342 * 'dp->poll_threads' (while we're iterating it) and it
6343 * might quiesce. */
6344 ovs_assert(k < n_pmds);
6345 pmd_list[k++] = pmd;
6c3eee82 6346 }
d916785c
DDP
6347
6348 for (size_t i = 0; i < k; i++) {
6349 dp_netdev_del_pmd(dp, pmd_list[i]);
6350 }
6351 free(pmd_list);
65f13b50 6352}
6c3eee82 6353
d0cca6c3
DDP
6354/* Deletes all rx queues from pmd->poll_list and all the ports from
6355 * pmd->tx_ports. */
cc245ce8 6356static void
d0cca6c3 6357dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
6358{
6359 struct rxq_poll *poll;
d0cca6c3 6360 struct tx_port *port;
9df65060 6361 struct tx_bond *tx;
cc245ce8 6362
d0cca6c3 6363 ovs_mutex_lock(&pmd->port_mutex);
947dc567 6364 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
cc245ce8
IM
6365 free(poll);
6366 }
d0cca6c3
DDP
6367 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6368 free(port);
6369 }
6370 ovs_mutex_unlock(&pmd->port_mutex);
9df65060
VDA
6371
6372 ovs_mutex_lock(&pmd->bond_mutex);
6373 CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6374 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6375 ovsrcu_postpone(free, tx);
6376 }
6377 ovs_mutex_unlock(&pmd->bond_mutex);
cc245ce8
IM
6378}
6379
e32971b8 6380/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
b68872d8 6381static void
e32971b8
DDP
6382dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6383 struct dp_netdev_rxq *rxq)
6384 OVS_REQUIRES(pmd->port_mutex)
b68872d8 6385{
e32971b8
DDP
6386 int qid = netdev_rxq_get_queue_id(rxq->rx);
6387 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6388 struct rxq_poll *poll;
b68872d8 6389
e32971b8
DDP
6390 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6391 if (poll->rxq == rxq) {
6392 /* 'rxq' is already polled by this thread. Do nothing. */
6393 return;
d0cca6c3 6394 }
cc245ce8 6395 }
cc245ce8 6396
e32971b8
DDP
6397 poll = xmalloc(sizeof *poll);
6398 poll->rxq = rxq;
6399 hmap_insert(&pmd->poll_list, &poll->node, hash);
b68872d8 6400
e32971b8 6401 pmd->need_reload = true;
ae7ad0a1
IM
6402}
6403
e32971b8 6404/* Delete 'poll' from poll_list of PMD thread. */
ae7ad0a1 6405static void
e32971b8
DDP
6406dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6407 struct rxq_poll *poll)
d0cca6c3 6408 OVS_REQUIRES(pmd->port_mutex)
ae7ad0a1 6409{
e32971b8
DDP
6410 hmap_remove(&pmd->poll_list, &poll->node);
6411 free(poll);
ae7ad0a1 6412
e32971b8 6413 pmd->need_reload = true;
ae7ad0a1
IM
6414}
6415
d0cca6c3
DDP
6416/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6417 * changes to take effect. */
cc245ce8 6418static void
d0cca6c3
DDP
6419dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6420 struct dp_netdev_port *port)
e32971b8 6421 OVS_REQUIRES(pmd->port_mutex)
d0cca6c3 6422{
57eebbb4
DDP
6423 struct tx_port *tx;
6424
e32971b8
DDP
6425 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6426 if (tx) {
 6427 /* 'port' is already in this thread's tx cache. Do nothing. */
6428 return;
6429 }
6430
57eebbb4 6431 tx = xzalloc(sizeof *tx);
d0cca6c3 6432
324c8374
IM
6433 tx->port = port;
6434 tx->qid = -1;
c71ea3c4 6435 tx->flush_time = 0LL;
009e0033 6436 dp_packet_batch_init(&tx->output_pkts);
d0cca6c3 6437
324c8374 6438 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
e32971b8 6439 pmd->need_reload = true;
d0cca6c3
DDP
6440}
6441
e32971b8
DDP
6442/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6443 * changes to take effect. */
b9584f21 6444static void
e32971b8
DDP
6445dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6446 struct tx_port *tx)
6447 OVS_REQUIRES(pmd->port_mutex)
b9584f21 6448{
e32971b8
DDP
6449 hmap_remove(&pmd->tx_ports, &tx->node);
6450 free(tx);
6451 pmd->need_reload = true;
6c3eee82 6452}
9df65060
VDA
6453
6454/* Add bond to the tx bond cmap of 'pmd'. */
6455static void
6456dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6457 struct tx_bond *bond, bool update)
6458 OVS_EXCLUDED(pmd->bond_mutex)
6459{
6460 struct tx_bond *tx;
6461
6462 ovs_mutex_lock(&pmd->bond_mutex);
6463 tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6464
6465 if (tx && !update) {
6466 /* It's not an update and the entry already exists. Do nothing. */
6467 goto unlock;
6468 }
6469
6470 if (tx) {
6471 struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6472
6473 /* Copy the stats for each bucket. */
6474 for (int i = 0; i < BOND_BUCKETS; i++) {
6475 uint64_t n_packets, n_bytes;
6476
6477 atomic_read_relaxed(&tx->slave_buckets[i].n_packets, &n_packets);
6478 atomic_read_relaxed(&tx->slave_buckets[i].n_bytes, &n_bytes);
6479 atomic_init(&new_tx->slave_buckets[i].n_packets, n_packets);
6480 atomic_init(&new_tx->slave_buckets[i].n_bytes, n_bytes);
6481 }
6482 cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
6483 hash_bond_id(bond->bond_id));
6484 ovsrcu_postpone(free, tx);
6485 } else {
6486 tx = xmemdup(bond, sizeof *bond);
6487 cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
6488 }
6489unlock:
6490 ovs_mutex_unlock(&pmd->bond_mutex);
6491}
6492
6493/* Delete bond from the tx bond cmap of 'pmd'. */
6494static void
6495dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6496 uint32_t bond_id)
6497 OVS_EXCLUDED(pmd->bond_mutex)
6498{
6499 struct tx_bond *tx;
6500
6501 ovs_mutex_lock(&pmd->bond_mutex);
6502 tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
6503 if (tx) {
6504 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6505 ovsrcu_postpone(free, tx);
6506 }
6507 ovs_mutex_unlock(&pmd->bond_mutex);
6508}
6c3eee82 6509\f
b5cbbcf6
AZ
6510static char *
6511dpif_netdev_get_datapath_version(void)
6512{
6513 return xstrdup("<built-in>");
6514}
6515
72865317 6516static void
1c1e46ed 6517dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 6518 uint16_t tcp_flags, long long now)
72865317 6519{
eb94da30 6520 uint16_t flags;
72865317 6521
eb94da30
DDP
6522 atomic_store_relaxed(&netdev_flow->stats.used, now);
6523 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6524 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6525 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6526 flags |= tcp_flags;
6527 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
6528}
6529
623540e4 6530static int
e14deea0 6531dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 6532 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
6533 enum dpif_upcall_type type, const struct nlattr *userdata,
6534 struct ofpbuf *actions, struct ofpbuf *put_actions)
6535{
1c1e46ed 6536 struct dp_netdev *dp = pmd->dp;
623540e4 6537
623540e4
EJ
6538 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6539 return ENODEV;
6540 }
6541
6542 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6543 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 6544 char *packet_str;
cf62fa4c 6545 struct ofpbuf key;
5262eea1
JG
6546 struct odp_flow_key_parms odp_parms = {
6547 .flow = flow,
1dea1435 6548 .mask = wc ? &wc->masks : NULL,
2494ccd7 6549 .support = dp_netdev_support,
5262eea1 6550 };
623540e4
EJ
6551
6552 ofpbuf_init(&key, 0);
5262eea1 6553 odp_flow_key_from_flow(&odp_parms, &key);
2482b0b0 6554 packet_str = ofp_dp_packet_to_string(packet_);
623540e4 6555
6fd6ed71 6556 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
6557
6558 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6559 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6560
6561 ofpbuf_uninit(&key);
6562 free(packet_str);
6fd6ed71 6563
623540e4
EJ
6564 ds_destroy(&ds);
6565 }
6566
8d8ab6c2
JG
6567 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6568 actions, wc, put_actions, dp->upcall_aux);
623540e4
EJ
6569}
6570
bde94613
FA
6571static inline uint32_t
6572dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6573 const struct miniflow *mf)
6574{
6575 uint32_t hash;
6576
6577 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6578 hash = dp_packet_get_rss_hash(packet);
6579 } else {
6580 hash = miniflow_hash_5tuple(mf, 0);
6581 dp_packet_set_rss_hash(packet, hash);
6582 }
6583
6584 return hash;
6585}
6586
9bbf1c3d 6587static inline uint32_t
048963aa
DDP
6588dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6589 const struct miniflow *mf)
9bbf1c3d 6590{
048963aa 6591 uint32_t hash, recirc_depth;
9bbf1c3d 6592
f2f44f5d
DDP
6593 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6594 hash = dp_packet_get_rss_hash(packet);
6595 } else {
9bbf1c3d 6596 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 6597 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 6598 }
048963aa
DDP
6599
6600 /* The RSS hash must account for the recirculation depth to avoid
 6601 * collisions in the exact match cache. */
6602 recirc_depth = *recirc_depth_get_unsafe();
6603 if (OVS_UNLIKELY(recirc_depth)) {
6604 hash = hash_finish(hash, recirc_depth);
048963aa 6605 }
9bbf1c3d
DDP
6606 return hash;
6607}
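/* Illustrative note: a packet that is recirculated, e.g. after a tunnel pop
 * or ct() action, is looked up again with the same 5-tuple, so without the
 * hash_finish() above both passes would compute the same RSS hash and
 * compete for the same EMC bucket.  Mixing in the recirculation depth gives
 * the depth-0 and depth-1 lookups distinct hashes. */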
6608
f7ce4811 6609struct packet_batch_per_flow {
8cbf4f47
DDP
6610 unsigned int byte_count;
6611 uint16_t tcp_flags;
8cbf4f47
DDP
6612 struct dp_netdev_flow *flow;
6613
1895cc8d 6614 struct dp_packet_batch array;
8cbf4f47
DDP
6615};
6616
6617static inline void
f7ce4811
PS
6618packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6619 struct dp_packet *packet,
aab96ec4 6620 uint16_t tcp_flags)
8cbf4f47 6621{
cf62fa4c 6622 batch->byte_count += dp_packet_size(packet);
aab96ec4 6623 batch->tcp_flags |= tcp_flags;
940ac2ce 6624 dp_packet_batch_add(&batch->array, packet);
8cbf4f47
DDP
6625}
6626
6627static inline void
f7ce4811
PS
6628packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6629 struct dp_netdev_flow *flow)
8cbf4f47 6630{
11e5cf1f 6631 flow->batch = batch;
8cbf4f47 6632
11e5cf1f 6633 batch->flow = flow;
1895cc8d 6634 dp_packet_batch_init(&batch->array);
8cbf4f47
DDP
6635 batch->byte_count = 0;
6636 batch->tcp_flags = 0;
8cbf4f47
DDP
6637}
6638
6639static inline void
f7ce4811 6640packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
b010be17 6641 struct dp_netdev_pmd_thread *pmd)
8cbf4f47
DDP
6642{
6643 struct dp_netdev_actions *actions;
6644 struct dp_netdev_flow *flow = batch->flow;
6645
940ac2ce
PC
6646 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6647 batch->byte_count,
05f9e707 6648 batch->tcp_flags, pmd->ctx.now / 1000);
8cbf4f47
DDP
6649
6650 actions = dp_netdev_flow_get_actions(flow);
6651
66e4ad8a 6652 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
b010be17 6653 actions->actions, actions->size);
8cbf4f47
DDP
6654}
6655
8aaa125d 6656static inline void
e14deea0 6657dp_netdev_queue_batches(struct dp_packet *pkt,
aab96ec4 6658 struct dp_netdev_flow *flow, uint16_t tcp_flags,
47a45d86
KT
6659 struct packet_batch_per_flow *batches,
6660 size_t *n_batches)
9bbf1c3d 6661{
f7ce4811 6662 struct packet_batch_per_flow *batch = flow->batch;
11e5cf1f 6663
f9fe365b
AZ
6664 if (OVS_UNLIKELY(!batch)) {
6665 batch = &batches[(*n_batches)++];
f7ce4811 6666 packet_batch_per_flow_init(batch, flow);
9bbf1c3d
DDP
6667 }
6668
aab96ec4 6669 packet_batch_per_flow_update(batch, pkt, tcp_flags);
9bbf1c3d
DDP
6670}
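
As a rough, self-contained illustration of the batching scheme above (all names invented, not the OVS structures): a flow's 'batch' pointer doubles as the "already seen in this burst" marker, so a per-flow batch slot is claimed only the first time a flow appears, and later packets of the same flow are appended to that batch.

#include <stddef.h>
#include <stdio.h>

#define TOY_BURST 32

struct toy_batch;

struct toy_flow {
    struct toy_batch *batch;     /* NULL until the flow is seen this burst. */
};

struct toy_batch {
    struct toy_flow *flow;
    int pkts[TOY_BURST];
    size_t n;
};

/* Append 'pkt' to 'flow''s batch, claiming a new batch slot on first use. */
static void
toy_queue(int pkt, struct toy_flow *flow,
          struct toy_batch *batches, size_t *n_batches)
{
    struct toy_batch *b = flow->batch;

    if (!b) {
        b = &batches[(*n_batches)++];
        b->flow = flow;
        b->n = 0;
        flow->batch = b;
    }
    b->pkts[b->n++] = pkt;
}

int
main(void)
{
    struct toy_flow f1 = { NULL }, f2 = { NULL };
    struct toy_batch batches[TOY_BURST];
    size_t n_batches = 0;

    toy_queue(1, &f1, batches, &n_batches);
    toy_queue(2, &f2, batches, &n_batches);
    toy_queue(3, &f1, batches, &n_batches);   /* Joins f1's existing batch. */

    printf("%zu batches, f1 batch holds %zu packets\n",
           n_batches, f1.batch->n);
    return 0;
}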
6671
9b4f08cd
VDA
6672static inline void
6673packet_enqueue_to_flow_map(struct dp_packet *packet,
6674 struct dp_netdev_flow *flow,
6675 uint16_t tcp_flags,
6676 struct dp_packet_flow_map *flow_map,
6677 size_t index)
6678{
6679 struct dp_packet_flow_map *map = &flow_map[index];
6680 map->flow = flow;
6681 map->packet = packet;
6682 map->tcp_flags = tcp_flags;
6683}
6684
60d8ccae
YW
6685/* SMC lookup function for a batch of packets.
6686 * By batching the SMC lookups, we can use prefetching
6687 * to hide memory access latency.
6688 */
6689static inline void
6690smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6691 struct netdev_flow_key *keys,
6692 struct netdev_flow_key **missed_keys,
6693 struct dp_packet_batch *packets_,
9b4f08cd
VDA
6694 const int cnt,
6695 struct dp_packet_flow_map *flow_map,
6696 uint8_t *index_map)
60d8ccae
YW
6697{
6698 int i;
6699 struct dp_packet *packet;
6700 size_t n_smc_hit = 0, n_missed = 0;
6701 struct dfc_cache *cache = &pmd->flow_cache;
6702 struct smc_cache *smc_cache = &cache->smc_cache;
6703 const struct cmap_node *flow_node;
9b4f08cd
VDA
6704 int recv_idx;
6705 uint16_t tcp_flags;
60d8ccae
YW
6706
6707 /* Prefetch buckets for all packets */
6708 for (i = 0; i < cnt; i++) {
6709 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6710 }
6711
6712 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6713 struct dp_netdev_flow *flow = NULL;
6714 flow_node = smc_entry_get(pmd, keys[i].hash);
6715 bool hit = false;
9b4f08cd
VDA
6716 /* Get the original order of this packet in received batch. */
6717 recv_idx = index_map[i];
60d8ccae
YW
6718
6719 if (OVS_LIKELY(flow_node != NULL)) {
6720 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6721 /* Since we don't have a per-port megaflow to check the port
6722 * number, we need to verify that the input ports match. */
6723 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6724 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
9b4f08cd
VDA
6725 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6726
60d8ccae 6727 /* SMC hit and EMC miss: insert the flow into the EMC. */
60d8ccae
YW
6728 keys[i].len =
6729 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
18e08953 6730 emc_probabilistic_insert(pmd, &keys[i], flow);
9b4f08cd
VDA
6731 /* Add these packets into the flow map in the same order
6732 * as received.
6733 */
6734 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6735 flow_map, recv_idx);
60d8ccae
YW
6736 n_smc_hit++;
6737 hit = true;
6738 break;
6739 }
6740 }
6741 if (hit) {
6742 continue;
6743 }
6744 }
6745
6746 /* SMC missed. Group missed packets together at
6747 * the beginning of the 'packets' array. */
6748 dp_packet_batch_refill(packets_, packet, i);
9b4f08cd
VDA
6749
6750 /* Preserve the order of packet for flow batching. */
6751 index_map[n_missed] = recv_idx;
6752
60d8ccae
YW
6753 /* Put missed keys into the pointer array returned to the caller. */
6754 missed_keys[n_missed++] = &keys[i];
6755 }
6756
6757 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6758}
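
The two-pass structure above (prefetch every bucket first, then probe them) is a generic latency-hiding trick. A stripped-down, self-contained version is sketched below; it assumes a GCC/Clang-style __builtin_prefetch and uses an invented toy cache rather than the real SMC layout.

#include <stdint.h>
#include <stddef.h>

#define TOY_CACHE_SIZE  (1u << 16)
#define TOY_CACHE_MASK  (TOY_CACHE_SIZE - 1)

struct toy_entry {
    uint32_t sig;
    uint32_t value;
};

static struct toy_entry toy_cache[TOY_CACHE_SIZE];

/* Look up a burst of hashes: issue all prefetches before the first probe so
 * the memory accesses overlap instead of serializing the cache misses. */
static size_t
toy_lookup_batch(const uint32_t *hashes, uint32_t *values, size_t cnt)
{
    size_t hits = 0;

    for (size_t i = 0; i < cnt; i++) {
        __builtin_prefetch(&toy_cache[hashes[i] & TOY_CACHE_MASK]);
    }
    for (size_t i = 0; i < cnt; i++) {
        struct toy_entry *e = &toy_cache[hashes[i] & TOY_CACHE_MASK];
        if (e->sig == hashes[i]) {
            values[hits++] = e->value;
        }
    }
    return hits;
}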
6759
6760/* Try to process all ('cnt') of the 'packets' using only the datapath flow cache
a90ed026 6761 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8aaa125d 6762 * miniflow is copied into 'keys' and the packet pointer is moved to the
6763 * beginning of the 'packets' array. The pointers of missed keys are put in the
6764 * missed_keys pointer array for future processing.
6765 *
6766 * The function returns the number of packets that need to be processed in the
6767 * 'packets' array (they have been moved to the beginning of the vector).
a90ed026 6768 *
6769 * For performance reasons a caller may choose not to initialize the metadata
6770 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6771 * is not valid and must be initialized by this function using 'port_no'.
6772 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6773 * will be ignored.
6774 */
6775static inline size_t
60d8ccae 6776dfc_processing(struct dp_netdev_pmd_thread *pmd,
72c84bc2 6777 struct dp_packet_batch *packets_,
1895cc8d 6778 struct netdev_flow_key *keys,
60d8ccae 6779 struct netdev_flow_key **missed_keys,
f7ce4811 6780 struct packet_batch_per_flow batches[], size_t *n_batches,
9b4f08cd
VDA
6781 struct dp_packet_flow_map *flow_map,
6782 size_t *n_flows, uint8_t *index_map,
a90ed026 6783 bool md_is_valid, odp_port_t port_no)
72865317 6784{
b89c678b 6785 struct netdev_flow_key *key = &keys[0];
60d8ccae
YW
6786 size_t n_missed = 0, n_emc_hit = 0;
6787 struct dfc_cache *cache = &pmd->flow_cache;
72c84bc2 6788 struct dp_packet *packet;
45df9fef 6789 const size_t cnt = dp_packet_batch_size(packets_);
2fbadeb6 6790 uint32_t cur_min = pmd->ctx.emc_insert_min;
72c84bc2 6791 int i;
aab96ec4 6792 uint16_t tcp_flags;
60d8ccae 6793 bool smc_enable_db;
9b4f08cd
VDA
6794 size_t map_cnt = 0;
6795 bool batch_enable = true;
8cbf4f47 6796
60d8ccae 6797 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
82a48ead
JS
6798 pmd_perf_update_counter(&pmd->perf_stats,
6799 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6800 cnt);
f79b1ddb 6801
45df9fef 6802 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
9bbf1c3d 6803 struct dp_netdev_flow *flow;
aab96ec4 6804 uint32_t mark;
9bbf1c3d 6805
5a2fed48
AZ
6806 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6807 dp_packet_delete(packet);
a13a0209 6808 COVERAGE_INC(datapath_drop_rx_invalid_packet);
84d6d5eb
EJ
6809 continue;
6810 }
8cbf4f47 6811
45df9fef 6812 if (i != cnt - 1) {
72c84bc2 6813 struct dp_packet **packets = packets_->packets;
a90ed026 6814 /* Prefetch next packet data and metadata. */
72a5e2b8 6815 OVS_PREFETCH(dp_packet_data(packets[i+1]));
a90ed026 6816 pkt_metadata_prefetch_init(&packets[i+1]->md);
72a5e2b8
DDP
6817 }
6818
a90ed026
DDP
6819 if (!md_is_valid) {
6820 pkt_metadata_init(&packet->md, port_no);
6821 }
aab96ec4
YL
6822
6823 if ((*recirc_depth_get() == 0) &&
6824 dp_packet_has_flow_mark(packet, &mark)) {
6825 flow = mark_to_flow_find(pmd, mark);
9b4f08cd 6826 if (OVS_LIKELY(flow)) {
aab96ec4 6827 tcp_flags = parse_tcp_flags(packet);
9b4f08cd
VDA
6828 if (OVS_LIKELY(batch_enable)) {
6829 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6830 n_batches);
6831 } else {
6832 /* Flow batching should be performed only after fast-path
6833 * processing is also completed for packets with an EMC miss,
6834 * or else it will result in reordering of packets with the
6835 * same datapath flows. */
6836 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6837 flow_map, map_cnt++);
6838 }
aab96ec4
YL
6839 continue;
6840 }
6841 }
6842
5a2fed48 6843 miniflow_extract(packet, &key->mf);
d262ac2c 6844 key->len = 0; /* Not computed yet. */
b137383e
IM
6845 key->hash =
6846 (md_is_valid == false)
6847 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6848 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6849
6850 /* If EMC is disabled, skip emc_lookup. */
6851 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
8aaa125d 6852 if (OVS_LIKELY(flow)) {
aab96ec4 6853 tcp_flags = miniflow_get_tcp_flags(&key->mf);
60d8ccae 6854 n_emc_hit++;
9b4f08cd
VDA
6855 if (OVS_LIKELY(batch_enable)) {
6856 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6857 n_batches);
6858 } else {
6859 /* Flow batching should be performed only after fast-path
6860 * processing is also completed for packets with an EMC miss,
6861 * or else it will result in reordering of packets with the
6862 * same datapath flows. */
6863 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6864 flow_map, map_cnt++);
6865 }
8aaa125d 6866 } else {
d1aa0b94 6867 /* Exact match cache missed. Group missed packets together at
72c84bc2
AZ
6868 * the beginning of the 'packets' array. */
6869 dp_packet_batch_refill(packets_, packet, i);
9b4f08cd
VDA
6870
6871 /* Preserve the order of packet for flow batching. */
6872 index_map[n_missed] = map_cnt;
6873 flow_map[map_cnt++].flow = NULL;
6874
400486f7 6875 /* 'keys[n_missed]' contains the key of the current packet and it
6876 * will be passed to SMC lookup. The next key should be extracted
6877 * to 'keys[n_missed + 1]'.
6878 * We also maintain a pointer array to keys that missed both SMC and
6879 * EMC, which will be returned to the caller for future processing. */
6880 missed_keys[n_missed] = key;
400486f7 6881 key = &keys[++n_missed];
9b4f08cd
VDA
6882
6883 /* Skip batching for subsequent packets to avoid reordering. */
6884 batch_enable = false;
9bbf1c3d
DDP
6885 }
6886 }
9b4f08cd
VDA
6887 /* Count of packets which are not flow batched. */
6888 *n_flows = map_cnt;
9bbf1c3d 6889
60d8ccae
YW
6890 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6891
6892 if (!smc_enable_db) {
6893 return dp_packet_batch_size(packets_);
6894 }
6895
6896 /* Packets that miss the EMC do a batch lookup in the SMC, if enabled. */
9b4f08cd
VDA
6897 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6898 n_missed, flow_map, index_map);
4f150744 6899
72c84bc2 6900 return dp_packet_batch_size(packets_);
9bbf1c3d
DDP
6901}
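
Taken together, dfc_processing() and fast_path_processing() form a tiered lookup: exact-match cache first, then the signature-match cache, then the megaflow classifier with an upcall as the last resort, promoting results back into the faster tiers. The toy below (invented names, trivially small tables, no eviction or probabilistic insertion) only demonstrates that promotion pattern, not the real data structures.

#include <stdio.h>

/* Illustrative only: ever-cheaper lookup tiers in front of a slow path.
 * The real tiers are emc_lookup(), smc_lookup_batch() and dpcls_lookup(). */
static int toy_emc[256];      /* Tier 1: exact-match cache (hash -> flow id). */
static int toy_smc[1024];     /* Tier 2: signature-match cache.               */

static int
toy_classify(unsigned hash)
{
    int flow;

    if ((flow = toy_emc[hash & 255]) != 0) {
        return flow;                      /* EMC hit.                       */
    }
    if ((flow = toy_smc[hash & 1023]) != 0) {
        toy_emc[hash & 255] = flow;       /* Promote into the EMC.          */
        return flow;                      /* SMC hit.                       */
    }
    flow = (int) (hash | 1);              /* Stand-in for dpcls/upcall.     */
    toy_smc[hash & 1023] = flow;          /* Populate both caches.          */
    toy_emc[hash & 255] = flow;
    return flow;
}

int
main(void)
{
    printf("first lookup:  flow %d\n", toy_classify(0x00c0ffee));
    printf("second lookup: flow %d\n", toy_classify(0x00c0ffee));
    return 0;
}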
6902
82a48ead 6903static inline int
47a45d86
KT
6904handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6905 struct dp_packet *packet,
a260d966 6906 const struct netdev_flow_key *key,
82a48ead 6907 struct ofpbuf *actions, struct ofpbuf *put_actions)
a260d966
PS
6908{
6909 struct ofpbuf *add_actions;
6910 struct dp_packet_batch b;
6911 struct match match;
6912 ovs_u128 ufid;
6913 int error;
79f36875 6914 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
a260d966
PS
6915
6916 match.tun_md.valid = false;
6917 miniflow_expand(&key->mf, &match.flow);
c98eedf9 6918 memset(&match.wc, 0, sizeof match.wc);
a260d966
PS
6919
6920 ofpbuf_clear(actions);
6921 ofpbuf_clear(put_actions);
6922
7a5e0ee7 6923 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
a260d966
PS
6924 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6925 &ufid, DPIF_UC_MISS, NULL, actions,
6926 put_actions);
6927 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6928 dp_packet_delete(packet);
a13a0209 6929 COVERAGE_INC(datapath_drop_upcall_error);
82a48ead 6930 return error;
a260d966
PS
6931 }
6932
6933 /* The Netlink encoding of datapath flow keys cannot express
6934 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6935 * tag is interpreted as exact match on the fact that there is no
6936 * VLAN. Unless we refactor a lot of code that translates between
6937 * Netlink and struct flow representations, we have to do the same
35fe9efb 6938 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
f0fb825a
EG
6939 if (!match.wc.masks.vlans[0].tci) {
6940 match.wc.masks.vlans[0].tci = htons(0xffff);
a260d966
PS
6941 }
6942
6943 /* We can't allow the packet batching in the next loop to execute
6944 * the actions. Otherwise, if there are any slow path actions,
6945 * we'll send the packet up twice. */
72c84bc2 6946 dp_packet_batch_init_packet(&b, packet);
66e4ad8a 6947 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
b010be17 6948 actions->data, actions->size);
a260d966
PS
6949
6950 add_actions = put_actions->size ? put_actions : actions;
6951 if (OVS_LIKELY(error != ENOSPC)) {
6952 struct dp_netdev_flow *netdev_flow;
6953
6954 /* XXX: There's a race window where a flow covering this packet
6955 * could have already been installed since we last did the flow
6956 * lookup before upcall. This could be solved by moving the
6957 * mutex lock outside the loop, but that's an awful long time
af741ca3 6958 * to be locking revalidators out of making flow modifications. */
a260d966 6959 ovs_mutex_lock(&pmd->flow_mutex);
3453b4d6 6960 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
a260d966
PS
6961 if (OVS_LIKELY(!netdev_flow)) {
6962 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6963 add_actions->data,
6964 add_actions->size);
6965 }
6966 ovs_mutex_unlock(&pmd->flow_mutex);
60d8ccae
YW
6967 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6968 smc_insert(pmd, key, hash);
4c30b246 6969 emc_probabilistic_insert(pmd, key, netdev_flow);
a260d966 6970 }
79f36875
JS
6971 if (pmd_perf_metrics_enabled(pmd)) {
6972 /* Update upcall stats. */
6973 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6974 struct pmd_perf_stats *s = &pmd->perf_stats;
6975 s->current.upcalls++;
6976 s->current.upcall_cycles += cycles;
6977 histogram_add_sample(&s->cycles_per_upcall, cycles);
6978 }
82a48ead 6979 return error;
a260d966
PS
6980}
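
One detail worth calling out in handle_packet_upcall(): the flow lookup that preceded the upcall ran without pmd->flow_mutex held, so another thread (or an earlier packet in the same batch) may already have installed the flow; hence the second dp_netdev_pmd_lookup_flow() under the mutex before dp_netdev_flow_add(). A generic, self-contained sketch of that check-then-recheck pattern follows (toy table with invented names, not the OVS flow table).

#include <pthread.h>
#include <string.h>

/* Toy flow table: an earlier lookup may have run lock-free, so re-check
 * under the mutex before inserting to avoid installing a duplicate entry. */
struct toy_table {
    pthread_mutex_t mutex;
    const char *keys[64];
    size_t n;
};

static const char *
toy_lookup(const struct toy_table *t, const char *key)
{
    for (size_t i = 0; i < t->n; i++) {
        if (!strcmp(t->keys[i], key)) {
            return t->keys[i];
        }
    }
    return NULL;
}

static void
toy_install_if_missing(struct toy_table *t, const char *key)
{
    pthread_mutex_lock(&t->mutex);
    if (!toy_lookup(t, key)) {        /* Re-check while holding the lock. */
        t->keys[t->n++] = key;
    }
    pthread_mutex_unlock(&t->mutex);
}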
6981
9bbf1c3d 6982static inline void
65f13b50 6983fast_path_processing(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6984 struct dp_packet_batch *packets_,
60d8ccae 6985 struct netdev_flow_key **keys,
9b4f08cd
VDA
6986 struct dp_packet_flow_map *flow_map,
6987 uint8_t *index_map,
b010be17 6988 odp_port_t in_port)
9bbf1c3d 6989{
31c82130 6990 const size_t cnt = dp_packet_batch_size(packets_);
1a0d5831 6991#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
6992 const size_t PKT_ARRAY_SIZE = cnt;
6993#else
1a0d5831 6994 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 6995 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 6996#endif
31c82130 6997 struct dp_packet *packet;
3453b4d6 6998 struct dpcls *cls;
0de8783a 6999 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50 7000 struct dp_netdev *dp = pmd->dp;
82a48ead 7001 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
3453b4d6 7002 int lookup_cnt = 0, add_lookup_cnt;
9bbf1c3d
DDP
7003 bool any_miss;
7004
e883448e 7005 for (size_t i = 0; i < cnt; i++) {
0de8783a 7006 /* Key length is needed in all the cases, hash computed on demand. */
60d8ccae 7007 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
9bbf1c3d 7008 }
3453b4d6
JS
7009 /* Get the classifier for the in_port */
7010 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7011 if (OVS_LIKELY(cls)) {
60d8ccae
YW
7012 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7013 rules, cnt, &lookup_cnt);
3453b4d6
JS
7014 } else {
7015 any_miss = true;
7016 memset(rules, 0, sizeof(rules));
7017 }
623540e4
EJ
7018 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7019 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7020 struct ofpbuf actions, put_actions;
623540e4
EJ
7021
7022 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7023 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7024
e883448e 7025 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
0de8783a 7026 struct dp_netdev_flow *netdev_flow;
623540e4 7027
0de8783a 7028 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
7029 continue;
7030 }
7031
7032 /* It's possible that an earlier slow path execution installed
0de8783a 7033 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 7034 * to catch it here than execute a miss. */
60d8ccae 7035 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
3453b4d6 7036 &add_lookup_cnt);
623540e4 7037 if (netdev_flow) {
3453b4d6 7038 lookup_cnt += add_lookup_cnt;
0de8783a 7039 rules[i] = &netdev_flow->cr;
623540e4
EJ
7040 continue;
7041 }
7042
60d8ccae 7043 int error = handle_packet_upcall(pmd, packet, keys[i],
82a48ead
JS
7044 &actions, &put_actions);
7045
7046 if (OVS_UNLIKELY(error)) {
7047 upcall_fail_cnt++;
7048 } else {
7049 upcall_ok_cnt++;
7050 }
623540e4
EJ
7051 }
7052
7053 ofpbuf_uninit(&actions);
7054 ofpbuf_uninit(&put_actions);
7055 fat_rwlock_unlock(&dp->upcall_rwlock);
ac8c2081 7056 } else if (OVS_UNLIKELY(any_miss)) {
e883448e 7057 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
0de8783a 7058 if (OVS_UNLIKELY(!rules[i])) {
31c82130 7059 dp_packet_delete(packet);
a13a0209 7060 COVERAGE_INC(datapath_drop_lock_error);
82a48ead 7061 upcall_fail_cnt++;
ac8c2081
DDP
7062 }
7063 }
623540e4 7064 }
84d6d5eb 7065
e883448e 7066 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
84d6d5eb 7067 struct dp_netdev_flow *flow;
9b4f08cd
VDA
7068 /* Get the original order of this packet in received batch. */
7069 int recv_idx = index_map[i];
7070 uint16_t tcp_flags;
8cbf4f47 7071
0de8783a 7072 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
7073 continue;
7074 }
7075
84d6d5eb 7076 flow = dp_netdev_flow_cast(rules[i]);
60d8ccae
YW
7077 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
7078 smc_insert(pmd, keys[i], hash);
0de8783a 7079
60d8ccae 7080 emc_probabilistic_insert(pmd, keys[i], flow);
9b4f08cd
VDA
7081 /* Add these packets into the flow map in the same order
7082 * as received.
7083 */
7084 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7085 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7086 flow_map, recv_idx);
8cbf4f47
DDP
7087 }
7088
82a48ead
JS
7089 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7090 cnt - upcall_ok_cnt - upcall_fail_cnt);
7091 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7092 lookup_cnt);
7093 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7094 upcall_ok_cnt);
7095 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7096 upcall_fail_cnt);
72865317
BP
7097}
7098
a90ed026
DDP
7099/* Packets enter the datapath from a port (or from recirculation) here.
7100 *
02305520
FA
7101 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
7102 * When false the metadata in 'packets' need to be initialized. */
adcf00ba 7103static void
a90ed026 7104dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
1895cc8d 7105 struct dp_packet_batch *packets,
a90ed026 7106 bool md_is_valid, odp_port_t port_no)
9bbf1c3d 7107{
1a0d5831 7108#if !defined(__CHECKER__) && !defined(_WIN32)
37eabc70 7109 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
9bbf1c3d 7110#else
1a0d5831 7111 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 7112 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 7113#endif
47a45d86
KT
7114 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7115 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
60d8ccae 7116 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
f7ce4811 7117 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
72c84bc2 7118 size_t n_batches;
9b4f08cd
VDA
7119 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7120 uint8_t index_map[PKT_ARRAY_SIZE];
7121 size_t n_flows, i;
7122
3453b4d6 7123 odp_port_t in_port;
9bbf1c3d 7124
8aaa125d 7125 n_batches = 0;
60d8ccae 7126 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
9b4f08cd
VDA
7127 flow_map, &n_flows, index_map, md_is_valid, port_no);
7128
72c84bc2 7129 if (!dp_packet_batch_is_empty(packets)) {
3453b4d6
JS
7130 /* Get ingress port from first packet's metadata. */
7131 in_port = packets->packets[0]->md.in_port.odp_port;
60d8ccae 7132 fast_path_processing(pmd, packets, missed_keys,
9b4f08cd 7133 flow_map, index_map, in_port);
8aaa125d
DDP
7134 }
7135
9b4f08cd
VDA
7136 /* Batch rest of packets which are in flow map. */
7137 for (i = 0; i < n_flows; i++) {
7138 struct dp_packet_flow_map *map = &flow_map[i];
7139
7140 if (OVS_UNLIKELY(!map->flow)) {
7141 continue;
7142 }
7143 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7144 batches, &n_batches);
7145 }
7146
ad9f0581
BB
7147 /* All the flow batches need to be reset before any call to
7148 * packet_batch_per_flow_execute() as it could potentially trigger
7149 * recirculation. When a packet matching flow 'j' happens to be
7150 * recirculated, the nested call to dp_netdev_input__() could potentially
7151 * classify the packet as matching another flow - say 'k'. It could happen
7152 * that in the previous call to dp_netdev_input__() that same flow 'k'
7153 * already had its own batches[k] still waiting to be served. So if its
7154 * 'batch' member is not reset, the recirculated packet would be wrongly
7155 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
603f2ce0
EJ
7156 for (i = 0; i < n_batches; i++) {
7157 batches[i].flow->batch = NULL;
7158 }
7159
8aaa125d 7160 for (i = 0; i < n_batches; i++) {
b010be17 7161 packet_batch_per_flow_execute(&batches[i], pmd);
9bbf1c3d
DDP
7162 }
7163}
7164
a90ed026
DDP
7165static void
7166dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
1895cc8d 7167 struct dp_packet_batch *packets,
a90ed026
DDP
7168 odp_port_t port_no)
7169{
3453b4d6 7170 dp_netdev_input__(pmd, packets, false, port_no);
a90ed026
DDP
7171}
7172
7173static void
7174dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
1895cc8d 7175 struct dp_packet_batch *packets)
a90ed026 7176{
3453b4d6 7177 dp_netdev_input__(pmd, packets, true, 0);
a90ed026
DDP
7178}
7179
9080a111 7180struct dp_netdev_execute_aux {
65f13b50 7181 struct dp_netdev_pmd_thread *pmd;
66e4ad8a 7182 const struct flow *flow;
9080a111
JR
7183};
7184
e4e74c3a
AW
7185static void
7186dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7187 void *aux)
7188{
7189 struct dp_netdev *dp = get_dp_netdev(dpif);
7190 dp->dp_purge_aux = aux;
7191 dp->dp_purge_cb = cb;
7192}
7193
6b31e073 7194static void
623540e4
EJ
7195dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7196 void *aux)
6b31e073
RW
7197{
7198 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 7199 dp->upcall_aux = aux;
6b31e073
RW
7200 dp->upcall_cb = cb;
7201}
7202
324c8374
IM
7203static void
7204dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
b010be17 7205 bool purge)
324c8374
IM
7206{
7207 struct tx_port *tx;
7208 struct dp_netdev_port *port;
7209 long long interval;
7210
57eebbb4 7211 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
9f7a3035 7212 if (!tx->port->dynamic_txqs) {
324c8374
IM
7213 continue;
7214 }
b010be17 7215 interval = pmd->ctx.now - tx->last_used;
05f9e707 7216 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
324c8374
IM
7217 port = tx->port;
7218 ovs_mutex_lock(&port->txq_used_mutex);
7219 port->txq_used[tx->qid]--;
7220 ovs_mutex_unlock(&port->txq_used_mutex);
7221 tx->qid = -1;
7222 }
7223 }
7224}
7225
7226static int
7227dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
b010be17 7228 struct tx_port *tx)
324c8374
IM
7229{
7230 struct dp_netdev_port *port;
7231 long long interval;
7232 int i, min_cnt, min_qid;
7233
b010be17
IM
7234 interval = pmd->ctx.now - tx->last_used;
7235 tx->last_used = pmd->ctx.now;
324c8374 7236
05f9e707 7237 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
324c8374
IM
7238 return tx->qid;
7239 }
7240
7241 port = tx->port;
7242
7243 ovs_mutex_lock(&port->txq_used_mutex);
7244 if (tx->qid >= 0) {
7245 port->txq_used[tx->qid]--;
7246 tx->qid = -1;
7247 }
7248
7249 min_cnt = -1;
7250 min_qid = 0;
7251 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7252 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7253 min_cnt = port->txq_used[i];
7254 min_qid = i;
7255 }
7256 }
7257
7258 port->txq_used[min_qid]++;
7259 tx->qid = min_qid;
7260
7261 ovs_mutex_unlock(&port->txq_used_mutex);
7262
b010be17 7263 dpif_netdev_xps_revalidate_pmd(pmd, false);
324c8374
IM
7264
7265 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7266 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7267 return min_qid;
7268}
7269
d0cca6c3 7270static struct tx_port *
57eebbb4
DDP
7271pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7272 odp_port_t port_no)
7273{
7274 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7275}
7276
7277static struct tx_port *
7278pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7279 odp_port_t port_no)
d0cca6c3 7280{
57eebbb4 7281 return tx_port_lookup(&pmd->send_port_cache, port_no);
d0cca6c3
DDP
7282}
7283
a36de779 7284static int
d0cca6c3 7285push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
1895cc8d
PS
7286 const struct nlattr *attr,
7287 struct dp_packet_batch *batch)
a36de779 7288{
d0cca6c3 7289 struct tx_port *tun_port;
a36de779 7290 const struct ovs_action_push_tnl *data;
4c742796 7291 int err;
a36de779
PS
7292
7293 data = nl_attr_get(attr);
7294
81765c00 7295 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
a36de779 7296 if (!tun_port) {
4c742796
PS
7297 err = -EINVAL;
7298 goto error;
a36de779 7299 }
324c8374 7300 err = netdev_push_header(tun_port->port->netdev, batch, data);
4c742796
PS
7301 if (!err) {
7302 return 0;
7303 }
7304error:
7305 dp_packet_delete_batch(batch, true);
7306 return err;
a36de779
PS
7307}
7308
66525ef3
PS
7309static void
7310dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7d7ded7a 7311 struct dp_packet *packet, bool should_steal,
66525ef3
PS
7312 struct flow *flow, ovs_u128 *ufid,
7313 struct ofpbuf *actions,
b010be17 7314 const struct nlattr *userdata)
66525ef3
PS
7315{
7316 struct dp_packet_batch b;
7317 int error;
7318
7319 ofpbuf_clear(actions);
7320
7321 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7322 DPIF_UC_ACTION, userdata, actions,
7323 NULL);
7324 if (!error || error == ENOSPC) {
72c84bc2 7325 dp_packet_batch_init_packet(&b, packet);
7d7ded7a 7326 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
b010be17 7327 actions->data, actions->size);
7d7ded7a 7328 } else if (should_steal) {
66525ef3 7329 dp_packet_delete(packet);
a13a0209 7330 COVERAGE_INC(datapath_drop_userspace_action_error);
66525ef3
PS
7331 }
7332}
7333
9df65060
VDA
7334static bool
7335dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7336 struct dp_packet_batch *packets_,
7337 bool should_steal, odp_port_t port_no)
7338{
7339 struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7340 struct dp_packet_batch out;
7341
7342 if (!OVS_LIKELY(p)) {
7343 COVERAGE_ADD(datapath_drop_invalid_port,
7344 dp_packet_batch_size(packets_));
7345 dp_packet_delete_batch(packets_, should_steal);
7346 return false;
7347 }
7348 if (!should_steal) {
7349 dp_packet_batch_clone(&out, packets_);
7350 dp_packet_batch_reset_cutlen(packets_);
7351 packets_ = &out;
7352 }
7353 dp_packet_batch_apply_cutlen(packets_);
7354#ifdef DPDK_NETDEV
7355 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7356 && packets_->packets[0]->source
7357 != p->output_pkts.packets[0]->source)) {
7358 /* XXX: netdev-dpdk assumes that all packets in a single
7359 * output batch have the same source. Flush here to
7360 * avoid memory access issues. */
7361 dp_netdev_pmd_flush_output_on_port(pmd, p);
7362 }
7363#endif
7364 if (dp_packet_batch_size(&p->output_pkts)
7365 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7366 /* Flush here to avoid overflow. */
7367 dp_netdev_pmd_flush_output_on_port(pmd, p);
7368 }
7369 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7370 pmd->n_output_batches++;
7371 }
7372
7373 struct dp_packet *packet;
7374 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7375 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7376 pmd->ctx.last_rxq;
7377 dp_packet_batch_add(&p->output_pkts, packet);
7378 }
7379 return true;
7380}
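
For clarity, the accumulate-and-flush behaviour above (queue packets per output port, flush before the pending batch would exceed NETDEV_MAX_BURST) boils down to the standalone toy below; names are invented, and the real flushing is done by dp_netdev_pmd_flush_output_on_port().

#include <stddef.h>
#include <stdio.h>

#define TOY_MAX_BURST 32

struct toy_port {
    int pending[TOY_MAX_BURST];
    size_t n_pending;
};

static void
toy_flush(struct toy_port *p)
{
    if (p->n_pending) {
        printf("tx burst of %zu packets\n", p->n_pending);
        p->n_pending = 0;
    }
}

/* Queue 'cnt' packets for output, flushing first if they would not fit.
 * Assumes cnt <= TOY_MAX_BURST, as a single rx burst is in the datapath. */
static void
toy_output(struct toy_port *p, const int *pkts, size_t cnt)
{
    if (p->n_pending + cnt > TOY_MAX_BURST) {
        toy_flush(p);
    }
    for (size_t i = 0; i < cnt; i++) {
        p->pending[p->n_pending++] = pkts[i];
    }
}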
7381
7382static void
7383dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7384 struct dp_packet_batch *packets_,
7385 bool should_steal, uint32_t bond)
7386{
7387 struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7388 struct dp_packet_batch out;
7389 struct dp_packet *packet;
7390
7391 if (!p_bond) {
7392 COVERAGE_ADD(datapath_drop_invalid_bond,
7393 dp_packet_batch_size(packets_));
7394 dp_packet_delete_batch(packets_, should_steal);
7395 return;
7396 }
7397 if (!should_steal) {
7398 dp_packet_batch_clone(&out, packets_);
7399 dp_packet_batch_reset_cutlen(packets_);
7400 packets_ = &out;
7401 }
7402 dp_packet_batch_apply_cutlen(packets_);
7403
7404 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7405 /*
7406 * Look up the bond hash table using the packet hash to get the slave.
7407 */
7408 uint32_t hash = dp_packet_get_rss_hash(packet);
7409 struct slave_entry *s_entry = &p_bond->slave_buckets[hash & BOND_MASK];
7410 odp_port_t bond_member = s_entry->slave_id;
7411 uint32_t size = dp_packet_size(packet);
7412 struct dp_packet_batch output_pkt;
7413
7414 dp_packet_batch_init_packet(&output_pkt, packet);
7415 if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
7416 bond_member))) {
7417 /* Update slave stats. */
7418 non_atomic_ullong_add(&s_entry->n_packets, 1);
7419 non_atomic_ullong_add(&s_entry->n_bytes, size);
7420 }
7421 }
7422}
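
The member selection above is simply "RSS hash -> bucket -> pre-assigned member port": hash & BOND_MASK picks a bucket whose slave_id names the bond member to transmit on. A minimal hedged sketch of that mapping (invented names, fixed bucket count) is shown here.

#include <stdint.h>

#define TOY_BOND_BUCKETS 256
#define TOY_BOND_MASK    (TOY_BOND_BUCKETS - 1)

struct toy_bond {
    uint32_t member_port[TOY_BOND_BUCKETS];  /* Filled in by the bond layer. */
};

/* Map a packet's RSS hash to the bond member that should transmit it. */
static uint32_t
toy_bond_member_for_hash(const struct toy_bond *bond, uint32_t rss_hash)
{
    return bond->member_port[rss_hash & TOY_BOND_MASK];
}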
7423
a36de779 7424static void
1895cc8d 7425dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7d7ded7a 7426 const struct nlattr *a, bool should_steal)
4b27db64 7427 OVS_NO_THREAD_SAFETY_ANALYSIS
9080a111
JR
7428{
7429 struct dp_netdev_execute_aux *aux = aux_;
623540e4 7430 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
7431 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7432 struct dp_netdev *dp = pmd->dp;
09f9da0b 7433 int type = nl_attr_type(a);
d0cca6c3 7434 struct tx_port *p;
a13a0209 7435 uint32_t packet_count, packets_dropped;
9080a111 7436
09f9da0b
JR
7437 switch ((enum ovs_action_attr)type) {
7438 case OVS_ACTION_ATTR_OUTPUT:
9df65060
VDA
7439 dp_execute_output_action(pmd, packets_, should_steal,
7440 nl_attr_get_odp_port(a));
7441 return;
c71ea3c4 7442
9df65060
VDA
7443 case OVS_ACTION_ATTR_LB_OUTPUT:
7444 dp_execute_lb_output_action(pmd, packets_, should_steal,
7445 nl_attr_get_u32(a));
7446 return;
09f9da0b 7447
a36de779 7448 case OVS_ACTION_ATTR_TUNNEL_PUSH:
47e1b3b6
IM
7449 if (should_steal) {
7450 /* We're requested to push the tunnel header, but we also need to
7451 * take ownership of these packets. Thus, we can avoid performing
7452 * the action, because the caller will not use the result anyway.
7453 * Just break to free the batch. */
7454 break;
a36de779 7455 }
47e1b3b6 7456 dp_packet_batch_apply_cutlen(packets_);
a13a0209
AT
7457 packet_count = dp_packet_batch_size(packets_);
7458 if (push_tnl_action(pmd, a, packets_)) {
7459 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7460 packet_count);
7461 }
47e1b3b6 7462 return;
a36de779
PS
7463
7464 case OVS_ACTION_ATTR_TUNNEL_POP:
7465 if (*depth < MAX_RECIRC_DEPTH) {
aaca4fe0 7466 struct dp_packet_batch *orig_packets_ = packets_;
8611f9a4 7467 odp_port_t portno = nl_attr_get_odp_port(a);
a36de779 7468
57eebbb4 7469 p = pmd_tnl_port_cache_lookup(pmd, portno);
a36de779 7470 if (p) {
1895cc8d 7471 struct dp_packet_batch tnl_pkt;
a36de779 7472
7d7ded7a 7473 if (!should_steal) {
aaca4fe0
WT
7474 dp_packet_batch_clone(&tnl_pkt, packets_);
7475 packets_ = &tnl_pkt;
7476 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
7477 }
7478
aaca4fe0
WT
7479 dp_packet_batch_apply_cutlen(packets_);
7480
a13a0209 7481 packet_count = dp_packet_batch_size(packets_);
324c8374 7482 netdev_pop_header(p->port->netdev, packets_);
a13a0209
AT
7483 packets_dropped =
7484 packet_count - dp_packet_batch_size(packets_);
7485 if (packets_dropped) {
7486 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7487 packets_dropped);
7488 }
72c84bc2 7489 if (dp_packet_batch_is_empty(packets_)) {
1c8f98d9
PS
7490 return;
7491 }
9235b479 7492
72c84bc2 7493 struct dp_packet *packet;
e883448e 7494 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7495 packet->md.in_port.odp_port = portno;
a36de779 7496 }
9235b479
PS
7497
7498 (*depth)++;
7499 dp_netdev_recirculate(pmd, packets_);
7500 (*depth)--;
a36de779
PS
7501 return;
7502 }
a13a0209
AT
7503 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7504 dp_packet_batch_size(packets_));
7505 } else {
7506 COVERAGE_ADD(datapath_drop_recirc_error,
7507 dp_packet_batch_size(packets_));
a36de779
PS
7508 }
7509 break;
7510
623540e4
EJ
7511 case OVS_ACTION_ATTR_USERSPACE:
7512 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
aaca4fe0 7513 struct dp_packet_batch *orig_packets_ = packets_;
623540e4 7514 const struct nlattr *userdata;
aaca4fe0 7515 struct dp_packet_batch usr_pkt;
623540e4
EJ
7516 struct ofpbuf actions;
7517 struct flow flow;
7af12bd7 7518 ovs_u128 ufid;
aaca4fe0 7519 bool clone = false;
4fc65926 7520
623540e4
EJ
7521 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7522 ofpbuf_init(&actions, 0);
8cbf4f47 7523
aaca4fe0 7524 if (packets_->trunc) {
7d7ded7a 7525 if (!should_steal) {
aaca4fe0
WT
7526 dp_packet_batch_clone(&usr_pkt, packets_);
7527 packets_ = &usr_pkt;
aaca4fe0
WT
7528 clone = true;
7529 dp_packet_batch_reset_cutlen(orig_packets_);
7530 }
7531
7532 dp_packet_batch_apply_cutlen(packets_);
7533 }
7534
72c84bc2 7535 struct dp_packet *packet;
e883448e 7536 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7537 flow_extract(packet, &flow);
7a5e0ee7 7538 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7d7ded7a 7539 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
b010be17 7540 &ufid, &actions, userdata);
db73f716 7541 }
aaca4fe0
WT
7542
7543 if (clone) {
7544 dp_packet_delete_batch(packets_, true);
7545 }
7546
623540e4
EJ
7547 ofpbuf_uninit(&actions);
7548 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 7549
ac8c2081
DDP
7550 return;
7551 }
a13a0209
AT
7552 COVERAGE_ADD(datapath_drop_lock_error,
7553 dp_packet_batch_size(packets_));
09f9da0b 7554 break;
572f732a 7555
adcf00ba
AZ
7556 case OVS_ACTION_ATTR_RECIRC:
7557 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 7558 struct dp_packet_batch recirc_pkts;
572f732a 7559
7d7ded7a 7560 if (!should_steal) {
1895cc8d
PS
7561 dp_packet_batch_clone(&recirc_pkts, packets_);
7562 packets_ = &recirc_pkts;
28e2fa02 7563 }
8cbf4f47 7564
72c84bc2 7565 struct dp_packet *packet;
e883448e 7566 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7567 packet->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 7568 }
28e2fa02
DDP
7569
7570 (*depth)++;
1895cc8d 7571 dp_netdev_recirculate(pmd, packets_);
adcf00ba
AZ
7572 (*depth)--;
7573
ac8c2081 7574 return;
adcf00ba 7575 }
ac8c2081 7576
a13a0209
AT
7577 COVERAGE_ADD(datapath_drop_recirc_error,
7578 dp_packet_batch_size(packets_));
ac8c2081 7579 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 7580 break;
572f732a 7581
5cf3edb3
DDP
7582 case OVS_ACTION_ATTR_CT: {
7583 const struct nlattr *b;
a76a37ef 7584 bool force = false;
5cf3edb3
DDP
7585 bool commit = false;
7586 unsigned int left;
7587 uint16_t zone = 0;
2078901a 7588 uint32_t tp_id = 0;
5cf3edb3
DDP
7589 const char *helper = NULL;
7590 const uint32_t *setmark = NULL;
7591 const struct ovs_key_ct_labels *setlabel = NULL;
4cddb1f0
DB
7592 struct nat_action_info_t nat_action_info;
7593 struct nat_action_info_t *nat_action_info_ref = NULL;
7594 bool nat_config = false;
5cf3edb3
DDP
7595
7596 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7597 nl_attr_get_size(a)) {
7598 enum ovs_ct_attr sub_type = nl_attr_type(b);
7599
7600 switch(sub_type) {
b80e259f 7601 case OVS_CT_ATTR_FORCE_COMMIT:
a76a37ef
JR
7602 force = true;
7603 /* fall through. */
5cf3edb3
DDP
7604 case OVS_CT_ATTR_COMMIT:
7605 commit = true;
7606 break;
7607 case OVS_CT_ATTR_ZONE:
7608 zone = nl_attr_get_u16(b);
7609 break;
7610 case OVS_CT_ATTR_HELPER:
7611 helper = nl_attr_get_string(b);
7612 break;
7613 case OVS_CT_ATTR_MARK:
7614 setmark = nl_attr_get(b);
7615 break;
7616 case OVS_CT_ATTR_LABELS:
7617 setlabel = nl_attr_get(b);
7618 break;
8e83854c
JR
7619 case OVS_CT_ATTR_EVENTMASK:
7620 /* Silently ignored, as userspace datapath does not generate
7621 * netlink events. */
7622 break;
ebe62ec1 7623 case OVS_CT_ATTR_TIMEOUT:
2078901a
WT
7624 if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
7625 VLOG_WARN("Invalid Timeout Policy ID: %s.",
7626 nl_attr_get_string(b));
7627 tp_id = DEFAULT_TP_ID;
7628 }
ebe62ec1 7629 break;
4cddb1f0
DB
7630 case OVS_CT_ATTR_NAT: {
7631 const struct nlattr *b_nest;
7632 unsigned int left_nest;
7633 bool ip_min_specified = false;
7634 bool proto_num_min_specified = false;
7635 bool ip_max_specified = false;
7636 bool proto_num_max_specified = false;
7637 memset(&nat_action_info, 0, sizeof nat_action_info);
7638 nat_action_info_ref = &nat_action_info;
7639
7640 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7641 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7642
7643 switch (sub_type_nest) {
7644 case OVS_NAT_ATTR_SRC:
7645 case OVS_NAT_ATTR_DST:
7646 nat_config = true;
7647 nat_action_info.nat_action |=
7648 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7649 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7650 break;
7651 case OVS_NAT_ATTR_IP_MIN:
7652 memcpy(&nat_action_info.min_addr,
7653 nl_attr_get(b_nest),
7654 nl_attr_get_size(b_nest));
7655 ip_min_specified = true;
7656 break;
7657 case OVS_NAT_ATTR_IP_MAX:
7658 memcpy(&nat_action_info.max_addr,
7659 nl_attr_get(b_nest),
7660 nl_attr_get_size(b_nest));
7661 ip_max_specified = true;
7662 break;
7663 case OVS_NAT_ATTR_PROTO_MIN:
7664 nat_action_info.min_port =
7665 nl_attr_get_u16(b_nest);
7666 proto_num_min_specified = true;
7667 break;
7668 case OVS_NAT_ATTR_PROTO_MAX:
7669 nat_action_info.max_port =
7670 nl_attr_get_u16(b_nest);
7671 proto_num_max_specified = true;
7672 break;
7673 case OVS_NAT_ATTR_PERSISTENT:
7674 case OVS_NAT_ATTR_PROTO_HASH:
7675 case OVS_NAT_ATTR_PROTO_RANDOM:
7676 break;
7677 case OVS_NAT_ATTR_UNSPEC:
7678 case __OVS_NAT_ATTR_MAX:
7679 OVS_NOT_REACHED();
7680 }
7681 }
7682
7683 if (ip_min_specified && !ip_max_specified) {
7684 nat_action_info.max_addr = nat_action_info.min_addr;
7685 }
7686 if (proto_num_min_specified && !proto_num_max_specified) {
7687 nat_action_info.max_port = nat_action_info.min_port;
7688 }
7689 if (proto_num_min_specified || proto_num_max_specified) {
7690 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7691 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7692 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7693 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7694 }
7695 }
7696 break;
7697 }
5cf3edb3
DDP
7698 case OVS_CT_ATTR_UNSPEC:
7699 case __OVS_CT_ATTR_MAX:
7700 OVS_NOT_REACHED();
7701 }
7702 }
7703
4cddb1f0
DB
7704 /* We won't be able to function properly in this case, hence
7705 * complain loudly. */
7706 if (nat_config && !commit) {
7707 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7708 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7709 }
7710
57593fd2 7711 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
bd7d93f8 7712 commit, zone, setmark, setlabel, aux->flow->tp_src,
b010be17 7713 aux->flow->tp_dst, helper, nat_action_info_ref,
2078901a 7714 pmd->ctx.now / 1000, tp_id);
07659514 7715 break;
5cf3edb3 7716 }
07659514 7717
5dddf960 7718 case OVS_ACTION_ATTR_METER:
4b27db64 7719 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
b010be17 7720 pmd->ctx.now);
4b27db64
JR
7721 break;
7722
09f9da0b
JR
7723 case OVS_ACTION_ATTR_PUSH_VLAN:
7724 case OVS_ACTION_ATTR_POP_VLAN:
7725 case OVS_ACTION_ATTR_PUSH_MPLS:
7726 case OVS_ACTION_ATTR_POP_MPLS:
7727 case OVS_ACTION_ATTR_SET:
6d670e7f 7728 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 7729 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 7730 case OVS_ACTION_ATTR_HASH:
09f9da0b 7731 case OVS_ACTION_ATTR_UNSPEC:
aaca4fe0 7732 case OVS_ACTION_ATTR_TRUNC:
6fcecb85
YY
7733 case OVS_ACTION_ATTR_PUSH_ETH:
7734 case OVS_ACTION_ATTR_POP_ETH:
535e3acf 7735 case OVS_ACTION_ATTR_CLONE:
f59cb331
YY
7736 case OVS_ACTION_ATTR_PUSH_NSH:
7737 case OVS_ACTION_ATTR_POP_NSH:
1fe178d2 7738 case OVS_ACTION_ATTR_CT_CLEAR:
5b34f8fc 7739 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
a13a0209 7740 case OVS_ACTION_ATTR_DROP:
09f9da0b
JR
7741 case __OVS_ACTION_ATTR_MAX:
7742 OVS_NOT_REACHED();
da546e07 7743 }
ac8c2081 7744
7d7ded7a 7745 dp_packet_delete_batch(packets_, should_steal);
98403001
BP
7746}
7747
4edb9ae9 7748static void
65f13b50 7749dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 7750 struct dp_packet_batch *packets,
7d7ded7a 7751 bool should_steal, const struct flow *flow,
b010be17 7752 const struct nlattr *actions, size_t actions_len)
72865317 7753{
b010be17 7754 struct dp_netdev_execute_aux aux = { pmd, flow };
9080a111 7755
7d7ded7a 7756 odp_execute_actions(&aux, packets, should_steal, actions,
8cbf4f47 7757 actions_len, dp_execute_cb);
72865317
BP
7758}
7759
4d4e68ed
DDP
7760struct dp_netdev_ct_dump {
7761 struct ct_dpif_dump_state up;
7762 struct conntrack_dump dump;
7763 struct conntrack *ct;
7764 struct dp_netdev *dp;
7765};
7766
7767static int
7768dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
ded30c74 7769 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
7770{
7771 struct dp_netdev *dp = get_dp_netdev(dpif);
7772 struct dp_netdev_ct_dump *dump;
7773
7774 dump = xzalloc(sizeof *dump);
7775 dump->dp = dp;
57593fd2 7776 dump->ct = dp->conntrack;
4d4e68ed 7777
57593fd2 7778 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
4d4e68ed
DDP
7779
7780 *dump_ = &dump->up;
7781
7782 return 0;
7783}
7784
7785static int
7786dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7787 struct ct_dpif_dump_state *dump_,
7788 struct ct_dpif_entry *entry)
7789{
7790 struct dp_netdev_ct_dump *dump;
7791
7792 INIT_CONTAINER(dump, dump_, up);
7793
7794 return conntrack_dump_next(&dump->dump, entry);
7795}
7796
7797static int
7798dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7799 struct ct_dpif_dump_state *dump_)
7800{
7801 struct dp_netdev_ct_dump *dump;
7802 int err;
7803
7804 INIT_CONTAINER(dump, dump_, up);
7805
7806 err = conntrack_dump_done(&dump->dump);
7807
7808 free(dump);
7809
7810 return err;
7811}
7812
5d9cbb4c 7813static int
817a7657
YHW
7814dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7815 const struct ct_dpif_tuple *tuple)
5d9cbb4c
DDP
7816{
7817 struct dp_netdev *dp = get_dp_netdev(dpif);
7818
817a7657 7819 if (tuple) {
57593fd2 7820 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
817a7657 7821 }
57593fd2 7822 return conntrack_flush(dp->conntrack, zone);
5d9cbb4c
DDP
7823}
7824
c92339ad
DB
7825static int
7826dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7827{
7828 struct dp_netdev *dp = get_dp_netdev(dpif);
7829
57593fd2 7830 return conntrack_set_maxconns(dp->conntrack, maxconns);
c92339ad
DB
7831}
7832
7833static int
7834dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7835{
7836 struct dp_netdev *dp = get_dp_netdev(dpif);
7837
57593fd2 7838 return conntrack_get_maxconns(dp->conntrack, maxconns);
c92339ad
DB
7839}
7840
875075b3
DB
7841static int
7842dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7843{
7844 struct dp_netdev *dp = get_dp_netdev(dpif);
7845
57593fd2 7846 return conntrack_get_nconns(dp->conntrack, nconns);
875075b3
DB
7847}
7848
64207120
DB
7849static int
7850dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
7851{
7852 struct dp_netdev *dp = get_dp_netdev(dpif);
7853
7854 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
7855}
7856
7857static int
7858dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
7859{
7860 struct dp_netdev *dp = get_dp_netdev(dpif);
7861 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
7862 return 0;
7863}
7864
a7f33fdb
DB
7865static int
7866dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
7867 const uint32_t *default_limits,
7868 const struct ovs_list *zone_limits)
7869{
7870 int err = 0;
7871 struct dp_netdev *dp = get_dp_netdev(dpif);
7872 if (default_limits) {
7873 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
7874 if (err != 0) {
7875 return err;
7876 }
7877 }
7878
7879 struct ct_dpif_zone_limit *zone_limit;
7880 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7881 err = zone_limit_update(dp->conntrack, zone_limit->zone,
7882 zone_limit->limit);
7883 if (err != 0) {
7884 break;
7885 }
7886 }
7887 return err;
7888}
7889
7890static int
7891dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
7892 uint32_t *default_limit,
7893 const struct ovs_list *zone_limits_request,
7894 struct ovs_list *zone_limits_reply)
7895{
7896 struct dp_netdev *dp = get_dp_netdev(dpif);
7897 struct conntrack_zone_limit czl;
7898
7899 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
7900 if (czl.zone == DEFAULT_ZONE) {
7901 *default_limit = czl.limit;
7902 } else {
7903 return EINVAL;
7904 }
7905
7906 if (!ovs_list_is_empty(zone_limits_request)) {
7907 struct ct_dpif_zone_limit *zone_limit;
7908 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
7909 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
7910 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
7911 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
7912 czl.limit, czl.count);
7913 } else {
7914 return EINVAL;
7915 }
7916 }
7917 } else {
7918 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
7919 czl = zone_limit_get(dp->conntrack, z);
7920 if (czl.zone == z) {
7921 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
7922 czl.count);
7923 }
7924 }
7925 }
7926
7927 return 0;
7928}
7929
7930static int
7931dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
7932 const struct ovs_list *zone_limits)
7933{
7934 int err = 0;
7935 struct dp_netdev *dp = get_dp_netdev(dpif);
7936 struct ct_dpif_zone_limit *zone_limit;
7937 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7938 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
7939 if (err != 0) {
7940 break;
7941 }
7942 }
7943
7944 return err;
7945}
7946
2078901a
WT
7947static int
7948dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
7949 const struct ct_dpif_timeout_policy *dpif_tp)
7950{
7951 struct timeout_policy tp;
7952 struct dp_netdev *dp;
7953
7954 dp = get_dp_netdev(dpif);
7955 memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
7956 return timeout_policy_update(dp->conntrack, &tp);
7957}
7958
7959static int
7960dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
7961 struct ct_dpif_timeout_policy *dpif_tp)
7962{
7963 struct timeout_policy *tp;
7964 struct dp_netdev *dp;
7965 int err = 0;
7966
7967 dp = get_dp_netdev(dpif);
7968 tp = timeout_policy_get(dp->conntrack, tp_id);
7969 if (!tp) {
7970 return ENOENT;
7971 }
7972 memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
7973 return err;
7974}
7975
7976static int
7977dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
7978 uint32_t tp_id)
7979{
7980 struct dp_netdev *dp;
7981 int err = 0;
7982
7983 dp = get_dp_netdev(dpif);
7984 err = timeout_policy_delete(dp->conntrack, tp_id);
7985 return err;
7986}
7987
7988static int
7989dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
7990 uint32_t tp_id,
7991 uint16_t dl_type OVS_UNUSED,
7992 uint8_t nw_proto OVS_UNUSED,
7993 char **tp_name, bool *is_generic)
7994{
7995 struct ds ds = DS_EMPTY_INITIALIZER;
7996
7997 ds_put_format(&ds, "%"PRIu32, tp_id);
7998 *tp_name = ds_steal_cstr(&ds);
7999 *is_generic = true;
8000 return 0;
8001}
8002
4ea96698
DB
8003static int
8004dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8005{
8006 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 8007 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
4ea96698
DB
8008}
8009
8010static int
8011dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8012{
8013 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 8014 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
4ea96698
DB
8015}
8016
8017static int
8018dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8019{
8020 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 8021 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
4ea96698
DB
8022}
8023
8024/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8025 * diverge. */
8026static int
8027dpif_netdev_ipf_get_status(struct dpif *dpif,
8028 struct dpif_ipf_status *dpif_ipf_status)
8029{
8030 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 8031 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
4ea96698
DB
8032 (struct ipf_status *) dpif_ipf_status);
8033 return 0;
8034}
8035
8036static int
8037dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8038 struct ipf_dump_ctx **ipf_dump_ctx)
8039{
8040 return ipf_dump_start(ipf_dump_ctx);
8041}
8042
8043static int
8044dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8045{
8046 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 8047 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
4ea96698
DB
8048 dump);
8049}
8050
8051static int
8052dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8053{
8054 return ipf_dump_done(ipf_dump_ctx);
8055
8056}
8057
9df65060
VDA
8058static int
8059dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8060 odp_port_t *slave_map)
8061{
8062 struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8063 struct dp_netdev *dp = get_dp_netdev(dpif);
8064 struct dp_netdev_pmd_thread *pmd;
8065
8066 /* Prepare new bond mapping. */
8067 new_tx->bond_id = bond_id;
8068 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8069 new_tx->slave_buckets[bucket].slave_id = slave_map[bucket];
8070 }
8071
8072 ovs_mutex_lock(&dp->bond_mutex);
8073 /* Check if the bond already exists. */
8074 struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8075 if (old_tx) {
8076 cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8077 hash_bond_id(bond_id));
8078 ovsrcu_postpone(free, old_tx);
8079 } else {
8080 cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8081 }
8082 ovs_mutex_unlock(&dp->bond_mutex);
8083
8084 /* Update all PMDs with new bond mapping. */
8085 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8086 dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8087 }
8088 return 0;
8089}
8090
8091static int
8092dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8093{
8094 struct dp_netdev *dp = get_dp_netdev(dpif);
8095 struct dp_netdev_pmd_thread *pmd;
8096 struct tx_bond *tx;
8097
8098 ovs_mutex_lock(&dp->bond_mutex);
8099 /* Check if the bond exists. */
8100 tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8101 if (tx) {
8102 cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8103 ovsrcu_postpone(free, tx);
8104 } else {
8105 /* Bond is not present. */
8106 ovs_mutex_unlock(&dp->bond_mutex);
8107 return ENOENT;
8108 }
8109 ovs_mutex_unlock(&dp->bond_mutex);
8110
8111 /* Remove the bond map in all pmds. */
8112 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8113 dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8114 }
8115 return 0;
8116}
8117
8118static int
8119dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8120 uint64_t *n_bytes)
8121{
8122 struct dp_netdev *dp = get_dp_netdev(dpif);
8123 struct dp_netdev_pmd_thread *pmd;
8124
8125 if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8126 return ENOENT;
8127 }
8128
8129 /* Search the bond in all PMDs. */
8130 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8131 struct tx_bond *pmd_bond_entry
8132 = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8133
8134 if (!pmd_bond_entry) {
8135 continue;
8136 }
8137
8138 /* Read bond stats. */
8139 for (int i = 0; i < BOND_BUCKETS; i++) {
8140 uint64_t pmd_n_bytes;
8141
8142 atomic_read_relaxed(&pmd_bond_entry->slave_buckets[i].n_bytes,
8143 &pmd_n_bytes);
8144 n_bytes[i] += pmd_n_bytes;
8145 }
8146 }
8147 return 0;
8148}
8149
72865317 8150const struct dpif_class dpif_netdev_class = {
72865317 8151 "netdev",
f87c1357 8152 true, /* cleanup_required */
6553d06b 8153 dpif_netdev_init,
2197d7ab 8154 dpif_netdev_enumerate,
0aeaabc8 8155 dpif_netdev_port_open_type,
72865317
BP
8156 dpif_netdev_open,
8157 dpif_netdev_close,
7dab847a 8158 dpif_netdev_destroy,
e4cfed38
PS
8159 dpif_netdev_run,
8160 dpif_netdev_wait,
72865317 8161 dpif_netdev_get_stats,
dcdcad68 8162 NULL, /* set_features */
72865317
BP
8163 dpif_netdev_port_add,
8164 dpif_netdev_port_del,
3eb67853 8165 dpif_netdev_port_set_config,
72865317
BP
8166 dpif_netdev_port_query_by_number,
8167 dpif_netdev_port_query_by_name,
98403001 8168 NULL, /* port_get_pid */
b0ec0f27
BP
8169 dpif_netdev_port_dump_start,
8170 dpif_netdev_port_dump_next,
8171 dpif_netdev_port_dump_done,
72865317
BP
8172 dpif_netdev_port_poll,
8173 dpif_netdev_port_poll_wait,
72865317 8174 dpif_netdev_flow_flush,
ac64794a
BP
8175 dpif_netdev_flow_dump_create,
8176 dpif_netdev_flow_dump_destroy,
8177 dpif_netdev_flow_dump_thread_create,
8178 dpif_netdev_flow_dump_thread_destroy,
704a1e09 8179 dpif_netdev_flow_dump_next,
1a0c894a 8180 dpif_netdev_operate,
6b31e073
RW
8181 NULL, /* recv_set */
8182 NULL, /* handlers_set */
d4f6865c 8183 dpif_netdev_set_config,
5bf93d67 8184 dpif_netdev_queue_to_priority,
6b31e073
RW
8185 NULL, /* recv */
8186 NULL, /* recv_wait */
8187 NULL, /* recv_purge */
e4e74c3a 8188 dpif_netdev_register_dp_purge_cb,
6b31e073
RW
8189 dpif_netdev_register_upcall_cb,
8190 dpif_netdev_enable_upcall,
8191 dpif_netdev_disable_upcall,
b5cbbcf6 8192 dpif_netdev_get_datapath_version,
4d4e68ed
DDP
8193 dpif_netdev_ct_dump_start,
8194 dpif_netdev_ct_dump_next,
8195 dpif_netdev_ct_dump_done,
5d9cbb4c 8196 dpif_netdev_ct_flush,
c92339ad
DB
8197 dpif_netdev_ct_set_maxconns,
8198 dpif_netdev_ct_get_maxconns,
875075b3 8199 dpif_netdev_ct_get_nconns,
64207120
DB
8200 dpif_netdev_ct_set_tcp_seq_chk,
8201 dpif_netdev_ct_get_tcp_seq_chk,
a7f33fdb
DB
8202 dpif_netdev_ct_set_limits,
8203 dpif_netdev_ct_get_limits,
8204 dpif_netdev_ct_del_limits,
2078901a
WT
8205 dpif_netdev_ct_set_timeout_policy,
8206 dpif_netdev_ct_get_timeout_policy,
8207 dpif_netdev_ct_del_timeout_policy,
1f161318
YHW
8208 NULL, /* ct_timeout_policy_dump_start */
8209 NULL, /* ct_timeout_policy_dump_next */
8210 NULL, /* ct_timeout_policy_dump_done */
2078901a 8211 dpif_netdev_ct_get_timeout_policy_name,
4ea96698
DB
8212 dpif_netdev_ipf_set_enabled,
8213 dpif_netdev_ipf_set_min_frag,
8214 dpif_netdev_ipf_set_max_nfrags,
8215 dpif_netdev_ipf_get_status,
8216 dpif_netdev_ipf_dump_start,
8217 dpif_netdev_ipf_dump_next,
8218 dpif_netdev_ipf_dump_done,
5dddf960
JR
8219 dpif_netdev_meter_get_features,
8220 dpif_netdev_meter_set,
8221 dpif_netdev_meter_get,
8222 dpif_netdev_meter_del,
9df65060
VDA
8223 dpif_netdev_bond_add,
8224 dpif_netdev_bond_del,
8225 dpif_netdev_bond_stats_get,
72865317 8226};
614c4892 8227
74cc3969
BP
8228static void
8229dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8230 const char *argv[], void *aux OVS_UNUSED)
8231{
e9985d6a 8232 struct dp_netdev_port *port;
74cc3969 8233 struct dp_netdev *dp;
ff073a71 8234 odp_port_t port_no;
74cc3969 8235
8a4e3a85 8236 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
8237 dp = shash_find_data(&dp_netdevs, argv[1]);
8238 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 8239 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
8240 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8241 return;
8242 }
8a4e3a85
BP
8243 ovs_refcount_ref(&dp->ref_cnt);
8244 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 8245
59e6d833 8246 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 8247 if (get_port_by_name(dp, argv[2], &port)) {
74cc3969 8248 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 8249 goto exit;
74cc3969
BP
8250 }
8251
ff073a71
BP
8252 port_no = u32_to_odp(atoi(argv[3]));
8253 if (!port_no || port_no == ODPP_NONE) {
74cc3969 8254 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 8255 goto exit;
74cc3969 8256 }
ff073a71 8257 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 8258 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 8259 goto exit;
74cc3969 8260 }
59e6d833 8261
e9985d6a
DDP
8262 /* Remove port. */
8263 hmap_remove(&dp->ports, &port->node);
e32971b8 8264 reconfigure_datapath(dp);
59e6d833 8265
e9985d6a
DDP
8266 /* Reinsert with new port number. */
8267 port->port_no = port_no;
8268 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
e32971b8 8269 reconfigure_datapath(dp);
59e6d833 8270
d33ed218 8271 seq_change(dp->port_seq);
74cc3969 8272 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
8273
8274exit:
59e6d833 8275 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 8276 dp_netdev_unref(dp);
74cc3969
BP
8277}
8278
0cbfe35d
BP
8279static void
8280dpif_dummy_register__(const char *type)
8281{
8282 struct dpif_class *class;
8283
8284 class = xmalloc(sizeof *class);
8285 *class = dpif_netdev_class;
8286 class->type = xstrdup(type);
8287 dp_register_provider(class);
8288}
8289
8420c7ad
BP
8290static void
8291dpif_dummy_override(const char *type)
8292{
65d43fdc
YT
8293 int error;
8294
8295 /*
8296 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
8298 * a userland-only build. It's useful for the testsuite.
8298 */
8299 error = dp_unregister_provider(type);
8300 if (error == 0 || error == EAFNOSUPPORT) {
8420c7ad
BP
8301 dpif_dummy_register__(type);
8302 }
8303}
8304
614c4892 8305void
8420c7ad 8306dpif_dummy_register(enum dummy_level level)
614c4892 8307{
8420c7ad 8308 if (level == DUMMY_OVERRIDE_ALL) {
0cbfe35d
BP
8309 struct sset types;
8310 const char *type;
8311
8312 sset_init(&types);
8313 dp_enumerate_types(&types);
8314 SSET_FOR_EACH (type, &types) {
8420c7ad 8315 dpif_dummy_override(type);
0cbfe35d
BP
8316 }
8317 sset_destroy(&types);
8420c7ad
BP
8318 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
8319 dpif_dummy_override("system");
614c4892 8320 }
0cbfe35d
BP
8321
8322 dpif_dummy_register__("dummy");
74cc3969
BP
8323
8324 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 8325 "dp port new-number",
74cc3969 8326 3, 3, dpif_dummy_change_port_number, NULL);
614c4892 8327}
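/* A minimal usage sketch for the unixctl command registered above, assuming a
 * dummy datapath named "dp0" with a port "p1"; the names and the new port
 * number are illustrative only.  unixctl commands are issued to the running
 * ovs-vswitchd through ovs-appctl:
 *
 *     ovs-appctl dpif-dummy/change-port-number dp0 p1 5
 *
 * On success the port keeps its configuration but is re-inserted into the
 * datapath's port map under odp port number 5, and the datapath's port_seq is
 * bumped so pollers notice the change. */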
0de8783a
JR
8328\f
8329/* Datapath Classifier. */
8330
0fcf0776
ZF
8331static void
8332dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
8333{
8334 cmap_destroy(&subtable->rules);
a0b36b39 8335 ovsrcu_postpone(free, subtable->mf_masks);
0fcf0776
ZF
8336 ovsrcu_postpone(free, subtable);
8337}
8338
0de8783a
JR
8339/* Initializes 'cls' as a classifier that initially contains no classification
8340 * rules. */
8341static void
8342dpcls_init(struct dpcls *cls)
8343{
8344 cmap_init(&cls->subtables_map);
da9cfca6 8345 pvector_init(&cls->subtables);
0de8783a
JR
8346}
8347
8348static void
8349dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
8350{
3453b4d6 8351 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
da9cfca6 8352 pvector_remove(&cls->subtables, subtable);
0de8783a
JR
8353 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
8354 subtable->mask.hash);
0fcf0776 8355 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
0de8783a
JR
8356}
8357
8358/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
8359 * caller's responsibility.
8360 * May only be called after all the readers have been terminated. */
8361static void
8362dpcls_destroy(struct dpcls *cls)
8363{
8364 if (cls) {
8365 struct dpcls_subtable *subtable;
8366
8367 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
361d808d 8368 ovs_assert(cmap_count(&subtable->rules) == 0);
0de8783a
JR
8369 dpcls_destroy_subtable(cls, subtable);
8370 }
8371 cmap_destroy(&cls->subtables_map);
da9cfca6 8372 pvector_destroy(&cls->subtables);
0de8783a
JR
8373 }
8374}
8375
8376static struct dpcls_subtable *
8377dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8378{
8379 struct dpcls_subtable *subtable;
8380
8381 /* Need to add one. */
caeb4906
JR
8382 subtable = xmalloc(sizeof *subtable
8383 - sizeof subtable->mask.mf + mask->len);
0de8783a 8384 cmap_init(&subtable->rules);
3453b4d6 8385 subtable->hit_cnt = 0;
0de8783a 8386 netdev_flow_key_clone(&subtable->mask, mask);
aadede3d 8387
a0b36b39
HH
8388    /* The count of bits set in the mask defines the space required for the
8389     * masks.  netdev_flow_key_gen_masks() is then called to precompute them,
8390     * avoiding the cost of doing the calculations at runtime. */
8391 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
8392 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
8393 subtable->mf_bits_set_unit0 = unit0;
8394 subtable->mf_bits_set_unit1 = unit1;
8395 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
8396 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
8397
f54d8f00
HH
8398 /* Probe for a specialized generic lookup function. */
8399 subtable->lookup_func = dpcls_subtable_generic_probe(unit0, unit1);
8400
8401 /* If not set, assign generic lookup. Generic works for any miniflow. */
8402 if (!subtable->lookup_func) {
8403 subtable->lookup_func = dpcls_subtable_lookup_generic;
8404 }
aadede3d 8405
0de8783a 8406 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
3453b4d6 8407 /* Add the new subtable at the end of the pvector (with no hits yet) */
da9cfca6 8408 pvector_insert(&cls->subtables, subtable, 0);
84dbfb2b 8409 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
3453b4d6 8410 cmap_count(&cls->subtables_map), subtable, cls->in_port);
da9cfca6 8411 pvector_publish(&cls->subtables);
0de8783a
JR
8412
8413 return subtable;
8414}
8415
8416static inline struct dpcls_subtable *
8417dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8418{
8419 struct dpcls_subtable *subtable;
8420
8421 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
8422 &cls->subtables_map) {
8423 if (netdev_flow_key_equal(&subtable->mask, mask)) {
8424 return subtable;
8425 }
8426 }
8427 return dpcls_create_subtable(cls, mask);
8428}
8429
3453b4d6
JS
8430
8431/* Periodically sort the dpcls subtable vectors according to hit counts */
8432static void
8433dpcls_sort_subtable_vector(struct dpcls *cls)
8434{
8435 struct pvector *pvec = &cls->subtables;
8436 struct dpcls_subtable *subtable;
8437
8438 PVECTOR_FOR_EACH (subtable, pvec) {
8439 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8440 subtable->hit_cnt = 0;
8441 }
8442 pvector_publish(pvec);
8443}
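/* A minimal sketch of the idea behind the periodic re-sort above, using a
 * plain array and qsort() instead of the OVS pvector: subtables that matched
 * more packets in the last interval move to the front, so lookups probe the
 * hottest subtables first.  The "toy_" names are illustrative assumptions,
 * not OVS APIs. */
#include <stdint.h>
#include <stdlib.h>

struct toy_subtable {
    uint64_t hit_cnt;            /* Hits accumulated since the last sort. */
    /* Per-subtable match data would live here. */
};

static int
toy_subtable_cmp(const void *a_, const void *b_)
{
    const struct toy_subtable *a = a_;
    const struct toy_subtable *b = b_;

    /* Descending by hit count: hotter subtables sort first. */
    return (a->hit_cnt < b->hit_cnt) - (a->hit_cnt > b->hit_cnt);
}

static void
toy_sort_subtables(struct toy_subtable *tables, size_t n)
{
    qsort(tables, n, sizeof tables[0], toy_subtable_cmp);
    for (size_t i = 0; i < n; i++) {
        tables[i].hit_cnt = 0;   /* Start a new measuring interval. */
    }
}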
8444
8445static inline void
4809891b
KT
8446dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8447 struct polled_queue *poll_list, int poll_cnt)
3453b4d6
JS
8448{
8449 struct dpcls *cls;
5bf84282
NK
8450 uint64_t tot_idle = 0, tot_proc = 0;
8451 unsigned int pmd_load = 0;
3453b4d6 8452
b010be17 8453 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
2a2c67b4 8454 uint64_t curr_tsc;
5bf84282
NK
8455 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8456 if (pmd_alb->is_enabled && !pmd->isolated
8457 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8458 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8459 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8460 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8461 {
8462 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8463 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8464 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8465 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8466
8467 if (tot_proc) {
8468 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8469 }
8470
8471 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8472 atomic_count_inc(&pmd->pmd_overloaded);
8473 } else {
8474 atomic_count_set(&pmd->pmd_overloaded, 0);
8475 }
8476 }
8477
8478 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8479 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8480 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8481 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8482
4809891b
KT
8483        /* Get the cycles that were used to process each queue and store them. */
8484 for (unsigned i = 0; i < poll_cnt; i++) {
8485 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8486 RXQ_CYCLES_PROC_CURR);
8487 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8488 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8489 0);
8490 }
2a2c67b4
KT
8491 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8492 if (pmd->intrvl_tsc_prev) {
8493 /* There is a prev timestamp, store a new intrvl cycle count. */
8494 atomic_store_relaxed(&pmd->intrvl_cycles,
8495 curr_tsc - pmd->intrvl_tsc_prev);
8496 }
8497 pmd->intrvl_tsc_prev = curr_tsc;
4809891b 8498 /* Start new measuring interval */
b010be17 8499 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
4809891b
KT
8500 }
8501
b010be17 8502 if (pmd->ctx.now > pmd->next_optimization) {
3453b4d6
JS
8503 /* Try to obtain the flow lock to block out revalidator threads.
8504 * If not possible, just try next time. */
8505 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8506 /* Optimize each classifier */
8507 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8508 dpcls_sort_subtable_vector(cls);
8509 }
8510 ovs_mutex_unlock(&pmd->flow_mutex);
8511 /* Start new measuring interval */
b010be17
IM
8512 pmd->next_optimization = pmd->ctx.now
8513 + DPCLS_OPTIMIZATION_INTERVAL;
3453b4d6
JS
8514 }
8515 }
8516}
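/* A minimal sketch of the load calculation that drives the PMD auto load
 * balancer above: given the busy and idle cycle deltas measured over one
 * interval, the PMD load is the busy share expressed as a percentage, and the
 * interval counts as overloaded when that share reaches the threshold
 * (ALB_PMD_LOAD_THRESHOLD in the code above).  Function names here are
 * illustrative only. */
#include <stdbool.h>
#include <stdint.h>

static unsigned int
toy_pmd_load_percent(uint64_t busy_cycles, uint64_t idle_cycles)
{
    if (!busy_cycles) {
        return 0;
    }
    return (unsigned int) ((busy_cycles * 100) / (busy_cycles + idle_cycles));
}

static bool
toy_pmd_interval_overloaded(uint64_t busy_cycles, uint64_t idle_cycles,
                            unsigned int threshold_percent)
{
    return toy_pmd_load_percent(busy_cycles, idle_cycles) >= threshold_percent;
}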
8517
0de8783a
JR
8518/* Insert 'rule' into 'cls'. */
8519static void
8520dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8521 const struct netdev_flow_key *mask)
8522{
8523 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8524
3453b4d6 8525 /* Refer to subtable's mask, also for later removal. */
0de8783a
JR
8526 rule->mask = &subtable->mask;
8527 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8528}
8529
8530/* Removes 'rule' from 'cls', also destructing the 'rule'. */
8531static void
8532dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8533{
8534 struct dpcls_subtable *subtable;
8535
8536 ovs_assert(rule->mask);
8537
3453b4d6 8538 /* Get subtable from reference in rule->mask. */
0de8783a 8539 INIT_CONTAINER(subtable, rule->mask, mask);
0de8783a
JR
8540 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8541 == 0) {
3453b4d6 8542 /* Delete empty subtable. */
0de8783a 8543 dpcls_destroy_subtable(cls, subtable);
da9cfca6 8544 pvector_publish(&cls->subtables);
0de8783a
JR
8545 }
8546}
8547
a0b36b39
HH
8548/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
8549static inline void
8550netdev_flow_key_gen_mask_unit(uint64_t iter,
8551 const uint64_t count,
8552 uint64_t *mf_masks)
8553{
8554 int i;
8555 for (i = 0; i < count; i++) {
8556 uint64_t lowest_bit = (iter & -iter);
8557 iter &= ~lowest_bit;
8558 mf_masks[i] = (lowest_bit - 1);
8559 }
8560 /* Checks that count has covered all bits in the iter bitmap. */
8561 ovs_assert(iter == 0);
8562}
8563
8564/* Generate a mask for each block in the miniflow, based on the bits set. This
8565 * allows packets to be masked with the precomputed array at lookup time,
8566 * instead of recalculating the masks at runtime.
8567 * @param tbl The flow key whose miniflow map the mf_masks are generated for
8568 * @param mf_masks Pointer to a u64 array of at least mf_bits_u0 + mf_bits_u1 elements
8569 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
8570 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow
8571 */
8572void
8573netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8574 uint64_t *mf_masks,
8575 const uint32_t mf_bits_u0,
8576 const uint32_t mf_bits_u1)
8577{
8578 uint64_t iter_u0 = tbl->mf.map.bits[0];
8579 uint64_t iter_u1 = tbl->mf.map.bits[1];
8580
8581 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8582 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8583}
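/* A minimal worked example of the mask generation above, on a toy bitmap
 * rather than a real miniflow map: for a unit whose bitmap is 0x14
 * (0b10100, two bits set), the generated masks are 0x3 and 0xF, i.e. each
 * mask covers every bit position below the corresponding set bit.  At lookup
 * time, popcount(packet_map & mask) then yields how many packed blocks
 * precede that field, which is the index the lookup code needs into the
 * miniflow value array. */
#include <assert.h>
#include <stdint.h>

static void
toy_gen_mask_unit(uint64_t iter, uint64_t count, uint64_t *masks)
{
    for (uint64_t i = 0; i < count; i++) {
        uint64_t lowest_bit = iter & -iter;   /* Isolate the lowest set bit. */

        iter &= ~lowest_bit;                  /* Clear it for the next round. */
        masks[i] = lowest_bit - 1;            /* All bits below it. */
    }
    assert(!iter);                            /* 'count' covered every set bit. */
}

/* toy_gen_mask_unit(0x14, 2, masks) yields masks[0] == 0x3, masks[1] == 0xF. */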
8584
361d808d
JR
8585/* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in the
8586 * rule's mask the values in the rule's key and in 'target' are the same. */
f5ace7cd 8587bool
0de8783a
JR
8588dpcls_rule_matches_key(const struct dpcls_rule *rule,
8589 const struct netdev_flow_key *target)
8590{
09b0fa9c
JR
8591 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8592 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5fcff47b 8593 uint64_t value;
0de8783a 8594
5fcff47b
JR
8595 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8596 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
0de8783a
JR
8597 return false;
8598 }
8599 }
8600 return true;
8601}
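/* A minimal sketch of the masked comparison above over plain arrays, under
 * the same convention the loop above relies on: the rule's key values are
 * stored pre-masked, so 'target' satisfies the rule iff, for every block, the
 * target bits selected by the mask equal the rule's key bits. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool
toy_masked_match(const uint64_t *key, const uint64_t *mask,
                 const uint64_t *target, size_t n_blocks)
{
    for (size_t i = 0; i < n_blocks; i++) {
        if ((target[i] & mask[i]) != key[i]) {
            return false;        /* A masked bit differs from the key. */
        }
    }
    return true;                 /* Every masked block matched. */
}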
8602
5b1c9c78
FA
8603/* For each miniflow in 'keys', performs a classifier lookup, writing the result
8604 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
0de8783a
JR
8605 * NULL it is skipped.
8606 *
8607 * This function is optimized for use in the userspace datapath and therefore
8608 * does not implement a lot of features available in the standard
8609 * classifier_lookup() function. Specifically, it does not implement
8610 * priorities, instead returning any rule which matches the flow.
8611 *
5b1c9c78 8612 * Returns true if all miniflows found a corresponding rule. */
0de8783a 8613static bool
60d8ccae 8614dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
3453b4d6
JS
8615 struct dpcls_rule **rules, const size_t cnt,
8616 int *num_lookups_p)
0de8783a 8617{
5b1c9c78 8618 /* The received 'cnt' miniflows are the search-keys that will be processed
63906f18
BB
8619 * to find a matching entry in the available subtables.
8620 * The keys_map bitmap must provide at least NETDEV_MAX_BURST bits. */
aadede3d 8621#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
63906f18 8622 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
0de8783a 8623
0de8783a 8624 struct dpcls_subtable *subtable;
aadede3d 8625 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
63906f18
BB
8626
8627 if (cnt != MAP_BITS) {
8628 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
0de8783a
JR
8629 }
8630 memset(rules, 0, cnt * sizeof *rules);
8631
3453b4d6 8632 int lookups_match = 0, subtable_pos = 1;
aadede3d 8633 uint32_t found_map;
3453b4d6 8634
5b1c9c78
FA
8635 /* The Datapath classifier - aka dpcls - is composed of subtables.
8636 * Subtables are dynamically created as needed when new rules are inserted.
8637 * Each subtable collects rules with matches on a specific subset of packet
8638 * fields as defined by the subtable's mask. We proceed to process every
8639 * search-key against each subtable, but when a match is found for a
8640 * search-key, the search for that key can stop because the rules are
8641 * non-overlapping. */
da9cfca6 8642 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
aadede3d
HH
8643 /* Call the subtable specific lookup function. */
8644 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
63906f18 8645
aadede3d
HH
8646 /* Count the number of subtables searched for this packet match. This
8647 * estimates the "spread" of subtables looked at per matched packet. */
8648 uint32_t pkts_matched = count_1bits(found_map);
8649 lookups_match += pkts_matched * subtable_pos;
63906f18 8650
aadede3d
HH
8651 /* Clear the found rules, and return early if all packets are found. */
8652 keys_map &= ~found_map;
63906f18 8653 if (!keys_map) {
3453b4d6
JS
8654 if (num_lookups_p) {
8655 *num_lookups_p = lookups_match;
8656 }
aadede3d 8657 return true;
0de8783a 8658 }
3453b4d6
JS
8659 subtable_pos++;
8660 }
aadede3d 8661
3453b4d6
JS
8662 if (num_lookups_p) {
8663 *num_lookups_p = lookups_match;
0de8783a 8664 }
aadede3d 8665 return false;
0de8783a 8666}
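/* A minimal sketch of the batch bitmap bookkeeping used above, with fake
 * per-subtable results standing in for real subtable lookups: one bit per
 * packet in the batch starts set in keys_map, each subtable reports a
 * found_map of the packets it matched, those bits are cleared, and the search
 * ends early once every packet has a rule.  'cnt' must be in
 * [1, TOY_MAP_BITS]; the names and fake results are illustrative only. */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>

#define TOY_MAP_BITS (sizeof(uint32_t) * CHAR_BIT)

static bool
toy_batch_lookup(const uint32_t *per_subtable_found, int n_subtables,
                 uint32_t cnt)
{
    uint32_t keys_map = UINT32_MAX;      /* One bit per packet, all set. */

    if (cnt != TOY_MAP_BITS) {
        keys_map >>= TOY_MAP_BITS - cnt; /* Clear bits beyond the batch. */
    }

    for (int i = 0; i < n_subtables; i++) {
        /* Only packets still unmatched are offered to this subtable. */
        uint32_t found_map = per_subtable_found[i] & keys_map;

        keys_map &= ~found_map;          /* Drop packets that found a rule. */
        if (!keys_map) {
            return true;                 /* Every packet matched. */
        }
    }
    return false;                        /* Some packets missed in all subtables. */
}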