]> git.proxmox.com Git - mirror_ovs.git/blame - lib/dpif-netdev.c
dpif-netdev: Modified ovs-appctl dpctl/dump-flows command
[mirror_ovs.git] / lib / dpif-netdev.c
CommitLineData
72865317 1/*
4ea96698 2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
72865317
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
db73f716 18#include "dpif-netdev.h"
f5ace7cd 19#include "dpif-netdev-private.h"
72865317 20
72865317
BP
21#include <ctype.h>
22#include <errno.h>
23#include <fcntl.h>
24#include <inttypes.h>
7f3adc00 25#include <net/if.h>
b2befd5b 26#include <sys/types.h>
7daedce4 27#include <netinet/in.h>
cdee00fd 28#include <stdint.h>
72865317
BP
29#include <stdlib.h>
30#include <string.h>
31#include <sys/ioctl.h>
7daedce4 32#include <sys/socket.h>
72865317 33#include <sys/stat.h>
72865317
BP
34#include <unistd.h>
35
9f861c91 36#include "bitmap.h"
59e6d833 37#include "cmap.h"
5cf3edb3 38#include "conntrack.h"
7daedce4 39#include "coverage.h"
4d4e68ed 40#include "ct-dpif.h"
72865317 41#include "csum.h"
e14deea0 42#include "dp-packet.h"
614c4892 43#include "dpif.h"
82a48ead 44#include "dpif-netdev-perf.h"
72865317 45#include "dpif-provider.h"
614c4892 46#include "dummy.h"
afae68b1 47#include "fat-rwlock.h"
72865317 48#include "flow.h"
762d146a 49#include "hmapx.h"
140dd699 50#include "id-pool.h"
4ea96698 51#include "ipf.h"
72865317 52#include "netdev.h"
b6cabb8f 53#include "netdev-offload.h"
79f36875 54#include "netdev-provider.h"
de281153 55#include "netdev-vport.h"
cdee00fd 56#include "netlink.h"
f094af7b 57#include "odp-execute.h"
72865317 58#include "odp-util.h"
25d436fb
BW
59#include "openvswitch/dynamic-string.h"
60#include "openvswitch/list.h"
61#include "openvswitch/match.h"
0d71302e 62#include "openvswitch/ofp-parse.h"
25d436fb 63#include "openvswitch/ofp-print.h"
64c96779 64#include "openvswitch/ofpbuf.h"
3eb67853 65#include "openvswitch/shash.h"
25d436fb 66#include "openvswitch/vlog.h"
5a034064 67#include "ovs-numa.h"
61e7deb1 68#include "ovs-rcu.h"
72865317 69#include "packets.h"
fd016ae3 70#include "openvswitch/poll-loop.h"
0de8783a 71#include "pvector.h"
26c6b6cd 72#include "random.h"
d33ed218 73#include "seq.h"
3eb67853 74#include "smap.h"
0cbfe35d 75#include "sset.h"
72865317 76#include "timeval.h"
53902038 77#include "tnl-neigh-cache.h"
7f9b8504 78#include "tnl-ports.h"
74cc3969 79#include "unixctl.h"
72865317 80#include "util.h"
241bad15 81#include "uuid.h"
7daedce4 82
d98e6007 83VLOG_DEFINE_THIS_MODULE(dpif_netdev);
72865317 84
5bf84282
NK
85/* Auto Load Balancing Defaults */
86#define ALB_ACCEPTABLE_IMPROVEMENT 25
87#define ALB_PMD_LOAD_THRESHOLD 95
88#define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
89#define MIN_TO_MSEC 60000
90
8bb113da 91#define FLOW_DUMP_MAX_BATCH 50
adcf00ba 92/* Use per thread recirc_depth to prevent recirculation loop. */
3f9d3836 93#define MAX_RECIRC_DEPTH 6
adcf00ba 94DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
e4cfed38 95
c71ea3c4
IM
96/* Use instant packet send by default. */
97#define DEFAULT_TX_FLUSH_INTERVAL 0
98
72865317 99/* Configuration parameters. */
72865317 100enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
4b27db64
JR
101enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
102enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
103enum { N_METER_LOCKS = 64 }; /* Maximum number of meters. */
72865317 104
a13a0209
AT
105COVERAGE_DEFINE(datapath_drop_meter);
106COVERAGE_DEFINE(datapath_drop_upcall_error);
107COVERAGE_DEFINE(datapath_drop_lock_error);
108COVERAGE_DEFINE(datapath_drop_userspace_action_error);
109COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
110COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
111COVERAGE_DEFINE(datapath_drop_recirc_error);
112COVERAGE_DEFINE(datapath_drop_invalid_port);
113COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
114COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
115
8a4e3a85
BP
116/* Protects against changes to 'dp_netdevs'. */
117static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
118
119/* Contains all 'struct dp_netdev's. */
120static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
121 = SHASH_INITIALIZER(&dp_netdevs);
122
623540e4 123static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
6b31e073 124
5cf3edb3 125#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
4cddb1f0
DB
126 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
127 | CS_SRC_NAT | CS_DST_NAT)
5cf3edb3
DDP
128#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
129
2494ccd7 130static struct odp_support dp_netdev_support = {
f0fb825a 131 .max_vlan_headers = SIZE_MAX,
2494ccd7
JS
132 .max_mpls_depth = SIZE_MAX,
133 .recirc = true,
5cf3edb3
DDP
134 .ct_state = true,
135 .ct_zone = true,
136 .ct_mark = true,
137 .ct_label = true,
2575df07
JP
138 .ct_state_nat = true,
139 .ct_orig_tuple = true,
140 .ct_orig_tuple6 = true,
2494ccd7
JS
141};
142
60d8ccae
YW
143/* EMC cache and SMC cache compose the datapath flow cache (DFC)
144 *
145 * Exact match cache for frequently used flows
9bbf1c3d
DDP
146 *
147 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
148 * search its entries for a miniflow that matches exactly the miniflow of the
0de8783a 149 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
9bbf1c3d
DDP
150 *
151 * A cache entry holds a reference to its 'dp_netdev_flow'.
152 *
153 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
154 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
155 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
156 * value is the index of a cache entry where the miniflow could be.
157 *
158 *
60d8ccae
YW
159 * Signature match cache (SMC)
160 *
161 * This cache stores a 16-bit signature for each flow without storing keys, and
162 * stores the corresponding 16-bit flow_table index to the 'dp_netdev_flow'.
163 * Each flow thus occupies 32bit which is much more memory efficient than EMC.
164 * SMC uses a set-associative design that each bucket contains
165 * SMC_ENTRY_PER_BUCKET number of entries.
166 * Since 16-bit flow_table index is used, if there are more than 2^16
167 * dp_netdev_flow, SMC will miss them that cannot be indexed by a 16-bit value.
168 *
169 *
9bbf1c3d
DDP
170 * Thread-safety
171 * =============
172 *
173 * Each pmd_thread has its own private exact match cache.
174 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
175 */
176
fc82e877 177#define EM_FLOW_HASH_SHIFT 13
9bbf1c3d
DDP
178#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
179#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
180#define EM_FLOW_HASH_SEGS 2
181
60d8ccae
YW
182/* SMC uses a set-associative design. A bucket contains a set of entries that
183 * a flow item can occupy. For now, it uses one hash function rather than two
184 * as for the EMC design. */
185#define SMC_ENTRY_PER_BUCKET 4
186#define SMC_ENTRIES (1u << 20)
187#define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
188#define SMC_MASK (SMC_BUCKET_CNT - 1)
189
4c30b246
CL
190/* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
191#define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
192#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
193 DEFAULT_EM_FLOW_INSERT_INV_PROB)
194
9bbf1c3d 195struct emc_entry {
9bbf1c3d 196 struct dp_netdev_flow *flow;
0de8783a 197 struct netdev_flow_key key; /* key.hash used for emc hash value. */
9bbf1c3d
DDP
198};
199
200struct emc_cache {
201 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
67ad54cb 202 int sweep_idx; /* For emc_cache_slow_sweep(). */
9bbf1c3d
DDP
203};
204
60d8ccae
YW
205struct smc_bucket {
206 uint16_t sig[SMC_ENTRY_PER_BUCKET];
207 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
208};
209
210/* Signature match cache, differentiate from EMC cache */
211struct smc_cache {
212 struct smc_bucket buckets[SMC_BUCKET_CNT];
213};
214
215struct dfc_cache {
216 struct emc_cache emc_cache;
217 struct smc_cache smc_cache;
218};
219
9bbf1c3d
DDP
220/* Iterate in the exact match cache through every entry that might contain a
221 * miniflow with hash 'HASH'. */
222#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
223 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
224 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
225 i__ < EM_FLOW_HASH_SEGS; \
226 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
0de8783a
JR
227\f
228/* Simple non-wildcarding single-priority classifier. */
229
05f9e707
IM
230/* Time in microseconds between successive optimizations of the dpcls
231 * subtable vector */
232#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
3453b4d6 233
05f9e707
IM
234/* Time in microseconds of the interval in which rxq processing cycles used
235 * in rxq to pmd assignments is measured and stored. */
236#define PMD_RXQ_INTERVAL_LEN 10000000LL
4809891b 237
c59e759f
KT
238/* Number of intervals for which cycles are stored
239 * and used during rxq to pmd assignment. */
240#define PMD_RXQ_INTERVAL_MAX 6
241
0de8783a 242struct dpcls {
3453b4d6
JS
243 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
244 odp_port_t in_port;
0de8783a 245 struct cmap subtables_map;
da9cfca6 246 struct pvector subtables;
0de8783a 247};
9bbf1c3d 248
9b4f08cd
VDA
249/* Data structure to keep packet order till fastpath processing. */
250struct dp_packet_flow_map {
251 struct dp_packet *packet;
252 struct dp_netdev_flow *flow;
253 uint16_t tcp_flags;
254};
255
0de8783a
JR
256static void dpcls_init(struct dpcls *);
257static void dpcls_destroy(struct dpcls *);
3453b4d6 258static void dpcls_sort_subtable_vector(struct dpcls *);
0de8783a
JR
259static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
260 const struct netdev_flow_key *mask);
261static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
3453b4d6 262static bool dpcls_lookup(struct dpcls *cls,
60d8ccae 263 const struct netdev_flow_key *keys[],
3453b4d6
JS
264 struct dpcls_rule **rules, size_t cnt,
265 int *num_lookups_p);
92c7c870 266
4b27db64
JR
267/* Set of supported meter flags */
268#define DP_SUPPORTED_METER_FLAGS_MASK \
269 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
270
271/* Set of supported meter band types */
272#define DP_SUPPORTED_METER_BAND_TYPES \
273 ( 1 << OFPMBT13_DROP )
274
275struct dp_meter_band {
276 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
277 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
278 uint64_t packet_count;
279 uint64_t byte_count;
280};
281
282struct dp_meter {
283 uint16_t flags;
284 uint16_t n_bands;
285 uint32_t max_delta_t;
286 uint64_t used;
287 uint64_t packet_count;
288 uint64_t byte_count;
289 struct dp_meter_band bands[];
290};
291
5bf84282
NK
292struct pmd_auto_lb {
293 bool auto_lb_requested; /* Auto load balancing requested by user. */
294 bool is_enabled; /* Current status of Auto load balancing. */
295 uint64_t rebalance_intvl;
296 uint64_t rebalance_poll_timer;
297};
298
8a4e3a85
BP
299/* Datapath based on the network device interface from netdev.h.
300 *
301 *
302 * Thread-safety
303 * =============
304 *
305 * Some members, marked 'const', are immutable. Accessing other members
306 * requires synchronization, as noted in more detail below.
307 *
308 * Acquisition order is, from outermost to innermost:
309 *
310 * dp_netdev_mutex (global)
59e6d833 311 * port_mutex
d0cca6c3 312 * non_pmd_mutex
8a4e3a85 313 */
72865317 314struct dp_netdev {
8a4e3a85
BP
315 const struct dpif_class *const class;
316 const char *const name;
6a8267c5
BP
317 struct ovs_refcount ref_cnt;
318 atomic_flag destroyed;
72865317 319
8a4e3a85
BP
320 /* Ports.
321 *
e9985d6a
DDP
322 * Any lookup into 'ports' or any access to the dp_netdev_ports found
323 * through 'ports' requires taking 'port_mutex'. */
59e6d833 324 struct ovs_mutex port_mutex;
e9985d6a 325 struct hmap ports;
d33ed218 326 struct seq *port_seq; /* Incremented whenever a port changes. */
6c3eee82 327
c71ea3c4
IM
328 /* The time that a packet can wait in output batch for sending. */
329 atomic_uint32_t tx_flush_interval;
330
4b27db64
JR
331 /* Meters. */
332 struct ovs_mutex meter_locks[N_METER_LOCKS];
333 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
4b27db64 334
65dcf3da
BB
335 /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
336 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
79f36875
JS
337 /* Enable collection of PMD performance metrics. */
338 atomic_bool pmd_perf_metrics;
60d8ccae
YW
339 /* Enable the SMC cache from ovsdb config */
340 atomic_bool smc_enable_db;
65dcf3da 341
6b31e073
RW
342 /* Protects access to ofproto-dpif-upcall interface during revalidator
343 * thread synchronization. */
344 struct fat_rwlock upcall_rwlock;
623540e4
EJ
345 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
346 void *upcall_aux;
6b31e073 347
e4e74c3a
AW
348 /* Callback function for notifying the purging of dp flows (during
349 * reseting pmd deletion). */
350 dp_purge_callback *dp_purge_cb;
351 void *dp_purge_aux;
352
65f13b50
AW
353 /* Stores all 'struct dp_netdev_pmd_thread's. */
354 struct cmap poll_threads;
140dd699
IM
355 /* id pool for per thread static_tx_qid. */
356 struct id_pool *tx_qid_pool;
357 struct ovs_mutex tx_qid_pool_mutex;
e77c97b9
KT
358 /* Use measured cycles for rxq to pmd assignment. */
359 bool pmd_rxq_assign_cyc;
65f13b50
AW
360
361 /* Protects the access of the 'struct dp_netdev_pmd_thread'
362 * instance for non-pmd thread. */
363 struct ovs_mutex non_pmd_mutex;
364
365 /* Each pmd thread will store its pointer to
366 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
367 ovsthread_key_t per_pmd_key;
f2eee189 368
a6a426d6
IM
369 struct seq *reconfigure_seq;
370 uint64_t last_reconfigure_seq;
371
a14b8947 372 /* Cpu mask for pin of pmd threads. */
f2eee189 373 char *pmd_cmask;
6e3c6fa4 374
a36de779 375 uint64_t last_tnl_conf_seq;
5cf3edb3 376
57593fd2 377 struct conntrack *conntrack;
5bf84282 378 struct pmd_auto_lb pmd_alb;
72865317
BP
379};
380
4b27db64
JR
381static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
382 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
383{
384 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
385}
386
387static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
388 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
389{
390 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
391}
392
393
8a4e3a85 394static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
e9985d6a
DDP
395 odp_port_t)
396 OVS_REQUIRES(dp->port_mutex);
ff073a71 397
c59e759f
KT
398enum rxq_cycles_counter_type {
399 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
400 processing packets during the current
401 interval. */
402 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
403 during rxq to pmd assignment. */
404 RXQ_N_CYCLES
405};
406
02bb2824
YL
407enum {
408 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
409 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
410 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
411};
412
413struct dp_flow_offload_item {
414 struct dp_netdev_pmd_thread *pmd;
415 struct dp_netdev_flow *flow;
416 int op;
417 struct match match;
418 struct nlattr *actions;
419 size_t actions_len;
420
421 struct ovs_list node;
422};
423
424struct dp_flow_offload {
425 struct ovs_mutex mutex;
426 struct ovs_list list;
427 pthread_cond_t cond;
428};
429
430static struct dp_flow_offload dp_flow_offload = {
431 .mutex = OVS_MUTEX_INITIALIZER,
432 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
433};
434
435static struct ovsthread_once offload_thread_once
436 = OVSTHREAD_ONCE_INITIALIZER;
437
05f9e707 438#define XPS_TIMEOUT 500000LL /* In microseconds. */
324c8374 439
3eb67853
IM
440/* Contained by struct dp_netdev_port's 'rxqs' member. */
441struct dp_netdev_rxq {
947dc567
DDP
442 struct dp_netdev_port *port;
443 struct netdev_rxq *rx;
444 unsigned core_id; /* Core to which this queue should be
445 pinned. OVS_CORE_UNSPEC if the
446 queue doesn't need to be pinned to a
447 particular core. */
ee42dd70 448 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
47a45d86 449 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
79f36875 450 bool is_vhost; /* Is rxq of a vhost port. */
c59e759f
KT
451
452 /* Counters of cycles spent successfully polling and processing pkts. */
453 atomic_ullong cycles[RXQ_N_CYCLES];
454 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
455 sum them to yield the cycles used for an rxq. */
456 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
3eb67853
IM
457};
458
72865317
BP
459/* A port in a netdev-based datapath. */
460struct dp_netdev_port {
35303d71 461 odp_port_t port_no;
ca62bb16
BB
462 bool dynamic_txqs; /* If true XPS will be used. */
463 bool need_reconfigure; /* True if we should reconfigure netdev. */
72865317 464 struct netdev *netdev;
e9985d6a 465 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
4b609110 466 struct netdev_saved_flags *sf;
3eb67853 467 struct dp_netdev_rxq *rxqs;
85a4f238 468 unsigned n_rxq; /* Number of elements in 'rxqs' */
47a45d86 469 unsigned *txq_used; /* Number of threads that use each tx queue. */
324c8374 470 struct ovs_mutex txq_used_mutex;
2fbadeb6 471 bool emc_enabled; /* If true EMC will be used. */
0cbfe35d 472 char *type; /* Port type as requested by user. */
3eb67853 473 char *rxq_affinity_list; /* Requested affinity of rx queues. */
72865317
BP
474};
475
1c1e46ed
AW
476/* Contained by struct dp_netdev_flow's 'stats' member. */
477struct dp_netdev_flow_stats {
eb94da30
DDP
478 atomic_llong used; /* Last used time, in monotonic msecs. */
479 atomic_ullong packet_count; /* Number of packets matched. */
480 atomic_ullong byte_count; /* Number of bytes matched. */
481 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
1c1e46ed
AW
482};
483
484/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
8a4e3a85
BP
485 *
486 *
487 * Thread-safety
488 * =============
489 *
490 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
1c1e46ed 491 * its pmd thread's classifier. The text below calls this classifier 'cls'.
8a4e3a85
BP
492 *
493 * Motivation
494 * ----------
495 *
496 * The thread safety rules described here for "struct dp_netdev_flow" are
497 * motivated by two goals:
498 *
499 * - Prevent threads that read members of "struct dp_netdev_flow" from
500 * reading bad data due to changes by some thread concurrently modifying
501 * those members.
502 *
503 * - Prevent two threads making changes to members of a given "struct
504 * dp_netdev_flow" from interfering with each other.
505 *
506 *
507 * Rules
508 * -----
509 *
ed79f89a
DDP
510 * A flow 'flow' may be accessed without a risk of being freed during an RCU
511 * grace period. Code that needs to hold onto a flow for a while
512 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
8a4e3a85
BP
513 *
514 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
ed79f89a
DDP
515 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
516 * from modification.
8a4e3a85
BP
517 *
518 * Some members, marked 'const', are immutable. Accessing other members
519 * requires synchronization, as noted in more detail below.
520 */
72865317 521struct dp_netdev_flow {
11e5cf1f 522 const struct flow flow; /* Unmasked flow that created this entry. */
8a4e3a85 523 /* Hash table index by unmasked flow. */
1c1e46ed
AW
524 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
525 /* 'flow_table'. */
241bad15 526 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
70e5ed6f 527 const ovs_u128 ufid; /* Unique flow identifier. */
241bad15 528 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
bd5131ba 529 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
1c1e46ed 530 /* flow. */
72865317 531
ed79f89a
DDP
532 /* Number of references.
533 * The classifier owns one reference.
534 * Any thread trying to keep a rule from being freed should hold its own
535 * reference. */
536 struct ovs_refcount ref_cnt;
537
11e5cf1f 538 bool dead;
241bad15 539 uint32_t mark; /* Unique flow mark assigned to a flow */
11e5cf1f 540
1c1e46ed
AW
541 /* Statistics. */
542 struct dp_netdev_flow_stats stats;
8a4e3a85 543
45c626a3 544 /* Actions. */
61e7deb1 545 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
0de8783a 546
11e5cf1f
DDP
547 /* While processing a group of input packets, the datapath uses the next
548 * member to store a pointer to the output batch for the flow. It is
549 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
f7ce4811
PS
550 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
551 struct packet_batch_per_flow *batch;
11e5cf1f 552
0de8783a
JR
553 /* Packet classification. */
554 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
555 /* 'cr' must be the last member. */
72865317
BP
556};
557
ed79f89a 558static void dp_netdev_flow_unref(struct dp_netdev_flow *);
9bbf1c3d 559static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
70e5ed6f 560static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
f0fb825a 561 struct flow *, bool);
8a4e3a85 562
a84cb64a
BP
563/* A set of datapath actions within a "struct dp_netdev_flow".
564 *
565 *
566 * Thread-safety
567 * =============
568 *
45c626a3 569 * A struct dp_netdev_actions 'actions' is protected with RCU. */
a84cb64a 570struct dp_netdev_actions {
a84cb64a
BP
571 /* These members are immutable: they do not change during the struct's
572 * lifetime. */
a84cb64a 573 unsigned int size; /* Size of 'actions', in bytes. */
9ff55ae2 574 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
a84cb64a
BP
575};
576
577struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
578 size_t);
61e7deb1
BP
579struct dp_netdev_actions *dp_netdev_flow_get_actions(
580 const struct dp_netdev_flow *);
581static void dp_netdev_actions_free(struct dp_netdev_actions *);
a84cb64a 582
947dc567 583struct polled_queue {
922b28d4 584 struct dp_netdev_rxq *rxq;
947dc567 585 odp_port_t port_no;
2fbadeb6 586 bool emc_enabled;
35c91567
DM
587 bool rxq_enabled;
588 uint64_t change_seq;
947dc567
DDP
589};
590
ae7ad0a1
IM
591/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
592struct rxq_poll {
947dc567
DDP
593 struct dp_netdev_rxq *rxq;
594 struct hmap_node node;
ae7ad0a1
IM
595};
596
57eebbb4
DDP
597/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
598 * 'tnl_port_cache' or 'tx_ports'. */
d0cca6c3 599struct tx_port {
324c8374
IM
600 struct dp_netdev_port *port;
601 int qid;
602 long long last_used;
d0cca6c3 603 struct hmap_node node;
c71ea3c4 604 long long flush_time;
009e0033 605 struct dp_packet_batch output_pkts;
58ed6df0 606 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
d0cca6c3
DDP
607};
608
b010be17
IM
609/* A set of properties for the current processing loop that is not directly
610 * associated with the pmd thread itself, but with the packets being
611 * processed or the short-term system configuration (for example, time).
612 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
613struct dp_netdev_pmd_thread_ctx {
614 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
615 long long now;
58ed6df0
IM
616 /* RX queue from which last packet was received. */
617 struct dp_netdev_rxq *last_rxq;
2fbadeb6
IM
618 /* EMC insertion probability context for the current processing cycle. */
619 uint32_t emc_insert_min;
d0cca6c3
DDP
620};
621
e4cfed38
PS
622/* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
623 * the performance overhead of interrupt processing. Therefore netdev can
624 * not implement rx-wait for these devices. dpif-netdev needs to poll
625 * these device to check for recv buffer. pmd-thread does polling for
1c1e46ed 626 * devices assigned to itself.
e4cfed38
PS
627 *
628 * DPDK used PMD for accessing NIC.
629 *
65f13b50
AW
630 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
631 * I/O of all non-pmd threads. There will be no actual thread created
632 * for the instance.
1c1e46ed 633 *
1859876c
BB
634 * Each struct has its own flow cache and classifier per managed ingress port.
635 * For packets received on ingress port, a look up is done on corresponding PMD
636 * thread's flow cache and in case of a miss, lookup is performed in the
637 * corresponding classifier of port. Packets are executed with the found
638 * actions in either case.
1c1e46ed 639 * */
65f13b50 640struct dp_netdev_pmd_thread {
d9d73f84
IM
641 struct dp_netdev *dp;
642 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
643 struct cmap_node node; /* In 'dp->poll_threads'. */
644
65f13b50
AW
645 /* Per thread exact-match cache. Note, the instance for cpu core
646 * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
d0cca6c3
DDP
647 * need to be protected by 'non_pmd_mutex'. Every other instance
648 * will only be accessed by its own pmd thread. */
60d8ccae 649 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
1c1e46ed 650
3453b4d6 651 /* Flow-Table and classifiers
1c1e46ed
AW
652 *
653 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
3453b4d6
JS
654 * changes to 'classifiers' must be made while still holding the
655 * 'flow_mutex'.
1c1e46ed
AW
656 */
657 struct ovs_mutex flow_mutex;
d9d73f84
IM
658 struct cmap flow_table OVS_GUARDED; /* Flow table. */
659
660 /* One classifier per in_port polled by the pmd */
661 struct cmap classifiers;
662 /* Periodically sort subtable vectors according to hit frequencies */
663 long long int next_optimization;
664 /* End of the next time interval for which processing cycles
665 are stored for each polled rxq. */
666 long long int rxq_next_cycle_store;
667
2a2c67b4
KT
668 /* Last interval timestamp. */
669 uint64_t intrvl_tsc_prev;
670 /* Last interval cycles. */
671 atomic_ullong intrvl_cycles;
672
b010be17
IM
673 /* Current context of the PMD thread. */
674 struct dp_netdev_pmd_thread_ctx ctx;
d9d73f84 675
d9d73f84
IM
676 struct seq *reload_seq;
677 uint64_t last_reload_seq;
ec61d470
IM
678
679 /* These are atomic variables used as a synchronization and configuration
680 * points for thread reload/exit.
681 *
682 * 'reload' atomic is the main one and it's used as a memory
683 * synchronization point for all other knobs and data.
684 *
685 * For a thread that requests PMD reload:
686 *
687 * * All changes that should be visible to the PMD thread must be made
688 * before setting the 'reload'. These changes could use any memory
689 * ordering model including 'relaxed'.
690 * * Setting the 'reload' atomic should occur in the same thread where
691 * all other PMD configuration options updated.
692 * * Setting the 'reload' atomic should be done with 'release' memory
693 * ordering model or stricter. This will guarantee that all previous
694 * changes (including non-atomic and 'relaxed') will be visible to
695 * the PMD thread.
696 * * To check that reload is done, thread should poll the 'reload' atomic
697 * to become 'false'. Polling should be done with 'acquire' memory
698 * ordering model or stricter. This ensures that PMD thread completed
699 * the reload process.
700 *
701 * For the PMD thread:
702 *
703 * * PMD thread should read 'reload' atomic with 'acquire' memory
704 * ordering model or stricter. This will guarantee that all changes
705 * made before setting the 'reload' in the requesting thread will be
706 * visible to the PMD thread.
707 * * All other configuration data could be read with any memory
708 * ordering model (including non-atomic and 'relaxed') but *only after*
709 * reading the 'reload' atomic set to 'true'.
710 * * When the PMD reload done, PMD should (optionally) set all the below
711 * knobs except the 'reload' to their default ('false') values and
712 * (mandatory), as the last step, set the 'reload' to 'false' using
713 * 'release' memory ordering model or stricter. This will inform the
714 * requesting thread that PMD has completed a reload cycle.
715 */
d9d73f84 716 atomic_bool reload; /* Do we need to reload ports? */
6d9fead1 717 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
e2cafa86 718 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
299c8d61 719 atomic_bool exit; /* For terminating the pmd thread. */
ec61d470 720
d9d73f84
IM
721 pthread_t thread;
722 unsigned core_id; /* CPU core id of this pmd thread. */
723 int numa_id; /* numa node id of this pmd thread. */
724 bool isolated;
725
726 /* Queue id used by this pmd thread to send packets on all netdevs if
727 * XPS disabled for this netdev. All static_tx_qid's are unique and less
728 * than 'cmap_count(dp->poll_threads)'. */
729 uint32_t static_tx_qid;
730
c71ea3c4
IM
731 /* Number of filled output batches. */
732 int n_output_batches;
733
d9d73f84
IM
734 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
735 /* List of rx queues to poll. */
736 struct hmap poll_list OVS_GUARDED;
737 /* Map of 'tx_port's used for transmission. Written by the main thread,
738 * read by the pmd thread. */
739 struct hmap tx_ports OVS_GUARDED;
740
741 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
742 * ports (that support push_tunnel/pop_tunnel), the other contains ports
743 * with at least one txq (that support send). A port can be in both.
744 *
745 * There are two separate maps to make sure that we don't try to execute
746 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
747 *
748 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
749 * threads, and thusly need to be protected by 'non_pmd_mutex'. Every
750 * other instance will only be accessed by its own pmd thread. */
751 struct hmap tnl_port_cache;
752 struct hmap send_port_cache;
753
82a48ead
JS
754 /* Keep track of detailed PMD performance statistics. */
755 struct pmd_perf_stats perf_stats;
d9d73f84 756
5bf84282
NK
757 /* Stats from previous iteration used by automatic pmd
758 * load balance logic. */
759 uint64_t prev_stats[PMD_N_STATS];
760 atomic_count pmd_overloaded;
761
d9d73f84
IM
762 /* Set to true if the pmd thread needs to be reloaded. */
763 bool need_reload;
6c3eee82
BP
764};
765
72865317
BP
766/* Interface to netdev-based datapath. */
767struct dpif_netdev {
768 struct dpif dpif;
769 struct dp_netdev *dp;
d33ed218 770 uint64_t last_port_seq;
72865317
BP
771};
772
8a4e3a85 773static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
e9985d6a
DDP
774 struct dp_netdev_port **portp)
775 OVS_REQUIRES(dp->port_mutex);
8a4e3a85 776static int get_port_by_name(struct dp_netdev *dp, const char *devname,
e9985d6a
DDP
777 struct dp_netdev_port **portp)
778 OVS_REQUIRES(dp->port_mutex);
8a4e3a85
BP
779static void dp_netdev_free(struct dp_netdev *)
780 OVS_REQUIRES(dp_netdev_mutex);
8a4e3a85
BP
781static int do_add_port(struct dp_netdev *dp, const char *devname,
782 const char *type, odp_port_t port_no)
59e6d833 783 OVS_REQUIRES(dp->port_mutex);
c40b890f 784static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
59e6d833 785 OVS_REQUIRES(dp->port_mutex);
614c4892
BP
786static int dpif_netdev_open(const struct dpif_class *, const char *name,
787 bool create, struct dpif **);
65f13b50 788static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 789 struct dp_packet_batch *,
7d7ded7a
DB
790 bool should_steal,
791 const struct flow *flow,
4edb9ae9 792 const struct nlattr *actions,
b010be17 793 size_t actions_len);
65f13b50 794static void dp_netdev_input(struct dp_netdev_pmd_thread *,
1895cc8d 795 struct dp_packet_batch *, odp_port_t port_no);
a90ed026 796static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
1895cc8d 797 struct dp_packet_batch *);
41ccaa24 798
6b31e073 799static void dp_netdev_disable_upcall(struct dp_netdev *);
ae7ad0a1 800static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
65f13b50 801static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
00873463
DDP
802 struct dp_netdev *dp, unsigned core_id,
803 int numa_id);
1c1e46ed 804static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
e9985d6a
DDP
805static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
806 OVS_REQUIRES(dp->port_mutex);
807
e32971b8 808static void *pmd_thread_main(void *);
b19befae 809static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
bd5131ba 810 unsigned core_id);
1c1e46ed
AW
811static struct dp_netdev_pmd_thread *
812dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
140dd699
IM
813static void dp_netdev_del_pmd(struct dp_netdev *dp,
814 struct dp_netdev_pmd_thread *pmd);
e32971b8 815static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
d0cca6c3 816static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
d0cca6c3 817static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
e32971b8
DDP
818 struct dp_netdev_port *port)
819 OVS_REQUIRES(pmd->port_mutex);
820static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
821 struct tx_port *tx)
822 OVS_REQUIRES(pmd->port_mutex);
d0cca6c3 823static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
947dc567
DDP
824 struct dp_netdev_rxq *rxq)
825 OVS_REQUIRES(pmd->port_mutex);
e32971b8
DDP
826static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
827 struct rxq_poll *poll)
828 OVS_REQUIRES(pmd->port_mutex);
c71ea3c4
IM
829static int
830dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
831 bool force);
009e0033 832
e32971b8 833static void reconfigure_datapath(struct dp_netdev *dp)
3eb67853 834 OVS_REQUIRES(dp->port_mutex);
1c1e46ed
AW
835static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
836static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
837static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
d0cca6c3
DDP
838static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
839 OVS_REQUIRES(pmd->port_mutex);
3453b4d6 840static inline void
4809891b
KT
841dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
842 struct polled_queue *poll_list, int poll_cnt);
843static void
844dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
845 enum rxq_cycles_counter_type type,
846 unsigned long long cycles);
847static uint64_t
848dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
849 enum rxq_cycles_counter_type type);
850static void
851dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
852 unsigned long long cycles);
655856ef
KT
853static uint64_t
854dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
324c8374
IM
855static void
856dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
b010be17 857 bool purge);
324c8374 858static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
b010be17 859 struct tx_port *tx);
324c8374 860
67ad54cb 861static inline bool emc_entry_alive(struct emc_entry *ce);
9bbf1c3d 862static void emc_clear_entry(struct emc_entry *ce);
60d8ccae 863static void smc_clear_entry(struct smc_bucket *b, int idx);
9bbf1c3d 864
cd995c73 865static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
79f36875
JS
866static inline bool
867pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
02bb2824
YL
868static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
869 struct dp_netdev_flow *flow);
cd995c73 870
9bbf1c3d
DDP
871static void
872emc_cache_init(struct emc_cache *flow_cache)
873{
874 int i;
875
67ad54cb 876 flow_cache->sweep_idx = 0;
9bbf1c3d
DDP
877 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
878 flow_cache->entries[i].flow = NULL;
0de8783a 879 flow_cache->entries[i].key.hash = 0;
09b0fa9c 880 flow_cache->entries[i].key.len = sizeof(struct miniflow);
5fcff47b 881 flowmap_init(&flow_cache->entries[i].key.mf.map);
9bbf1c3d
DDP
882 }
883}
884
60d8ccae
YW
885static void
886smc_cache_init(struct smc_cache *smc_cache)
887{
888 int i, j;
889 for (i = 0; i < SMC_BUCKET_CNT; i++) {
890 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
891 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
892 }
893 }
894}
895
896static void
897dfc_cache_init(struct dfc_cache *flow_cache)
898{
899 emc_cache_init(&flow_cache->emc_cache);
900 smc_cache_init(&flow_cache->smc_cache);
901}
902
9bbf1c3d
DDP
903static void
904emc_cache_uninit(struct emc_cache *flow_cache)
905{
906 int i;
907
908 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
909 emc_clear_entry(&flow_cache->entries[i]);
910 }
911}
912
60d8ccae
YW
913static void
914smc_cache_uninit(struct smc_cache *smc)
915{
916 int i, j;
917
918 for (i = 0; i < SMC_BUCKET_CNT; i++) {
919 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
920 smc_clear_entry(&(smc->buckets[i]), j);
921 }
922 }
923}
924
925static void
926dfc_cache_uninit(struct dfc_cache *flow_cache)
927{
928 smc_cache_uninit(&flow_cache->smc_cache);
929 emc_cache_uninit(&flow_cache->emc_cache);
930}
931
67ad54cb
AW
932/* Check and clear dead flow references slowly (one entry at each
933 * invocation). */
934static void
935emc_cache_slow_sweep(struct emc_cache *flow_cache)
936{
937 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
938
939 if (!emc_entry_alive(entry)) {
940 emc_clear_entry(entry);
941 }
942 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
943}
944
b010be17
IM
945/* Updates the time in PMD threads context and should be called in three cases:
946 *
947 * 1. PMD structure initialization:
948 * - dp_netdev_configure_pmd()
949 *
950 * 2. Before processing of the new packet batch:
951 * - dpif_netdev_execute()
009e0033 952 * - dp_netdev_process_rxq_port()
b010be17
IM
953 *
954 * 3. At least once per polling iteration in main polling threads if no
955 * packets received on current iteration:
956 * - dpif_netdev_run()
957 * - pmd_thread_main()
958 *
959 * 'pmd->ctx.now' should be used without update in all other cases if possible.
960 */
961static inline void
962pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
963{
05f9e707 964 pmd->ctx.now = time_usec();
b010be17
IM
965}
966
c4ea7529
BP
967/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
968bool
969dpif_is_netdev(const struct dpif *dpif)
970{
971 return dpif->dpif_class->open == dpif_netdev_open;
972}
973
72865317
BP
974static struct dpif_netdev *
975dpif_netdev_cast(const struct dpif *dpif)
976{
c4ea7529 977 ovs_assert(dpif_is_netdev(dpif));
72865317
BP
978 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
979}
980
981static struct dp_netdev *
982get_dp_netdev(const struct dpif *dpif)
983{
984 return dpif_netdev_cast(dpif)->dp;
985}
6553d06b
DDP
986\f
987enum pmd_info_type {
ce179f11
IM
988 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
989 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
79f36875
JS
990 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
991 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
6553d06b
DDP
992};
993
994static void
82a48ead 995format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
6553d06b 996{
6553d06b
DDP
997 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
998 ? "main thread" : "pmd thread");
6553d06b
DDP
999 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1000 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1001 }
d5c199ea 1002 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
bd5131ba 1003 ds_put_format(reply, " core_id %u", pmd->core_id);
6553d06b
DDP
1004 }
1005 ds_put_cstr(reply, ":\n");
82a48ead
JS
1006}
1007
1008static void
1009pmd_info_show_stats(struct ds *reply,
1010 struct dp_netdev_pmd_thread *pmd)
1011{
1012 uint64_t stats[PMD_N_STATS];
1013 uint64_t total_cycles, total_packets;
1014 double passes_per_pkt = 0;
1015 double lookups_per_hit = 0;
1016 double packets_per_batch = 0;
1017
1018 pmd_perf_read_counters(&pmd->perf_stats, stats);
1019 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1020 + stats[PMD_CYCLES_ITER_BUSY];
1021 total_packets = stats[PMD_STAT_RECV];
1022
1023 format_pmd_thread(reply, pmd);
6553d06b 1024
82a48ead
JS
1025 if (total_packets > 0) {
1026 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1027 / (double) total_packets;
cc4891f3 1028 }
82a48ead
JS
1029 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1030 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1031 / (double) stats[PMD_STAT_MASKED_HIT];
1032 }
1033 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1034 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1035 / (double) stats[PMD_STAT_SENT_BATCHES];
cc4891f3
IM
1036 }
1037
6553d06b 1038 ds_put_format(reply,
5a0e4aec
BP
1039 " packets received: %"PRIu64"\n"
1040 " packet recirculations: %"PRIu64"\n"
1041 " avg. datapath passes per packet: %.02f\n"
1042 " emc hits: %"PRIu64"\n"
60d8ccae 1043 " smc hits: %"PRIu64"\n"
5a0e4aec
BP
1044 " megaflow hits: %"PRIu64"\n"
1045 " avg. subtable lookups per megaflow hit: %.02f\n"
1046 " miss with success upcall: %"PRIu64"\n"
1047 " miss with failed upcall: %"PRIu64"\n"
1048 " avg. packets per output batch: %.02f\n",
82a48ead
JS
1049 total_packets, stats[PMD_STAT_RECIRC],
1050 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
60d8ccae 1051 stats[PMD_STAT_SMC_HIT],
82a48ead
JS
1052 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1053 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
cc4891f3 1054 packets_per_batch);
6553d06b
DDP
1055
1056 if (total_cycles == 0) {
1057 return;
1058 }
1059
1060 ds_put_format(reply,
5a0e4aec
BP
1061 " idle cycles: %"PRIu64" (%.02f%%)\n"
1062 " processing cycles: %"PRIu64" (%.02f%%)\n",
82a48ead
JS
1063 stats[PMD_CYCLES_ITER_IDLE],
1064 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1065 stats[PMD_CYCLES_ITER_BUSY],
1066 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
6553d06b
DDP
1067
1068 if (total_packets == 0) {
1069 return;
1070 }
1071
1072 ds_put_format(reply,
5a0e4aec 1073 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
82a48ead 1074 total_cycles / (double) total_packets,
6553d06b
DDP
1075 total_cycles, total_packets);
1076
1077 ds_put_format(reply,
5a0e4aec 1078 " avg processing cycles per packet: "
82a48ead
JS
1079 "%.02f (%"PRIu64"/%"PRIu64")\n",
1080 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1081 stats[PMD_CYCLES_ITER_BUSY], total_packets);
6553d06b
DDP
1082}
1083
79f36875
JS
1084static void
1085pmd_info_show_perf(struct ds *reply,
1086 struct dp_netdev_pmd_thread *pmd,
1087 struct pmd_perf_params *par)
1088{
1089 if (pmd->core_id != NON_PMD_CORE_ID) {
1090 char *time_str =
1091 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1092 long long now = time_msec();
1093 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1094
1095 ds_put_cstr(reply, "\n");
1096 ds_put_format(reply, "Time: %s\n", time_str);
1097 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1098 ds_put_cstr(reply, "\n");
1099 format_pmd_thread(reply, pmd);
1100 ds_put_cstr(reply, "\n");
1101 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1102 if (pmd_perf_metrics_enabled(pmd)) {
1103 /* Prevent parallel clearing of perf metrics. */
1104 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1105 if (par->histograms) {
1106 ds_put_cstr(reply, "\n");
1107 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1108 }
1109 if (par->iter_hist_len > 0) {
1110 ds_put_cstr(reply, "\n");
1111 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1112 par->iter_hist_len);
1113 }
1114 if (par->ms_hist_len > 0) {
1115 ds_put_cstr(reply, "\n");
1116 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1117 par->ms_hist_len);
1118 }
1119 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1120 }
1121 free(time_str);
1122 }
1123}
1124
947dc567
DDP
1125static int
1126compare_poll_list(const void *a_, const void *b_)
1127{
1128 const struct rxq_poll *a = a_;
1129 const struct rxq_poll *b = b_;
1130
1131 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1132 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1133
1134 int cmp = strcmp(namea, nameb);
1135 if (!cmp) {
1136 return netdev_rxq_get_queue_id(a->rxq->rx)
1137 - netdev_rxq_get_queue_id(b->rxq->rx);
1138 } else {
1139 return cmp;
1140 }
1141}
1142
1143static void
1144sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1145 size_t *n)
216abd28 1146 OVS_REQUIRES(pmd->port_mutex)
947dc567
DDP
1147{
1148 struct rxq_poll *ret, *poll;
1149 size_t i;
1150
1151 *n = hmap_count(&pmd->poll_list);
1152 if (!*n) {
1153 ret = NULL;
1154 } else {
1155 ret = xcalloc(*n, sizeof *ret);
1156 i = 0;
1157 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1158 ret[i] = *poll;
1159 i++;
1160 }
1161 ovs_assert(i == *n);
1cc1b5f6 1162 qsort(ret, *n, sizeof *ret, compare_poll_list);
947dc567
DDP
1163 }
1164
947dc567
DDP
1165 *list = ret;
1166}
1167
ce179f11
IM
1168static void
1169pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1170{
1171 if (pmd->core_id != NON_PMD_CORE_ID) {
947dc567 1172 struct rxq_poll *list;
2a2c67b4
KT
1173 size_t n_rxq;
1174 uint64_t total_cycles = 0;
ce179f11 1175
3eb67853 1176 ds_put_format(reply,
5a0e4aec 1177 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
3eb67853
IM
1178 pmd->numa_id, pmd->core_id, (pmd->isolated)
1179 ? "true" : "false");
ce179f11 1180
d0cca6c3 1181 ovs_mutex_lock(&pmd->port_mutex);
2a2c67b4 1182 sorted_poll_list(pmd, &list, &n_rxq);
ce179f11 1183
2a2c67b4
KT
1184 /* Get the total pmd cycles for an interval. */
1185 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1186 /* Estimate the cycles to cover all intervals. */
1187 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1188
1189 for (int i = 0; i < n_rxq; i++) {
1190 struct dp_netdev_rxq *rxq = list[i].rxq;
1191 const char *name = netdev_rxq_get_name(rxq->rx);
1192 uint64_t proc_cycles = 0;
1193
1194 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1195 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
ce179f11 1196 }
5a0e4aec 1197 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
947dc567 1198 netdev_rxq_get_queue_id(list[i].rxq->rx));
35c91567
DM
1199 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1200 ? "(enabled) " : "(disabled)");
5a0e4aec 1201 ds_put_format(reply, " pmd usage: ");
2a2c67b4
KT
1202 if (total_cycles) {
1203 ds_put_format(reply, "%2"PRIu64"",
1204 proc_cycles * 100 / total_cycles);
1205 ds_put_cstr(reply, " %");
1206 } else {
1207 ds_put_format(reply, "%s", "NOT AVAIL");
1208 }
1209 ds_put_cstr(reply, "\n");
ce179f11 1210 }
d0cca6c3 1211 ovs_mutex_unlock(&pmd->port_mutex);
947dc567 1212 free(list);
ce179f11
IM
1213 }
1214}
1215
34d8e04b
EC
1216static int
1217compare_poll_thread_list(const void *a_, const void *b_)
1218{
1219 const struct dp_netdev_pmd_thread *a, *b;
1220
1221 a = *(struct dp_netdev_pmd_thread **)a_;
1222 b = *(struct dp_netdev_pmd_thread **)b_;
1223
1224 if (a->core_id < b->core_id) {
1225 return -1;
1226 }
1227 if (a->core_id > b->core_id) {
1228 return 1;
1229 }
1230 return 0;
1231}
1232
1233/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1234 * this list, as long as we do not go to quiescent state. */
1235static void
1236sorted_poll_thread_list(struct dp_netdev *dp,
1237 struct dp_netdev_pmd_thread ***list,
1238 size_t *n)
1239{
1240 struct dp_netdev_pmd_thread *pmd;
1241 struct dp_netdev_pmd_thread **pmd_list;
1242 size_t k = 0, n_pmds;
1243
1244 n_pmds = cmap_count(&dp->poll_threads);
1245 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1246
1247 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1248 if (k >= n_pmds) {
1249 break;
1250 }
1251 pmd_list[k++] = pmd;
1252 }
1253
1254 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1255
1256 *list = pmd_list;
1257 *n = k;
1258}
1259
cd995c73
KT
1260static void
1261dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1262 const char *argv[], void *aux OVS_UNUSED)
1263{
1264 struct ds reply = DS_EMPTY_INITIALIZER;
1265 struct dp_netdev *dp = NULL;
1266
1267 ovs_mutex_lock(&dp_netdev_mutex);
1268
1269 if (argc == 2) {
1270 dp = shash_find_data(&dp_netdevs, argv[1]);
1271 } else if (shash_count(&dp_netdevs) == 1) {
1272 /* There's only one datapath */
1273 dp = shash_first(&dp_netdevs)->data;
1274 }
1275
1276 if (!dp) {
1277 ovs_mutex_unlock(&dp_netdev_mutex);
1278 unixctl_command_reply_error(conn,
1279 "please specify an existing datapath");
1280 return;
1281 }
1282
1283 dp_netdev_request_reconfigure(dp);
1284 ovs_mutex_unlock(&dp_netdev_mutex);
1285 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1286 unixctl_command_reply(conn, ds_cstr(&reply));
1287 ds_destroy(&reply);
1288}
1289
6553d06b
DDP
1290static void
1291dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1292 void *aux)
1293{
1294 struct ds reply = DS_EMPTY_INITIALIZER;
34d8e04b 1295 struct dp_netdev_pmd_thread **pmd_list;
6553d06b
DDP
1296 struct dp_netdev *dp = NULL;
1297 enum pmd_info_type type = *(enum pmd_info_type *) aux;
82a48ead
JS
1298 unsigned int core_id;
1299 bool filter_on_pmd = false;
1300 size_t n;
6553d06b
DDP
1301
1302 ovs_mutex_lock(&dp_netdev_mutex);
1303
82a48ead 1304 while (argc > 1) {
79f36875 1305 if (!strcmp(argv[1], "-pmd") && argc > 2) {
82a48ead
JS
1306 if (str_to_uint(argv[2], 10, &core_id)) {
1307 filter_on_pmd = true;
1308 }
1309 argc -= 2;
1310 argv += 2;
1311 } else {
1312 dp = shash_find_data(&dp_netdevs, argv[1]);
1313 argc -= 1;
1314 argv += 1;
1315 }
6553d06b
DDP
1316 }
1317
1318 if (!dp) {
82a48ead
JS
1319 if (shash_count(&dp_netdevs) == 1) {
1320 /* There's only one datapath */
1321 dp = shash_first(&dp_netdevs)->data;
1322 } else {
1323 ovs_mutex_unlock(&dp_netdev_mutex);
1324 unixctl_command_reply_error(conn,
1325 "please specify an existing datapath");
1326 return;
1327 }
6553d06b
DDP
1328 }
1329
34d8e04b
EC
1330 sorted_poll_thread_list(dp, &pmd_list, &n);
1331 for (size_t i = 0; i < n; i++) {
1332 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1333 if (!pmd) {
1334 break;
1335 }
82a48ead
JS
1336 if (filter_on_pmd && pmd->core_id != core_id) {
1337 continue;
1338 }
ce179f11
IM
1339 if (type == PMD_INFO_SHOW_RXQ) {
1340 pmd_info_show_rxq(&reply, pmd);
82a48ead
JS
1341 } else if (type == PMD_INFO_CLEAR_STATS) {
1342 pmd_perf_stats_clear(&pmd->perf_stats);
1343 } else if (type == PMD_INFO_SHOW_STATS) {
1344 pmd_info_show_stats(&reply, pmd);
79f36875
JS
1345 } else if (type == PMD_INFO_PERF_SHOW) {
1346 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
6553d06b
DDP
1347 }
1348 }
34d8e04b 1349 free(pmd_list);
6553d06b
DDP
1350
1351 ovs_mutex_unlock(&dp_netdev_mutex);
1352
1353 unixctl_command_reply(conn, ds_cstr(&reply));
1354 ds_destroy(&reply);
1355}
79f36875
JS
1356
1357static void
1358pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1359 const char *argv[],
1360 void *aux OVS_UNUSED)
1361{
1362 struct pmd_perf_params par;
1363 long int it_hist = 0, ms_hist = 0;
1364 par.histograms = true;
1365
1366 while (argc > 1) {
1367 if (!strcmp(argv[1], "-nh")) {
1368 par.histograms = false;
1369 argc -= 1;
1370 argv += 1;
1371 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1372 it_hist = strtol(argv[2], NULL, 10);
1373 if (it_hist < 0) {
1374 it_hist = 0;
1375 } else if (it_hist > HISTORY_LEN) {
1376 it_hist = HISTORY_LEN;
1377 }
1378 argc -= 2;
1379 argv += 2;
1380 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1381 ms_hist = strtol(argv[2], NULL, 10);
1382 if (ms_hist < 0) {
1383 ms_hist = 0;
1384 } else if (ms_hist > HISTORY_LEN) {
1385 ms_hist = HISTORY_LEN;
1386 }
1387 argc -= 2;
1388 argv += 2;
1389 } else {
1390 break;
1391 }
1392 }
1393 par.iter_hist_len = it_hist;
1394 par.ms_hist_len = ms_hist;
1395 par.command_type = PMD_INFO_PERF_SHOW;
1396 dpif_netdev_pmd_info(conn, argc, argv, &par);
1397}
6553d06b
DDP
1398\f
1399static int
1400dpif_netdev_init(void)
1401{
1402 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
ce179f11
IM
1403 clear_aux = PMD_INFO_CLEAR_STATS,
1404 poll_aux = PMD_INFO_SHOW_RXQ;
6553d06b 1405
82a48ead
JS
1406 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1407 0, 3, dpif_netdev_pmd_info,
6553d06b 1408 (void *)&show_aux);
82a48ead
JS
1409 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1410 0, 3, dpif_netdev_pmd_info,
6553d06b 1411 (void *)&clear_aux);
82a48ead
JS
1412 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1413 0, 3, dpif_netdev_pmd_info,
ce179f11 1414 (void *)&poll_aux);
79f36875
JS
1415 unixctl_command_register("dpif-netdev/pmd-perf-show",
1416 "[-nh] [-it iter-history-len]"
1417 " [-ms ms-history-len]"
1418 " [-pmd core] [dp]",
1419 0, 8, pmd_perf_show_cmd,
1420 NULL);
cd995c73
KT
1421 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1422 0, 1, dpif_netdev_pmd_rebalance,
1423 NULL);
7178fefb
JS
1424 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1425 "on|off [-b before] [-a after] [-e|-ne] "
1426 "[-us usec] [-q qlen]",
1427 0, 10, pmd_perf_log_set_cmd,
1428 NULL);
6553d06b
DDP
1429 return 0;
1430}
72865317 1431
2197d7ab 1432static int
2240af25
DDP
1433dpif_netdev_enumerate(struct sset *all_dps,
1434 const struct dpif_class *dpif_class)
2197d7ab
GL
1435{
1436 struct shash_node *node;
1437
97be1538 1438 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 1439 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
1440 struct dp_netdev *dp = node->data;
1441 if (dpif_class != dp->class) {
1442 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1443 * If the class doesn't match, skip this dpif. */
1444 continue;
1445 }
2197d7ab
GL
1446 sset_add(all_dps, node->name);
1447 }
97be1538 1448 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 1449
2197d7ab
GL
1450 return 0;
1451}
1452
add90f6f
EJ
1453static bool
1454dpif_netdev_class_is_dummy(const struct dpif_class *class)
1455{
1456 return class != &dpif_netdev_class;
1457}
1458
0aeaabc8
JP
1459static const char *
1460dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1461{
1462 return strcmp(type, "internal") ? type
e98d0cb3 1463 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
0aeaabc8
JP
1464 : "tap";
1465}
1466
72865317
BP
1467static struct dpif *
1468create_dpif_netdev(struct dp_netdev *dp)
1469{
462278db 1470 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 1471 struct dpif_netdev *dpif;
72865317 1472
6a8267c5 1473 ovs_refcount_ref(&dp->ref_cnt);
72865317 1474
72865317 1475 dpif = xmalloc(sizeof *dpif);
614c4892 1476 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 1477 dpif->dp = dp;
d33ed218 1478 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
1479
1480 return &dpif->dpif;
1481}
1482
4e022ec0
AW
1483/* Choose an unused, non-zero port number and return it on success.
1484 * Return ODPP_NONE on failure. */
1485static odp_port_t
e44768b7 1486choose_port(struct dp_netdev *dp, const char *name)
59e6d833 1487 OVS_REQUIRES(dp->port_mutex)
e44768b7 1488{
4e022ec0 1489 uint32_t port_no;
e44768b7
JP
1490
1491 if (dp->class != &dpif_netdev_class) {
1492 const char *p;
1493 int start_no = 0;
1494
1495 /* If the port name begins with "br", start the number search at
1496 * 100 to make writing tests easier. */
1497 if (!strncmp(name, "br", 2)) {
1498 start_no = 100;
1499 }
1500
1501 /* If the port name contains a number, try to assign that port number.
1502 * This can make writing unit tests easier because port numbers are
1503 * predictable. */
1504 for (p = name; *p != '\0'; p++) {
1505 if (isdigit((unsigned char) *p)) {
1506 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
1507 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1508 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 1509 return u32_to_odp(port_no);
e44768b7
JP
1510 }
1511 break;
1512 }
1513 }
1514 }
1515
ff073a71
BP
1516 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1517 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 1518 return u32_to_odp(port_no);
e44768b7
JP
1519 }
1520 }
1521
4e022ec0 1522 return ODPP_NONE;
e44768b7
JP
1523}
1524
72865317 1525static int
614c4892
BP
1526create_dp_netdev(const char *name, const struct dpif_class *class,
1527 struct dp_netdev **dpp)
8a4e3a85 1528 OVS_REQUIRES(dp_netdev_mutex)
72865317 1529{
1276e3db 1530 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
72865317
BP
1531 struct dp_netdev *dp;
1532 int error;
72865317 1533
1276e3db
IM
1534 /* Avoid estimating TSC frequency for dummy datapath to not slow down
1535 * unit tests. */
1536 if (!dpif_netdev_class_is_dummy(class)
1537 && ovsthread_once_start(&tsc_freq_check)) {
1538 pmd_perf_estimate_tsc_frequency();
1539 ovsthread_once_done(&tsc_freq_check);
1540 }
1541
462278db 1542 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
1543 shash_add(&dp_netdevs, name, dp);
1544
1545 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1546 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 1547 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 1548 atomic_flag_clear(&dp->destroyed);
8a4e3a85 1549
81e89d5c 1550 ovs_mutex_init_recursive(&dp->port_mutex);
e9985d6a 1551 hmap_init(&dp->ports);
d33ed218 1552 dp->port_seq = seq_create();
6b31e073
RW
1553 fat_rwlock_init(&dp->upcall_rwlock);
1554
a6a426d6
IM
1555 dp->reconfigure_seq = seq_create();
1556 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1557
4b27db64
JR
1558 for (int i = 0; i < N_METER_LOCKS; ++i) {
1559 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1560 }
1561
6b31e073
RW
1562 /* Disable upcalls by default. */
1563 dp_netdev_disable_upcall(dp);
623540e4 1564 dp->upcall_aux = NULL;
6b31e073 1565 dp->upcall_cb = NULL;
e44768b7 1566
57593fd2 1567 dp->conntrack = conntrack_init();
5cf3edb3 1568
4c30b246 1569 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
c71ea3c4 1570 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
4c30b246 1571
65f13b50 1572 cmap_init(&dp->poll_threads);
e77c97b9 1573 dp->pmd_rxq_assign_cyc = true;
140dd699
IM
1574
1575 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1576 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1577 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1578
65f13b50
AW
1579 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1580 ovsthread_key_create(&dp->per_pmd_key, NULL);
1581
e9985d6a 1582 ovs_mutex_lock(&dp->port_mutex);
140dd699
IM
 1583 /* The non-PMD thread will be created before all other threads and will
 1584 * allocate static_tx_qid = 0. */
f2eee189 1585 dp_netdev_set_nonpmd(dp);
65f13b50 1586
a3e8437a
TLSC
1587 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1588 "internal"),
1589 ODPP_LOCAL);
59e6d833 1590 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
1591 if (error) {
1592 dp_netdev_free(dp);
462278db 1593 return error;
72865317
BP
1594 }
1595
a36de779 1596 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 1597 *dpp = dp;
72865317
BP
1598 return 0;
1599}
1600
a6a426d6
IM
1601static void
1602dp_netdev_request_reconfigure(struct dp_netdev *dp)
1603{
1604 seq_change(dp->reconfigure_seq);
1605}
1606
1607static bool
1608dp_netdev_is_reconf_required(struct dp_netdev *dp)
1609{
1610 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1611}
1612
72865317 1613static int
614c4892 1614dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 1615 bool create, struct dpif **dpifp)
72865317 1616{
462278db 1617 struct dp_netdev *dp;
5279f8fd 1618 int error;
462278db 1619
97be1538 1620 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
1621 dp = shash_find_data(&dp_netdevs, name);
1622 if (!dp) {
5279f8fd 1623 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 1624 } else {
5279f8fd
BP
1625 error = (dp->class != class ? EINVAL
1626 : create ? EEXIST
1627 : 0);
1628 }
1629 if (!error) {
1630 *dpifp = create_dpif_netdev(dp);
72865317 1631 }
97be1538 1632 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 1633
5279f8fd 1634 return error;
72865317
BP
1635}
1636
88ace79b
DDP
1637static void
1638dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1639 OVS_NO_THREAD_SAFETY_ANALYSIS
1640{
1641 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1642 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1643
1644 /* Before freeing a lock we should release it */
1645 fat_rwlock_unlock(&dp->upcall_rwlock);
1646 fat_rwlock_destroy(&dp->upcall_rwlock);
1647}
1648
4b27db64
JR
1649static void
1650dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1651 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1652{
1653 if (dp->meters[meter_id]) {
1654 free(dp->meters[meter_id]);
1655 dp->meters[meter_id] = NULL;
1656 }
1657}
1658
8a4e3a85
BP
1659/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1660 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
1661static void
1662dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 1663 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 1664{
e9985d6a 1665 struct dp_netdev_port *port, *next;
4ad28026 1666
8a4e3a85
BP
1667 shash_find_and_delete(&dp_netdevs, dp->name);
1668
59e6d833 1669 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 1670 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
c40b890f 1671 do_del_port(dp, port);
1ba530f4 1672 }
59e6d833 1673 ovs_mutex_unlock(&dp->port_mutex);
4b27db64 1674
e32971b8 1675 dp_netdev_destroy_all_pmds(dp, true);
d916785c 1676 cmap_destroy(&dp->poll_threads);
51852a57 1677
140dd699
IM
1678 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1679 id_pool_destroy(dp->tx_qid_pool);
1680
b9584f21
DDP
1681 ovs_mutex_destroy(&dp->non_pmd_mutex);
1682 ovsthread_key_delete(dp->per_pmd_key);
1683
57593fd2 1684 conntrack_destroy(dp->conntrack);
b9584f21
DDP
1685
1686
a6a426d6
IM
1687 seq_destroy(dp->reconfigure_seq);
1688
d33ed218 1689 seq_destroy(dp->port_seq);
e9985d6a 1690 hmap_destroy(&dp->ports);
3186ea46 1691 ovs_mutex_destroy(&dp->port_mutex);
88ace79b
DDP
1692
1693 /* Upcalls must be disabled at this point */
1694 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 1695
4b27db64
JR
1696 int i;
1697
1698 for (i = 0; i < MAX_METERS; ++i) {
1699 meter_lock(dp, i);
1700 dp_delete_meter(dp, i);
1701 meter_unlock(dp, i);
1702 }
1703 for (i = 0; i < N_METER_LOCKS; ++i) {
1704 ovs_mutex_destroy(&dp->meter_locks[i]);
1705 }
1706
f2eee189 1707 free(dp->pmd_cmask);
8a4e3a85 1708 free(CONST_CAST(char *, dp->name));
72865317
BP
1709 free(dp);
1710}
1711
8a4e3a85
BP
1712static void
1713dp_netdev_unref(struct dp_netdev *dp)
1714{
1715 if (dp) {
1716 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1717 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1718 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 1719 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
1720 dp_netdev_free(dp);
1721 }
1722 ovs_mutex_unlock(&dp_netdev_mutex);
1723 }
1724}
1725
72865317
BP
1726static void
1727dpif_netdev_close(struct dpif *dpif)
1728{
1729 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1730
8a4e3a85 1731 dp_netdev_unref(dp);
72865317
BP
1732 free(dpif);
1733}
1734
1735static int
7dab847a 1736dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
1737{
1738 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1739
6a8267c5 1740 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 1741 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
1742 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1743 OVS_NOT_REACHED();
1744 }
1745 }
5279f8fd 1746
72865317
BP
1747 return 0;
1748}
1749
eb94da30
DDP
1750/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1751 * load/store semantics. While the increment is not atomic, the load and
1752 * store operations are, making it impossible to read inconsistent values.
1753 *
1754 * This is used to update thread local stats counters. */
1755static void
1756non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1757{
1758 unsigned long long tmp;
1759
1760 atomic_read_relaxed(var, &tmp);
1761 tmp += n;
1762 atomic_store_relaxed(var, tmp);
1763}
1764
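/*
 * Illustrative sketch, not part of the original dpif-netdev.c: the
 * single-writer counter pattern that non_atomic_ullong_add() above relies
 * on, expressed with plain C11 atomics instead of the OVS wrappers.  Only
 * the owning PMD thread calls counter_bump(); any thread may call
 * counter_read().  The read-modify-write is not atomic, but each load and
 * store is, so a concurrent reader sees either the old or the new value,
 * never a torn one.
 */
#include <stdatomic.h>

typedef _Atomic unsigned long long toy_counter;

static inline void
counter_bump(toy_counter *c, unsigned long long n)    /* writer thread only */
{
    unsigned long long tmp = atomic_load_explicit(c, memory_order_relaxed);
    atomic_store_explicit(c, tmp + n, memory_order_relaxed);
}

static inline unsigned long long
counter_read(toy_counter *c)                          /* any thread */
{
    return atomic_load_explicit(c, memory_order_relaxed);
}
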
72865317 1765static int
a8d9304d 1766dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
1767{
1768 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 1769 struct dp_netdev_pmd_thread *pmd;
82a48ead 1770 uint64_t pmd_stats[PMD_N_STATS];
8a4e3a85 1771
1c1e46ed
AW
1772 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1773 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1774 stats->n_flows += cmap_count(&pmd->flow_table);
82a48ead
JS
1775 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1776 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
60d8ccae 1777 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
82a48ead
JS
1778 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1779 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1780 stats->n_lost += pmd_stats[PMD_STAT_LOST];
51852a57 1781 }
1ce3fa06 1782 stats->n_masks = UINT32_MAX;
847108dc 1783 stats->n_mask_hit = UINT64_MAX;
5279f8fd 1784
72865317
BP
1785 return 0;
1786}
1787
e4cfed38 1788static void
65f13b50 1789dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1790{
accf8626 1791 if (pmd->core_id == NON_PMD_CORE_ID) {
d0cca6c3
DDP
1792 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1793 ovs_mutex_lock(&pmd->port_mutex);
1794 pmd_load_cached_ports(pmd);
1795 ovs_mutex_unlock(&pmd->port_mutex);
1796 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
accf8626
AW
1797 return;
1798 }
1799
2788a1b1 1800 seq_change(pmd->reload_seq);
299c8d61 1801 atomic_store_explicit(&pmd->reload, true, memory_order_release);
65f13b50 1802}
e4cfed38 1803
59e6d833
BP
1804static uint32_t
1805hash_port_no(odp_port_t port_no)
1806{
1807 return hash_int(odp_to_u32(port_no), 0);
1808}
1809
72865317 1810static int
a3e8437a 1811port_create(const char *devname, const char *type,
b8d29252 1812 odp_port_t port_no, struct dp_netdev_port **portp)
72865317 1813{
72865317 1814 struct dp_netdev_port *port;
2499a8ce 1815 enum netdev_flags flags;
b8d29252 1816 struct netdev *netdev;
e32971b8 1817 int error;
72865317 1818
b8d29252 1819 *portp = NULL;
72865317
BP
1820
1821 /* Open and validate network device. */
a3e8437a 1822 error = netdev_open(devname, type, &netdev);
72865317 1823 if (error) {
b8d29252 1824 return error;
72865317 1825 }
72865317
BP
1826 /* XXX reject non-Ethernet devices */
1827
2499a8ce
AC
1828 netdev_get_flags(netdev, &flags);
1829 if (flags & NETDEV_LOOPBACK) {
1830 VLOG_ERR("%s: cannot add a loopback device", devname);
d17f4f08 1831 error = EINVAL;
b8d29252 1832 goto out;
2499a8ce
AC
1833 }
1834
e4cfed38 1835 port = xzalloc(sizeof *port);
35303d71 1836 port->port_no = port_no;
e4cfed38
PS
1837 port->netdev = netdev;
1838 port->type = xstrdup(type);
96e74404 1839 port->sf = NULL;
2fbadeb6 1840 port->emc_enabled = true;
e32971b8
DDP
1841 port->need_reconfigure = true;
1842 ovs_mutex_init(&port->txq_used_mutex);
e4cfed38 1843
b8d29252 1844 *portp = port;
72865317
BP
1845
1846 return 0;
d17f4f08 1847
d17f4f08 1848out:
b8d29252 1849 netdev_close(netdev);
d17f4f08 1850 return error;
72865317
BP
1851}
1852
b8d29252
DDP
1853static int
1854do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1855 odp_port_t port_no)
1856 OVS_REQUIRES(dp->port_mutex)
1857{
96e74404 1858 struct netdev_saved_flags *sf;
b8d29252
DDP
1859 struct dp_netdev_port *port;
1860 int error;
1861
1862 /* Reject devices already in 'dp'. */
1863 if (!get_port_by_name(dp, devname, &port)) {
1864 return EEXIST;
1865 }
1866
a3e8437a 1867 error = port_create(devname, type, port_no, &port);
b8d29252
DDP
1868 if (error) {
1869 return error;
1870 }
1871
e9985d6a 1872 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
b8d29252
DDP
1873 seq_change(dp->port_seq);
1874
e32971b8
DDP
1875 reconfigure_datapath(dp);
1876
3f51ea18 1877 /* Check that port was successfully configured. */
96e74404
IM
1878 if (!dp_netdev_lookup_port(dp, port_no)) {
1879 return EINVAL;
1880 }
1881
 1882 /* Updating device flags triggers an if_notifier, which triggers a bridge
 1883 * reconfiguration and another attempt to add this port, leading to an
 1884 * infinite loop if the device is configured incorrectly and cannot be
 1885 * added. So set the promisc mode only after a successful reconfiguration,
 1886 * when we already know that the device is properly configured. */
1887 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1888 if (error) {
1889 VLOG_ERR("%s: cannot set promisc flag", devname);
1890 do_del_port(dp, port);
1891 return error;
1892 }
1893 port->sf = sf;
1894
1895 return 0;
b8d29252
DDP
1896}
1897
247527db
BP
1898static int
1899dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1900 odp_port_t *port_nop)
247527db
BP
1901{
1902 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
1903 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1904 const char *dpif_port;
4e022ec0 1905 odp_port_t port_no;
5279f8fd 1906 int error;
247527db 1907
59e6d833 1908 ovs_mutex_lock(&dp->port_mutex);
3aa30359 1909 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 1910 if (*port_nop != ODPP_NONE) {
ff073a71
BP
1911 port_no = *port_nop;
1912 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 1913 } else {
3aa30359 1914 port_no = choose_port(dp, dpif_port);
5279f8fd 1915 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 1916 }
5279f8fd 1917 if (!error) {
247527db 1918 *port_nop = port_no;
5279f8fd 1919 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 1920 }
59e6d833 1921 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1922
1923 return error;
72865317
BP
1924}
1925
1926static int
4e022ec0 1927dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
1928{
1929 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
1930 int error;
1931
59e6d833 1932 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
1933 if (port_no == ODPP_LOCAL) {
1934 error = EINVAL;
1935 } else {
1936 struct dp_netdev_port *port;
1937
1938 error = get_port_by_number(dp, port_no, &port);
1939 if (!error) {
1940 do_del_port(dp, port);
1941 }
1942 }
59e6d833 1943 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1944
1945 return error;
72865317
BP
1946}
1947
1948static bool
4e022ec0 1949is_valid_port_number(odp_port_t port_no)
72865317 1950{
ff073a71
BP
1951 return port_no != ODPP_NONE;
1952}
1953
1954static struct dp_netdev_port *
1955dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
e9985d6a 1956 OVS_REQUIRES(dp->port_mutex)
ff073a71
BP
1957{
1958 struct dp_netdev_port *port;
1959
e9985d6a 1960 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
35303d71 1961 if (port->port_no == port_no) {
ff073a71
BP
1962 return port;
1963 }
1964 }
1965 return NULL;
72865317
BP
1966}
1967
1968static int
1969get_port_by_number(struct dp_netdev *dp,
4e022ec0 1970 odp_port_t port_no, struct dp_netdev_port **portp)
e9985d6a 1971 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1972{
1973 if (!is_valid_port_number(port_no)) {
1974 *portp = NULL;
1975 return EINVAL;
1976 } else {
ff073a71 1977 *portp = dp_netdev_lookup_port(dp, port_no);
0f6a066f 1978 return *portp ? 0 : ENODEV;
72865317
BP
1979 }
1980}
1981
b284085e 1982static void
62453dad 1983port_destroy(struct dp_netdev_port *port)
b284085e 1984{
62453dad
DDP
1985 if (!port) {
1986 return;
b284085e 1987 }
b284085e 1988
62453dad
DDP
1989 netdev_close(port->netdev);
1990 netdev_restore_flags(port->sf);
accf8626 1991
62453dad 1992 for (unsigned i = 0; i < port->n_rxq; i++) {
947dc567 1993 netdev_rxq_close(port->rxqs[i].rx);
b284085e 1994 }
324c8374 1995 ovs_mutex_destroy(&port->txq_used_mutex);
3eb67853 1996 free(port->rxq_affinity_list);
324c8374 1997 free(port->txq_used);
3eb67853 1998 free(port->rxqs);
62453dad
DDP
1999 free(port->type);
2000 free(port);
b284085e
PS
2001}
2002
72865317
BP
2003static int
2004get_port_by_name(struct dp_netdev *dp,
2005 const char *devname, struct dp_netdev_port **portp)
59e6d833 2006 OVS_REQUIRES(dp->port_mutex)
72865317
BP
2007{
2008 struct dp_netdev_port *port;
2009
e9985d6a 2010 HMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 2011 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
2012 *portp = port;
2013 return 0;
2014 }
2015 }
0f6a066f
DDP
2016
 2017 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
 2018 * non-existent port. */
2019 return ENODEV;
72865317
BP
2020}
2021
b9584f21 2022/* Returns 'true' if there is a port with pmd netdev. */
65f13b50 2023static bool
b9584f21 2024has_pmd_port(struct dp_netdev *dp)
e9985d6a 2025 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
2026{
2027 struct dp_netdev_port *port;
2028
e9985d6a 2029 HMAP_FOR_EACH (port, node, &dp->ports) {
5dd57e80 2030 if (netdev_is_pmd(port->netdev)) {
b9584f21 2031 return true;
65f13b50
AW
2032 }
2033 }
2034
2035 return false;
2036}
2037
c40b890f
BP
2038static void
2039do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 2040 OVS_REQUIRES(dp->port_mutex)
72865317 2041{
e9985d6a 2042 hmap_remove(&dp->ports, &port->node);
d33ed218 2043 seq_change(dp->port_seq);
d0cca6c3 2044
e32971b8 2045 reconfigure_datapath(dp);
72865317 2046
62453dad 2047 port_destroy(port);
72865317
BP
2048}
2049
2050static void
4c738a8d
BP
2051answer_port_query(const struct dp_netdev_port *port,
2052 struct dpif_port *dpif_port)
72865317 2053{
3efb6063 2054 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 2055 dpif_port->type = xstrdup(port->type);
35303d71 2056 dpif_port->port_no = port->port_no;
72865317
BP
2057}
2058
2059static int
4e022ec0 2060dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 2061 struct dpif_port *dpif_port)
72865317
BP
2062{
2063 struct dp_netdev *dp = get_dp_netdev(dpif);
2064 struct dp_netdev_port *port;
2065 int error;
2066
e9985d6a 2067 ovs_mutex_lock(&dp->port_mutex);
72865317 2068 error = get_port_by_number(dp, port_no, &port);
4afba28d 2069 if (!error && dpif_port) {
4c738a8d 2070 answer_port_query(port, dpif_port);
72865317 2071 }
e9985d6a 2072 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2073
72865317
BP
2074 return error;
2075}
2076
2077static int
2078dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 2079 struct dpif_port *dpif_port)
72865317
BP
2080{
2081 struct dp_netdev *dp = get_dp_netdev(dpif);
2082 struct dp_netdev_port *port;
2083 int error;
2084
59e6d833 2085 ovs_mutex_lock(&dp->port_mutex);
72865317 2086 error = get_port_by_name(dp, devname, &port);
4afba28d 2087 if (!error && dpif_port) {
4c738a8d 2088 answer_port_query(port, dpif_port);
72865317 2089 }
59e6d833 2090 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2091
72865317
BP
2092 return error;
2093}
2094
61e7deb1
BP
2095static void
2096dp_netdev_flow_free(struct dp_netdev_flow *flow)
2097{
61e7deb1 2098 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
2099 free(flow);
2100}
2101
ed79f89a
DDP
2102static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2103{
2104 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2105 ovsrcu_postpone(dp_netdev_flow_free, flow);
2106 }
2107}
2108
70e5ed6f
JS
2109static uint32_t
2110dp_netdev_flow_hash(const ovs_u128 *ufid)
2111{
2112 return ufid->u32[0];
2113}
2114
3453b4d6
JS
2115static inline struct dpcls *
2116dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2117 odp_port_t in_port)
2118{
2119 struct dpcls *cls;
2120 uint32_t hash = hash_port_no(in_port);
2121 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2122 if (cls->in_port == in_port) {
2123 /* Port classifier exists already */
2124 return cls;
2125 }
2126 }
2127 return NULL;
2128}
2129
2130static inline struct dpcls *
2131dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2132 odp_port_t in_port)
2133 OVS_REQUIRES(pmd->flow_mutex)
2134{
2135 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2136 uint32_t hash = hash_port_no(in_port);
2137
2138 if (!cls) {
2139 /* Create new classifier for in_port */
2140 cls = xmalloc(sizeof(*cls));
2141 dpcls_init(cls);
2142 cls->in_port = in_port;
2143 cmap_insert(&pmd->classifiers, &cls->node, hash);
2144 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2145 }
2146 return cls;
2147}
2148
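/*
 * Illustrative sketch, not part of the original dpif-netdev.c: the
 * "one classifier per input port" layout used by the two functions above,
 * reduced to a toy open-addressed table.  toy_dpcls, toy_lookup() and
 * toy_find() are invented names; the real code keys a per-PMD cmap by
 * hash_port_no(in_port).  Because every packet received on a given port is
 * classified against that port's own structure, in_port itself never has
 * to be part of any subtable mask.
 */
#include <stddef.h>
#include <stdint.h>

#define TOY_N_SLOTS 16

struct toy_dpcls {
    uint32_t in_port;            /* 0 == unused slot in this toy example. */
};

static struct toy_dpcls toy_slots[TOY_N_SLOTS];

static struct toy_dpcls *
toy_lookup(uint32_t in_port)
{
    uint32_t h = in_port * UINT32_C(0x9E3779B1);   /* stand-in hash */

    for (size_t i = 0; i < TOY_N_SLOTS; i++) {
        struct toy_dpcls *cls = &toy_slots[(h + i) % TOY_N_SLOTS];

        if (cls->in_port == in_port) {
            return cls;                            /* classifier exists */
        }
    }
    return NULL;
}

static struct toy_dpcls *
toy_find(uint32_t in_port)
{
    struct toy_dpcls *cls = toy_lookup(in_port);
    uint32_t h = in_port * UINT32_C(0x9E3779B1);

    for (size_t i = 0; !cls && i < TOY_N_SLOTS; i++) {
        struct toy_dpcls *slot = &toy_slots[(h + i) % TOY_N_SLOTS];

        if (slot->in_port == 0) {                  /* create on first use */
            slot->in_port = in_port;
            cls = slot;
        }
    }
    return cls;
}
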
241bad15
YL
2149#define MAX_FLOW_MARK (UINT32_MAX - 1)
2150#define INVALID_FLOW_MARK (UINT32_MAX)
2151
2152struct megaflow_to_mark_data {
2153 const struct cmap_node node;
2154 ovs_u128 mega_ufid;
2155 uint32_t mark;
2156};
2157
2158struct flow_mark {
2159 struct cmap megaflow_to_mark;
2160 struct cmap mark_to_flow;
2161 struct id_pool *pool;
241bad15
YL
2162};
2163
2164static struct flow_mark flow_mark = {
2165 .megaflow_to_mark = CMAP_INITIALIZER,
2166 .mark_to_flow = CMAP_INITIALIZER,
241bad15
YL
2167};
2168
2169static uint32_t
2170flow_mark_alloc(void)
2171{
2172 uint32_t mark;
2173
2174 if (!flow_mark.pool) {
 2175 /* Not initialized yet; do it here. */
2176 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2177 }
2178
2179 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2180 return mark;
2181 }
2182
2183 return INVALID_FLOW_MARK;
2184}
2185
2186static void
2187flow_mark_free(uint32_t mark)
2188{
2189 id_pool_free_id(flow_mark.pool, mark);
2190}
2191
 2192/* Associate a megaflow with a mark; this is a 1:1 mapping. */
2193static void
2194megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2195{
2196 size_t hash = dp_netdev_flow_hash(mega_ufid);
2197 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2198
2199 data->mega_ufid = *mega_ufid;
2200 data->mark = mark;
2201
2202 cmap_insert(&flow_mark.megaflow_to_mark,
2203 CONST_CAST(struct cmap_node *, &data->node), hash);
2204}
2205
 2206/* Disassociate a megaflow from its mark. */
2207static void
2208megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2209{
2210 size_t hash = dp_netdev_flow_hash(mega_ufid);
2211 struct megaflow_to_mark_data *data;
2212
2213 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2214 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2215 cmap_remove(&flow_mark.megaflow_to_mark,
2216 CONST_CAST(struct cmap_node *, &data->node), hash);
5752eae4 2217 ovsrcu_postpone(free, data);
241bad15
YL
2218 return;
2219 }
2220 }
2221
2222 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2223 UUID_ARGS((struct uuid *)mega_ufid));
2224}
2225
2226static inline uint32_t
2227megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2228{
2229 size_t hash = dp_netdev_flow_hash(mega_ufid);
2230 struct megaflow_to_mark_data *data;
2231
2232 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2233 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2234 return data->mark;
2235 }
2236 }
2237
5d1765d3
IM
2238 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2239 UUID_ARGS((struct uuid *)mega_ufid));
241bad15
YL
2240 return INVALID_FLOW_MARK;
2241}
2242
 2243/* Associate a mark with a flow; this is a 1:N mapping. */
2244static void
2245mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2246{
2247 dp_netdev_flow_ref(flow);
2248
2249 cmap_insert(&flow_mark.mark_to_flow,
2250 CONST_CAST(struct cmap_node *, &flow->mark_node),
2251 hash_int(mark, 0));
2252 flow->mark = mark;
2253
2254 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2255}
2256
2257static bool
2258flow_mark_has_no_ref(uint32_t mark)
2259{
2260 struct dp_netdev_flow *flow;
2261
2262 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2263 &flow_mark.mark_to_flow) {
2264 if (flow->mark == mark) {
2265 return false;
2266 }
2267 }
2268
2269 return true;
2270}
2271
2272static int
2273mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2274 struct dp_netdev_flow *flow)
2275{
2276 int ret = 0;
2277 uint32_t mark = flow->mark;
2278 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2279 &flow->mark_node);
2280
2281 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2282 flow->mark = INVALID_FLOW_MARK;
2283
2284 /*
 2285 * If no flow is referencing the mark any more, remove the flow
 2286 * from hardware and free the mark.
2287 */
2288 if (flow_mark_has_no_ref(mark)) {
30115809 2289 struct netdev *port;
241bad15
YL
2290 odp_port_t in_port = flow->flow.in_port.odp_port;
2291
1061dc7c 2292 port = netdev_ports_get(in_port, pmd->dp->class);
241bad15 2293 if (port) {
e7cb123f
IM
2294 /* Taking a global 'port_mutex' to fulfill thread safety
2295 * restrictions for the netdev-offload-dpdk module. */
2296 ovs_mutex_lock(&pmd->dp->port_mutex);
30115809 2297 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
e7cb123f 2298 ovs_mutex_unlock(&pmd->dp->port_mutex);
30115809 2299 netdev_close(port);
241bad15 2300 }
241bad15
YL
2301
2302 flow_mark_free(mark);
2303 VLOG_DBG("Freed flow mark %u\n", mark);
2304
2305 megaflow_to_mark_disassociate(&flow->mega_ufid);
2306 }
2307 dp_netdev_flow_unref(flow);
2308
2309 return ret;
2310}
2311
2312static void
2313flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2314{
2315 struct dp_netdev_flow *flow;
2316
2317 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2318 if (flow->pmd_id == pmd->core_id) {
02bb2824 2319 queue_netdev_flow_del(pmd, flow);
241bad15
YL
2320 }
2321 }
2322}
2323
aab96ec4
YL
2324static struct dp_netdev_flow *
2325mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2326 const uint32_t mark)
2327{
2328 struct dp_netdev_flow *flow;
2329
2330 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2331 &flow_mark.mark_to_flow) {
2332 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2333 flow->dead == false) {
2334 return flow;
2335 }
2336 }
2337
2338 return NULL;
2339}
2340
02bb2824
YL
2341static struct dp_flow_offload_item *
2342dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2343 struct dp_netdev_flow *flow,
2344 int op)
2345{
2346 struct dp_flow_offload_item *offload;
2347
2348 offload = xzalloc(sizeof(*offload));
2349 offload->pmd = pmd;
2350 offload->flow = flow;
2351 offload->op = op;
2352
2353 dp_netdev_flow_ref(flow);
2354 dp_netdev_pmd_try_ref(pmd);
2355
2356 return offload;
2357}
2358
2359static void
2360dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2361{
2362 dp_netdev_pmd_unref(offload->pmd);
2363 dp_netdev_flow_unref(offload->flow);
2364
2365 free(offload->actions);
2366 free(offload);
2367}
2368
2369static void
2370dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2371{
2372 ovs_mutex_lock(&dp_flow_offload.mutex);
2373 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2374 xpthread_cond_signal(&dp_flow_offload.cond);
2375 ovs_mutex_unlock(&dp_flow_offload.mutex);
2376}
2377
2378static int
2379dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2380{
2381 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2382}
2383
2384/*
2385 * There are two flow offload operations here: addition and modification.
2386 *
2387 * For flow addition, this function does:
 2388 * 1. allocate a new flow mark id
 2389 * 2. perform hardware flow offload
 2390 * 3. associate the flow mark with the flow and the mega flow
 2391 *
 2392 * For flow modification, both the flow mark and the associations are still
 2393 * valid, so only step 2 is needed.
2394 */
2395static int
2396dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2397{
02bb2824 2398 struct dp_netdev_pmd_thread *pmd = offload->pmd;
319a9bb3 2399 const struct dpif_class *dpif_class = pmd->dp->class;
02bb2824
YL
2400 struct dp_netdev_flow *flow = offload->flow;
2401 odp_port_t in_port = flow->flow.in_port.odp_port;
2402 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2403 struct offload_info info;
30115809 2404 struct netdev *port;
02bb2824
YL
2405 uint32_t mark;
2406 int ret;
2407
2408 if (flow->dead) {
2409 return -1;
2410 }
2411
2412 if (modification) {
2413 mark = flow->mark;
2414 ovs_assert(mark != INVALID_FLOW_MARK);
2415 } else {
2416 /*
2417 * If a mega flow has already been offloaded (from other PMD
2418 * instances), do not offload it again.
2419 */
2420 mark = megaflow_to_mark_find(&flow->mega_ufid);
2421 if (mark != INVALID_FLOW_MARK) {
2422 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2423 if (flow->mark != INVALID_FLOW_MARK) {
2424 ovs_assert(flow->mark == mark);
2425 } else {
2426 mark_to_flow_associate(mark, flow);
2427 }
2428 return 0;
2429 }
2430
2431 mark = flow_mark_alloc();
 2432 if (mark == INVALID_FLOW_MARK) {
 2433 VLOG_ERR("Failed to allocate flow mark!\n");
 2434 return -1;
      }
2435 }
2436 info.flow_mark = mark;
319a9bb3 2437 info.dpif_class = dpif_class;
02bb2824 2438
1061dc7c 2439 port = netdev_ports_get(in_port, pmd->dp->class);
30115809
IM
2440 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2441 netdev_close(port);
0a5cba65 2442 goto err_free;
02bb2824 2443 }
e7cb123f
IM
2444 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2445 * the netdev-offload-dpdk module. */
2446 ovs_mutex_lock(&pmd->dp->port_mutex);
30115809 2447 ret = netdev_flow_put(port, &offload->match,
02bb2824
YL
2448 CONST_CAST(struct nlattr *, offload->actions),
2449 offload->actions_len, &flow->mega_ufid, &info,
2450 NULL);
e7cb123f 2451 ovs_mutex_unlock(&pmd->dp->port_mutex);
30115809 2452 netdev_close(port);
02bb2824
YL
2453
2454 if (ret) {
0a5cba65 2455 goto err_free;
02bb2824
YL
2456 }
2457
2458 if (!modification) {
2459 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2460 mark_to_flow_associate(mark, flow);
2461 }
02bb2824 2462 return 0;
0a5cba65
IM
2463
2464err_free:
2465 if (!modification) {
2466 flow_mark_free(mark);
2467 } else {
2468 mark_to_flow_disassociate(pmd, flow);
2469 }
2470 return -1;
02bb2824
YL
2471}
2472
2473static void *
2474dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2475{
2476 struct dp_flow_offload_item *offload;
2477 struct ovs_list *list;
2478 const char *op;
2479 int ret;
2480
2481 for (;;) {
2482 ovs_mutex_lock(&dp_flow_offload.mutex);
2483 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2484 ovsrcu_quiesce_start();
2485 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2486 &dp_flow_offload.mutex);
6c95dbf9 2487 ovsrcu_quiesce_end();
02bb2824
YL
2488 }
2489 list = ovs_list_pop_front(&dp_flow_offload.list);
2490 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2491 ovs_mutex_unlock(&dp_flow_offload.mutex);
2492
2493 switch (offload->op) {
2494 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2495 op = "add";
2496 ret = dp_netdev_flow_offload_put(offload);
2497 break;
2498 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2499 op = "modify";
2500 ret = dp_netdev_flow_offload_put(offload);
2501 break;
2502 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2503 op = "delete";
2504 ret = dp_netdev_flow_offload_del(offload);
2505 break;
2506 default:
2507 OVS_NOT_REACHED();
2508 }
2509
2510 VLOG_DBG("%s to %s netdev flow\n",
2511 ret == 0 ? "succeed" : "failed", op);
2512 dp_netdev_free_flow_offload(offload);
2513 }
2514
2515 return NULL;
2516}
2517
2518static void
2519queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2520 struct dp_netdev_flow *flow)
2521{
2522 struct dp_flow_offload_item *offload;
2523
2524 if (ovsthread_once_start(&offload_thread_once)) {
2525 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2526 ovs_thread_create("dp_netdev_flow_offload",
2527 dp_netdev_flow_offload_main, NULL);
2528 ovsthread_once_done(&offload_thread_once);
2529 }
2530
2531 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2532 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2533 dp_netdev_append_flow_offload(offload);
2534}
2535
2536static void
2537queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2538 struct dp_netdev_flow *flow, struct match *match,
2539 const struct nlattr *actions, size_t actions_len)
2540{
2541 struct dp_flow_offload_item *offload;
2542 int op;
2543
2544 if (!netdev_is_flow_api_enabled()) {
2545 return;
2546 }
2547
2548 if (ovsthread_once_start(&offload_thread_once)) {
2549 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2550 ovs_thread_create("dp_netdev_flow_offload",
2551 dp_netdev_flow_offload_main, NULL);
2552 ovsthread_once_done(&offload_thread_once);
2553 }
2554
2555 if (flow->mark != INVALID_FLOW_MARK) {
2556 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2557 } else {
2558 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2559 }
2560 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2561 offload->match = *match;
2562 offload->actions = xmalloc(actions_len);
2563 memcpy(offload->actions, actions, actions_len);
2564 offload->actions_len = actions_len;
2565
2566 dp_netdev_append_flow_offload(offload);
2567}
2568
72865317 2569static void
1c1e46ed
AW
2570dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2571 struct dp_netdev_flow *flow)
2572 OVS_REQUIRES(pmd->flow_mutex)
72865317 2573{
9f361d6b 2574 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
3453b4d6
JS
2575 struct dpcls *cls;
2576 odp_port_t in_port = flow->flow.in_port.odp_port;
2c0ea78f 2577
3453b4d6
JS
2578 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2579 ovs_assert(cls != NULL);
2580 dpcls_remove(cls, &flow->cr);
1c1e46ed 2581 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
241bad15 2582 if (flow->mark != INVALID_FLOW_MARK) {
02bb2824 2583 queue_netdev_flow_del(pmd, flow);
241bad15 2584 }
9bbf1c3d 2585 flow->dead = true;
ed79f89a
DDP
2586
2587 dp_netdev_flow_unref(flow);
72865317
BP
2588}
2589
2590static void
1c1e46ed 2591dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 2592{
78c8df12 2593 struct dp_netdev_flow *netdev_flow;
72865317 2594
1c1e46ed
AW
2595 ovs_mutex_lock(&pmd->flow_mutex);
2596 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2597 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2598 }
1c1e46ed 2599 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
2600}
2601
2602static int
2603dpif_netdev_flow_flush(struct dpif *dpif)
2604{
2605 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
2606 struct dp_netdev_pmd_thread *pmd;
2607
2608 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2609 dp_netdev_pmd_flow_flush(pmd);
2610 }
5279f8fd 2611
72865317
BP
2612 return 0;
2613}
2614
b0ec0f27 2615struct dp_netdev_port_state {
e9985d6a 2616 struct hmap_position position;
4c738a8d 2617 char *name;
b0ec0f27
BP
2618};
2619
2620static int
2621dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2622{
2623 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2624 return 0;
2625}
2626
72865317 2627static int
b0ec0f27 2628dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 2629 struct dpif_port *dpif_port)
72865317 2630{
b0ec0f27 2631 struct dp_netdev_port_state *state = state_;
72865317 2632 struct dp_netdev *dp = get_dp_netdev(dpif);
e9985d6a 2633 struct hmap_node *node;
ff073a71 2634 int retval;
72865317 2635
e9985d6a
DDP
2636 ovs_mutex_lock(&dp->port_mutex);
2637 node = hmap_at_position(&dp->ports, &state->position);
ff073a71
BP
2638 if (node) {
2639 struct dp_netdev_port *port;
5279f8fd 2640
ff073a71
BP
2641 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2642
2643 free(state->name);
2644 state->name = xstrdup(netdev_get_name(port->netdev));
2645 dpif_port->name = state->name;
2646 dpif_port->type = port->type;
35303d71 2647 dpif_port->port_no = port->port_no;
ff073a71
BP
2648
2649 retval = 0;
2650 } else {
2651 retval = EOF;
72865317 2652 }
e9985d6a 2653 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 2654
ff073a71 2655 return retval;
b0ec0f27
BP
2656}
2657
2658static int
4c738a8d 2659dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 2660{
4c738a8d
BP
2661 struct dp_netdev_port_state *state = state_;
2662 free(state->name);
b0ec0f27
BP
2663 free(state);
2664 return 0;
72865317
BP
2665}
2666
2667static int
67a4917b 2668dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
2669{
2670 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 2671 uint64_t new_port_seq;
5279f8fd
BP
2672 int error;
2673
d33ed218
BP
2674 new_port_seq = seq_read(dpif->dp->port_seq);
2675 if (dpif->last_port_seq != new_port_seq) {
2676 dpif->last_port_seq = new_port_seq;
5279f8fd 2677 error = ENOBUFS;
72865317 2678 } else {
5279f8fd 2679 error = EAGAIN;
72865317 2680 }
5279f8fd
BP
2681
2682 return error;
72865317
BP
2683}
2684
2685static void
2686dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2687{
2688 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 2689
d33ed218 2690 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
2691}
2692
2693static struct dp_netdev_flow *
0de8783a 2694dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
2695{
2696 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
2697}
2698
9bbf1c3d
DDP
2699static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2700{
2701 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2702}
2703
79df317f
DDP
2704/* netdev_flow_key utilities.
2705 *
2706 * netdev_flow_key is basically a miniflow. We use these functions
2707 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2708 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2709 *
2710 * - Since we are dealing exclusively with miniflows created by
2711 * miniflow_extract(), if the map is different the miniflow is different.
2712 * Therefore we can be faster by comparing the map and the miniflow in a
2713 * single memcmp().
5fcff47b 2714 * - These functions can be inlined by the compiler. */
79df317f 2715
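/*
 * Illustrative sketch, not part of the original dpif-netdev.c: why a single
 * memcmp() is a sound equality test for two keys produced by the same
 * extractor, as the comment above explains.  A miniflow stores a bitmap of
 * populated fields followed by the populated values packed back to back,
 * so if either the bitmaps or any populated value differ, the byte images
 * differ.  toy_miniflow, toy_miniflow_len() and toy_miniflow_equal() are
 * invented stand-ins for struct miniflow and the real helpers.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct toy_miniflow {
    uint64_t map;          /* Bitmap of which fields are populated. */
    uint64_t values[8];    /* Packed values; only the set bits are used. */
};

static size_t
toy_miniflow_len(const struct toy_miniflow *mf)
{
    size_t n = 0;

    for (uint64_t map = mf->map; map; map &= map - 1) {
        n++;                               /* portable popcount */
    }
    return sizeof mf->map + n * sizeof mf->values[0];
}

static bool
toy_miniflow_equal(const struct toy_miniflow *a, const struct toy_miniflow *b,
                   size_t a_len)
{
    /* 'a_len' spans the map plus a's packed values (mirroring
     * netdev_flow_key.len), so one memcmp() compares both the field bitmap
     * and every populated value. */
    return !memcmp(a, b, a_len);
}
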
361d808d 2716/* Given the number of bits set in miniflow's maps, returns the size of the
caeb4906 2717 * 'netdev_flow_key.mf' */
361d808d
JR
2718static inline size_t
2719netdev_flow_key_size(size_t flow_u64s)
79df317f 2720{
361d808d 2721 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
79df317f
DDP
2722}
2723
79df317f
DDP
2724static inline bool
2725netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
2726 const struct netdev_flow_key *b)
2727{
caeb4906
JR
2728 /* 'b->len' may be not set yet. */
2729 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
2730}
2731
2732/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
d79a39fe 2733 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
0de8783a
JR
2734 * generated by miniflow_extract. */
2735static inline bool
2736netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2737 const struct miniflow *mf)
79df317f 2738{
caeb4906 2739 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
2740}
2741
2742static inline void
2743netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
2744 const struct netdev_flow_key *src)
2745{
caeb4906
JR
2746 memcpy(dst, src,
2747 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
2748}
2749
0de8783a
JR
2750/* Initialize a netdev_flow_key 'mask' from 'match'. */
2751static inline void
2752netdev_flow_mask_init(struct netdev_flow_key *mask,
2753 const struct match *match)
2754{
09b0fa9c 2755 uint64_t *dst = miniflow_values(&mask->mf);
5fcff47b 2756 struct flowmap fmap;
0de8783a 2757 uint32_t hash = 0;
5fcff47b 2758 size_t idx;
0de8783a
JR
2759
2760 /* Only check masks that make sense for the flow. */
5fcff47b
JR
2761 flow_wc_map(&match->flow, &fmap);
2762 flowmap_init(&mask->mf.map);
0de8783a 2763
5fcff47b
JR
2764 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2765 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
0de8783a 2766
5fcff47b
JR
2767 if (mask_u64) {
2768 flowmap_set(&mask->mf.map, idx, 1);
2769 *dst++ = mask_u64;
2770 hash = hash_add64(hash, mask_u64);
0de8783a 2771 }
0de8783a
JR
2772 }
2773
5fcff47b 2774 map_t map;
0de8783a 2775
5fcff47b
JR
2776 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2777 hash = hash_add64(hash, map);
2778 }
0de8783a 2779
5fcff47b 2780 size_t n = dst - miniflow_get_values(&mask->mf);
0de8783a 2781
d70e8c28 2782 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
2783 mask->len = netdev_flow_key_size(n);
2784}
2785
361d808d 2786/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
0de8783a
JR
2787static inline void
2788netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2789 const struct flow *flow,
2790 const struct netdev_flow_key *mask)
79df317f 2791{
09b0fa9c
JR
2792 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2793 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
0de8783a 2794 uint32_t hash = 0;
d70e8c28 2795 uint64_t value;
0de8783a
JR
2796
2797 dst->len = mask->len;
361d808d 2798 dst->mf = mask->mf; /* Copy maps. */
0de8783a 2799
5fcff47b 2800 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
d70e8c28
JR
2801 *dst_u64 = value & *mask_u64++;
2802 hash = hash_add64(hash, *dst_u64++);
0de8783a 2803 }
09b0fa9c
JR
2804 dst->hash = hash_finish(hash,
2805 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
0de8783a
JR
2806}
2807
9bbf1c3d
DDP
2808static inline bool
2809emc_entry_alive(struct emc_entry *ce)
2810{
2811 return ce->flow && !ce->flow->dead;
2812}
2813
2814static void
2815emc_clear_entry(struct emc_entry *ce)
2816{
2817 if (ce->flow) {
2818 dp_netdev_flow_unref(ce->flow);
2819 ce->flow = NULL;
2820 }
2821}
2822
2823static inline void
2824emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 2825 const struct netdev_flow_key *key)
9bbf1c3d
DDP
2826{
2827 if (ce->flow != flow) {
2828 if (ce->flow) {
2829 dp_netdev_flow_unref(ce->flow);
2830 }
2831
2832 if (dp_netdev_flow_ref(flow)) {
2833 ce->flow = flow;
2834 } else {
2835 ce->flow = NULL;
2836 }
2837 }
0de8783a
JR
2838 if (key) {
2839 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
2840 }
2841}
2842
2843static inline void
0de8783a 2844emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
2845 struct dp_netdev_flow *flow)
2846{
2847 struct emc_entry *to_be_replaced = NULL;
2848 struct emc_entry *current_entry;
2849
0de8783a
JR
2850 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2851 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 2852 /* We found the entry with the 'mf' miniflow */
0de8783a 2853 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
2854 return;
2855 }
2856
2857 /* Replacement policy: put the flow in an empty (not alive) entry, or
2858 * in the first entry where it can be */
2859 if (!to_be_replaced
2860 || (emc_entry_alive(to_be_replaced)
2861 && !emc_entry_alive(current_entry))
0de8783a 2862 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
2863 to_be_replaced = current_entry;
2864 }
2865 }
2866 /* We didn't find the miniflow in the cache.
2867 * The 'to_be_replaced' entry is where the new flow will be stored */
2868
0de8783a 2869 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
2870}
2871
4c30b246
CL
2872static inline void
2873emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2874 const struct netdev_flow_key *key,
2875 struct dp_netdev_flow *flow)
2876{
 2877 /* Insert an entry into the EMC based on the probability threshold 'min'.
 2878 * By default the value is UINT32_MAX / 100, which yields an insertion
 2879 * probability of 1/100, i.e. 1%. */
2880
2fbadeb6 2881 uint32_t min = pmd->ctx.emc_insert_min;
4c30b246 2882
656238ee 2883 if (min && random_uint32() <= min) {
60d8ccae 2884 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
4c30b246
CL
2885 }
2886}
2887
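/*
 * Illustrative sketch, not part of the original dpif-netdev.c: the
 * arithmetic behind the probabilistic insertion above.  With an inverse
 * probability of N the threshold is UINT32_MAX / N, so a uniformly random
 * uint32_t is <= the threshold roughly once every N packets; N = 100 (the
 * default) is assumed here.
 */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    const uint32_t inv_prob = 100;
    const uint32_t min = UINT32_MAX / inv_prob;

    /* P(random_uint32() <= min) = (min + 1) / 2^32 ~= 1 / inv_prob. */
    printf("threshold %"PRIu32", insertion probability %.4f%%\n",
           min, 100.0 * ((double) min + 1) / 4294967296.0);
    return 0;
}
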
9bbf1c3d 2888static inline struct dp_netdev_flow *
0de8783a 2889emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
2890{
2891 struct emc_entry *current_entry;
2892
0de8783a
JR
2893 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2894 if (current_entry->key.hash == key->hash
2895 && emc_entry_alive(current_entry)
2896 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 2897
0de8783a 2898 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
2899 return current_entry->flow;
2900 }
2901 }
2902
2903 return NULL;
2904}
2905
60d8ccae
YW
2906static inline const struct cmap_node *
2907smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2908{
2909 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2910 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2911 uint16_t sig = hash >> 16;
2912 uint16_t index = UINT16_MAX;
2913
2914 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2915 if (bucket->sig[i] == sig) {
2916 index = bucket->flow_idx[i];
2917 break;
2918 }
2919 }
2920 if (index != UINT16_MAX) {
2921 return cmap_find_by_index(&pmd->flow_table, index);
2922 }
2923 return NULL;
2924}
2925
2926static void
2927smc_clear_entry(struct smc_bucket *b, int idx)
2928{
2929 b->flow_idx[idx] = UINT16_MAX;
2930}
2931
 2932/* Insert the flow_table index into the SMC. Insertion may fail when 1) the
 2933 * SMC is turned off or 2) the flow_table index is larger than a uint16_t
 2934 * can hold. If an SMC entry with the same signature already exists, its
 2935 * index is updated. Otherwise, an empty entry is taken if one is
 2936 * available; if there is neither an empty entry nor a matching signature,
 2937 * a random entry in the hashed bucket is overwritten. */
2938static inline void
2939smc_insert(struct dp_netdev_pmd_thread *pmd,
2940 const struct netdev_flow_key *key,
2941 uint32_t hash)
2942{
2943 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2944 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2945 uint16_t index;
2946 uint32_t cmap_index;
2947 bool smc_enable_db;
2948 int i;
2949
2950 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2951 if (!smc_enable_db) {
2952 return;
2953 }
2954
2955 cmap_index = cmap_find_index(&pmd->flow_table, hash);
2956 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2957
2958 /* If the index is larger than SMC can handle (uint16_t), we don't
2959 * insert */
2960 if (index == UINT16_MAX) {
2961 return;
2962 }
2963
2964 /* If an entry with same signature already exists, update the index */
2965 uint16_t sig = key->hash >> 16;
2966 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2967 if (bucket->sig[i] == sig) {
2968 bucket->flow_idx[i] = index;
2969 return;
2970 }
2971 }
2972 /* If there is an empty entry, occupy it. */
2973 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2974 if (bucket->flow_idx[i] == UINT16_MAX) {
2975 bucket->sig[i] = sig;
2976 bucket->flow_idx[i] = index;
2977 return;
2978 }
2979 }
2980 /* Otherwise, pick a random entry. */
2981 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2982 bucket->sig[i] = sig;
2983 bucket->flow_idx[i] = index;
2984}
2985
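/*
 * Illustrative sketch, not part of the original dpif-netdev.c: how the
 * signature match cache above splits a packet's 32-bit hash.  The low bits
 * (masked by SMC_MASK) pick a small fixed-size bucket and the high 16 bits
 * become the signature stored next to a 16-bit flow_table index.
 * TOY_N_BUCKETS is an invented stand-in for SMC_MASK + 1; a signature hit
 * is only a hint into flow_table, and the flow found there is still matched
 * against the packet before being used.
 */
#include <inttypes.h>
#include <stdio.h>

#define TOY_N_BUCKETS (1u << 18)

int
main(void)
{
    uint32_t hash = 0xDEADBEEF;                 /* example packet hash */
    uint32_t bucket = hash & (TOY_N_BUCKETS - 1);
    uint16_t sig = hash >> 16;

    printf("hash 0x%08"PRIx32" -> bucket %"PRIu32", signature 0x%04x\n",
           hash, bucket, (unsigned) sig);
    return 0;
}
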
72865317 2986static struct dp_netdev_flow *
3453b4d6
JS
2987dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2988 const struct netdev_flow_key *key,
2989 int *lookup_num_p)
2c0ea78f 2990{
3453b4d6 2991 struct dpcls *cls;
0de8783a 2992 struct dpcls_rule *rule;
f825fdd4
BP
2993 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2994 in_port.odp_port));
3453b4d6 2995 struct dp_netdev_flow *netdev_flow = NULL;
2c0ea78f 2996
3453b4d6
JS
2997 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2998 if (OVS_LIKELY(cls)) {
60d8ccae 2999 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3453b4d6
JS
3000 netdev_flow = dp_netdev_flow_cast(rule);
3001 }
8a4e3a85 3002 return netdev_flow;
2c0ea78f
GS
3003}
3004
3005static struct dp_netdev_flow *
1c1e46ed
AW
3006dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3007 const ovs_u128 *ufidp, const struct nlattr *key,
3008 size_t key_len)
72865317 3009{
1763b4b8 3010 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
3011 struct flow flow;
3012 ovs_u128 ufid;
3013
3014 /* If a UFID is not provided, determine one based on the key. */
3015 if (!ufidp && key && key_len
f0fb825a 3016 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
7a5e0ee7 3017 odp_flow_key_hash(&flow, sizeof flow, &ufid);
70e5ed6f
JS
3018 ufidp = &ufid;
3019 }
72865317 3020
70e5ed6f
JS
3021 if (ufidp) {
3022 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 3023 &pmd->flow_table) {
2ff8484b 3024 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
70e5ed6f
JS
3025 return netdev_flow;
3026 }
72865317
BP
3027 }
3028 }
8a4e3a85 3029
72865317
BP
3030 return NULL;
3031}
3032
a309e4f5
OM
3033static bool
3034dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3035 const struct dp_netdev_flow *netdev_flow,
3036 struct dpif_flow_stats *stats,
3037 struct dpif_flow_attrs *attrs)
3038{
3039 uint64_t act_buf[1024 / 8];
3040 struct nlattr *actions;
3041 struct netdev *netdev;
3042 struct match match;
3043 struct ofpbuf buf;
3044
3045 int ret = 0;
3046
3047 if (!netdev_is_flow_api_enabled()) {
3048 return false;
3049 }
3050
3051 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port, dp->class);
3052 if (!netdev) {
3053 return false;
3054 }
3055 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3056 /* Taking a global 'port_mutex' to fulfill thread safety
3057 * restrictions for the netdev-offload-dpdk module. */
3058 ovs_mutex_lock(&dp->port_mutex);
3059 ret = netdev_flow_get(netdev, &match, &actions, &netdev_flow->mega_ufid,
3060 stats, attrs, &buf);
3061 ovs_mutex_unlock(&dp->port_mutex);
3062 netdev_close(netdev);
3063 if (ret) {
3064 return false;
3065 }
3066
3067 return true;
3068}
3069
72865317 3070static void
a309e4f5
OM
3071get_dpif_flow_status(const struct dp_netdev *dp,
3072 const struct dp_netdev_flow *netdev_flow_,
3073 struct dpif_flow_stats *stats,
3074 struct dpif_flow_attrs *attrs)
feebdea2 3075{
a309e4f5
OM
3076 struct dpif_flow_stats offload_stats;
3077 struct dpif_flow_attrs offload_attrs;
eb94da30
DDP
3078 struct dp_netdev_flow *netdev_flow;
3079 unsigned long long n;
3080 long long used;
3081 uint16_t flags;
3082
3083 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3084
3085 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3086 stats->n_packets = n;
3087 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3088 stats->n_bytes = n;
3089 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3090 stats->used = used;
3091 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3092 stats->tcp_flags = flags;
a309e4f5
OM
3093
3094 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3095 &offload_stats, &offload_attrs)) {
3096 stats->n_packets += offload_stats.n_packets;
3097 stats->n_bytes += offload_stats.n_bytes;
3098 stats->used = MAX(stats->used, offload_stats.used);
3099 stats->tcp_flags |= offload_stats.tcp_flags;
3100 if (attrs) {
3101 attrs->offloaded = offload_attrs.offloaded;
3102 attrs->dp_layer = offload_attrs.dp_layer;
3103 }
3104 } else if (attrs) {
3105 attrs->offloaded = false;
3106 attrs->dp_layer = "ovs";
3107 }
72865317
BP
3108}
3109
7af12bd7
JS
3110/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3111 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3112 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3113 * protect them. */
6fe09f8c 3114static void
a309e4f5
OM
3115dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3116 const struct dp_netdev_flow *netdev_flow,
7af12bd7 3117 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 3118 struct dpif_flow *flow, bool terse)
6fe09f8c 3119{
64bb477f
JS
3120 if (terse) {
3121 memset(flow, 0, sizeof *flow);
3122 } else {
3123 struct flow_wildcards wc;
3124 struct dp_netdev_actions *actions;
3125 size_t offset;
5262eea1
JG
3126 struct odp_flow_key_parms odp_parms = {
3127 .flow = &netdev_flow->flow,
3128 .mask = &wc.masks,
2494ccd7 3129 .support = dp_netdev_support,
5262eea1 3130 };
64bb477f
JS
3131
3132 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
f4b835bb
JR
3133 /* in_port is exact matched, but we have left it out from the mask for
 3134 * optimization reasons. Add in_port back to the mask. */
3135 wc.masks.in_port.odp_port = ODPP_NONE;
64bb477f
JS
3136
3137 /* Key */
6fd6ed71 3138 offset = key_buf->size;
64bb477f 3139 flow->key = ofpbuf_tail(key_buf);
5262eea1 3140 odp_flow_key_from_flow(&odp_parms, key_buf);
6fd6ed71 3141 flow->key_len = key_buf->size - offset;
64bb477f
JS
3142
3143 /* Mask */
6fd6ed71 3144 offset = mask_buf->size;
64bb477f 3145 flow->mask = ofpbuf_tail(mask_buf);
ec1f6f32 3146 odp_parms.key_buf = key_buf;
5262eea1 3147 odp_flow_key_from_mask(&odp_parms, mask_buf);
6fd6ed71 3148 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
3149
3150 /* Actions */
3151 actions = dp_netdev_flow_get_actions(netdev_flow);
3152 flow->actions = actions->actions;
3153 flow->actions_len = actions->size;
3154 }
6fe09f8c 3155
70e5ed6f
JS
3156 flow->ufid = netdev_flow->ufid;
3157 flow->ufid_present = true;
1c1e46ed 3158 flow->pmd_id = netdev_flow->pmd_id;
0d6b401c 3159
a309e4f5 3160 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
0e8f5c6a
EF
3161
3162 struct ds extra_info = DS_EMPTY_INITIALIZER;
3163 size_t unit;
3164
3165 ds_put_cstr(&extra_info, "miniflow_bits(");
3166 FLOWMAP_FOR_EACH_UNIT (unit) {
3167 if (unit) {
3168 ds_put_char(&extra_info, ',');
3169 }
3170 ds_put_format(&extra_info, "%d",
3171 count_1bits(netdev_flow->cr.mask->mf.map.bits[unit]));
3172 }
3173 ds_put_char(&extra_info, ')');
3174 flow->attrs.dp_extra_info = ds_steal_cstr(&extra_info);
3175 ds_destroy(&extra_info);
6fe09f8c
JS
3176}
3177
36956a7d 3178static int
8c301900
JR
3179dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3180 const struct nlattr *mask_key,
3181 uint32_t mask_key_len, const struct flow *flow,
f0fb825a 3182 struct flow_wildcards *wc, bool probe)
8c301900 3183{
ca8d3442
DDP
3184 enum odp_key_fitness fitness;
3185
d40533fc 3186 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
ca8d3442 3187 if (fitness) {
f0fb825a
EG
3188 if (!probe) {
3189 /* This should not happen: it indicates that
3190 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3191 * disagree on the acceptable form of a mask. Log the problem
3192 * as an error, with enough details to enable debugging. */
3193 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3194
3195 if (!VLOG_DROP_ERR(&rl)) {
3196 struct ds s;
3197
3198 ds_init(&s);
3199 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3200 true);
3201 VLOG_ERR("internal error parsing flow mask %s (%s)",
3202 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3203 ds_destroy(&s);
3204 }
8c301900 3205 }
ca8d3442
DDP
3206
3207 return EINVAL;
8c301900
JR
3208 }
3209
3210 return 0;
3211}
3212
3213static int
3214dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
f0fb825a 3215 struct flow *flow, bool probe)
36956a7d 3216{
d40533fc 3217 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
f0fb825a
EG
3218 if (!probe) {
3219 /* This should not happen: it indicates that
3220 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3221 * the acceptable form of a flow. Log the problem as an error,
3222 * with enough details to enable debugging. */
3223 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3224
3225 if (!VLOG_DROP_ERR(&rl)) {
3226 struct ds s;
3227
3228 ds_init(&s);
3229 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3230 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3231 ds_destroy(&s);
3232 }
36956a7d
BP
3233 }
3234
3235 return EINVAL;
3236 }
3237
5cf3edb3 3238 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
07659514
JS
3239 return EINVAL;
3240 }
3241
36956a7d
BP
3242 return 0;
3243}
3244
72865317 3245static int
6fe09f8c 3246dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
3247{
3248 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 3249 struct dp_netdev_flow *netdev_flow;
1c1e46ed 3250 struct dp_netdev_pmd_thread *pmd;
c673049c
IM
3251 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3252 struct hmapx_node *node;
3253 int error = EINVAL;
3254
3255 if (get->pmd_id == PMD_ID_NULL) {
3256 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3257 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3258 dp_netdev_pmd_unref(pmd);
3259 }
3260 }
3261 } else {
3262 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3263 if (!pmd) {
3264 goto out;
3265 }
3266 hmapx_add(&to_find, pmd);
1c1e46ed
AW
3267 }
3268
c673049c
IM
3269 if (!hmapx_count(&to_find)) {
3270 goto out;
72865317 3271 }
1c1e46ed 3272
c673049c
IM
3273 HMAPX_FOR_EACH (node, &to_find) {
3274 pmd = (struct dp_netdev_pmd_thread *) node->data;
3275 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3276 get->key_len);
3277 if (netdev_flow) {
a309e4f5
OM
3278 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3279 get->buffer, get->flow, false);
c673049c
IM
3280 error = 0;
3281 break;
3282 } else {
3283 error = ENOENT;
3284 }
3285 }
bc4a05c6 3286
c673049c
IM
3287 HMAPX_FOR_EACH (node, &to_find) {
3288 pmd = (struct dp_netdev_pmd_thread *) node->data;
3289 dp_netdev_pmd_unref(pmd);
3290 }
3291out:
3292 hmapx_destroy(&to_find);
5279f8fd 3293 return error;
72865317
BP
3294}
3295
241bad15
YL
3296static void
3297dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3298{
3299 struct flow masked_flow;
3300 size_t i;
3301
3302 for (i = 0; i < sizeof(struct flow); i++) {
3303 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3304 ((uint8_t *)&match->wc)[i];
3305 }
7a5e0ee7 3306 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
241bad15
YL
3307}
3308
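/* Illustrative sketch (not part of the original file): the byte-wise masking
 * in dp_netdev_get_mega_ufid() above means that two flows which differ only
 * in wildcarded bytes hash to the same mega ufid.  The standalone toy below
 * mirrors that loop; 'toy_hash' merely stands in for odp_flow_key_hash() and
 * both helper names are inventions of this sketch. */
#include <stddef.h>
#include <stdint.h>

static uint32_t
toy_hash(const uint8_t *buf, size_t len)
{
    uint32_t hash = 2166136261u;            /* FNV-1a, illustration only. */
    for (size_t i = 0; i < len; i++) {
        hash = (hash ^ buf[i]) * 16777619u;
    }
    return hash;
}

static uint32_t
toy_mega_ufid(const uint8_t *flow, const uint8_t *mask, size_t len)
{
    uint8_t masked[256];
    size_t n = len < sizeof masked ? len : sizeof masked;

    for (size_t i = 0; i < n; i++) {
        masked[i] = flow[i] & mask[i];      /* Same masking as the loop above. */
    }
    return toy_hash(masked, n);
}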
0de8783a 3309static struct dp_netdev_flow *
1c1e46ed
AW
3310dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3311 struct match *match, const ovs_u128 *ufid,
ae2ceebd 3312 const struct nlattr *actions, size_t actions_len)
1c1e46ed 3313 OVS_REQUIRES(pmd->flow_mutex)
72865317 3314{
0de8783a
JR
3315 struct dp_netdev_flow *flow;
3316 struct netdev_flow_key mask;
3453b4d6 3317 struct dpcls *cls;
f4b835bb
JR
3318
3319 /* Make sure in_port is exact matched before we read it. */
3320 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3453b4d6 3321 odp_port_t in_port = match->flow.in_port.odp_port;
ed79f89a 3322
f4b835bb
JR
3323 /* As we select the dpcls based on the port number, each netdev flow
3324 * belonging to the same dpcls will have the same odp_port value.
3325 * For performance reasons we wildcard odp_port here in the mask. In the
3326 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3327 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3328 * will not be part of the subtable mask.
3329 * This will speed up the hash computation during dpcls_lookup() because
3330 * there is one less call to hash_add64() in this case. */
3331 match->wc.masks.in_port.odp_port = 0;
0de8783a 3332 netdev_flow_mask_init(&mask, match);
f4b835bb
JR
3333 match->wc.masks.in_port.odp_port = ODPP_NONE;
3334
0de8783a 3335 /* Make sure wc does not have metadata. */
5fcff47b
JR
3336 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3337 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
679ba04c 3338
0de8783a 3339 /* Do not allocate extra space. */
caeb4906 3340 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 3341 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 3342 flow->dead = false;
11e5cf1f 3343 flow->batch = NULL;
241bad15 3344 flow->mark = INVALID_FLOW_MARK;
bd5131ba 3345 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 3346 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 3347 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 3348 ovs_refcount_init(&flow->ref_cnt);
0de8783a 3349 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 3350
241bad15 3351 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
0de8783a 3352 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3453b4d6 3353
f4b835bb 3354 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3453b4d6
JS
3355 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3356 dpcls_insert(cls, &flow->cr, &mask);
72865317 3357
4c75aaab
EJ
3358 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3359 dp_netdev_flow_hash(&flow->ufid));
3360
02bb2824 3361 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
241bad15 3362
beb75a40 3363 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
623540e4 3364 struct ds ds = DS_EMPTY_INITIALIZER;
9044f2c1
JG
3365 struct ofpbuf key_buf, mask_buf;
3366 struct odp_flow_key_parms odp_parms = {
3367 .flow = &match->flow,
3368 .mask = &match->wc.masks,
3369 .support = dp_netdev_support,
3370 };
3371
3372 ofpbuf_init(&key_buf, 0);
3373 ofpbuf_init(&mask_buf, 0);
623540e4 3374
9044f2c1
JG
3375 odp_flow_key_from_flow(&odp_parms, &key_buf);
3376 odp_parms.key_buf = &key_buf;
3377 odp_flow_key_from_mask(&odp_parms, &mask_buf);
0de8783a 3378
623540e4 3379 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
3380 odp_format_ufid(ufid, &ds);
3381 ds_put_cstr(&ds, " ");
9044f2c1
JG
3382 odp_flow_format(key_buf.data, key_buf.size,
3383 mask_buf.data, mask_buf.size,
3384 NULL, &ds, false);
623540e4 3385 ds_put_cstr(&ds, ", actions:");
0722f341 3386 format_odp_actions(&ds, actions, actions_len, NULL);
623540e4 3387
beb75a40 3388 VLOG_DBG("%s", ds_cstr(&ds));
623540e4 3389
9044f2c1
JG
3390 ofpbuf_uninit(&key_buf);
3391 ofpbuf_uninit(&mask_buf);
beb75a40
JS
3392
3393 /* Add a printout of the actual match installed. */
3394 struct match m;
3395 ds_clear(&ds);
3396 ds_put_cstr(&ds, "flow match: ");
3397 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3398 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
b2f4b622 3399 memset(&m.tun_md, 0, sizeof m.tun_md);
beb75a40
JS
3400 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3401
3402 VLOG_DBG("%s", ds_cstr(&ds));
3403
623540e4
EJ
3404 ds_destroy(&ds);
3405 }
3406
0de8783a 3407 return flow;
72865317
BP
3408}
3409
72865317 3410static int
f5d317a1
DDP
3411flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3412 struct netdev_flow_key *key,
3413 struct match *match,
3414 ovs_u128 *ufid,
3415 const struct dpif_flow_put *put,
3416 struct dpif_flow_stats *stats)
72865317 3417{
1763b4b8 3418 struct dp_netdev_flow *netdev_flow;
f5d317a1 3419 int error = 0;
72865317 3420
f5d317a1
DDP
3421 if (stats) {
3422 memset(stats, 0, sizeof *stats);
70e5ed6f
JS
3423 }
3424
1c1e46ed 3425 ovs_mutex_lock(&pmd->flow_mutex);
f5d317a1 3426 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
1763b4b8 3427 if (!netdev_flow) {
89625d1e 3428 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 3429 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
f5d317a1 3430 dp_netdev_flow_add(pmd, match, ufid, put->actions,
70e5ed6f 3431 put->actions_len);
0de8783a 3432 error = 0;
72865317 3433 } else {
5279f8fd 3434 error = EFBIG;
72865317
BP
3435 }
3436 } else {
5279f8fd 3437 error = ENOENT;
72865317
BP
3438 }
3439 } else {
beb75a40 3440 if (put->flags & DPIF_FP_MODIFY) {
8a4e3a85
BP
3441 struct dp_netdev_actions *new_actions;
3442 struct dp_netdev_actions *old_actions;
3443
3444 new_actions = dp_netdev_actions_create(put->actions,
3445 put->actions_len);
3446
61e7deb1
BP
3447 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3448 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 3449
02bb2824
YL
3450 queue_netdev_flow_put(pmd, netdev_flow, match,
3451 put->actions, put->actions_len);
241bad15 3452
f5d317a1 3453 if (stats) {
a309e4f5 3454 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
a84cb64a
BP
3455 }
3456 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
3457 /* XXX: The userspace datapath uses thread local statistics
3458 * (for flows), which should be updated only by the owning
3459 * thread. Since we cannot write on stats memory here,
3460 * we choose not to support this flag. Please note:
3461 * - This feature is currently used only by dpctl commands with
3462 * option --clear.
3463 * - Should the need arise, this operation can be implemented
3464 * by keeping a base value (to be updated here) for each
3465 * counter, and subtracting it before outputting the stats. */
3466 error = EOPNOTSUPP;
72865317 3467 }
8a4e3a85 3468
61e7deb1 3469 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 3470 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 3471 error = EEXIST;
2c0ea78f
GS
3472 } else {
3473 /* Overlapping flow. */
3474 error = EINVAL;
72865317
BP
3475 }
3476 }
1c1e46ed 3477 ovs_mutex_unlock(&pmd->flow_mutex);
5279f8fd 3478 return error;
72865317
BP
3479}
3480
72865317 3481static int
f5d317a1 3482dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
3483{
3484 struct dp_netdev *dp = get_dp_netdev(dpif);
beb75a40 3485 struct netdev_flow_key key, mask;
1c1e46ed 3486 struct dp_netdev_pmd_thread *pmd;
f5d317a1
DDP
3487 struct match match;
3488 ovs_u128 ufid;
3489 int error;
f0fb825a 3490 bool probe = put->flags & DPIF_FP_PROBE;
72865317 3491
f5d317a1
DDP
3492 if (put->stats) {
3493 memset(put->stats, 0, sizeof *put->stats);
3494 }
f0fb825a
EG
3495 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3496 probe);
f5d317a1
DDP
3497 if (error) {
3498 return error;
3499 }
3500 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3501 put->mask, put->mask_len,
f0fb825a 3502 &match.flow, &match.wc, probe);
f5d317a1
DDP
3503 if (error) {
3504 return error;
1c1e46ed
AW
3505 }
3506
f5d317a1
DDP
3507 if (put->ufid) {
3508 ufid = *put->ufid;
3509 } else {
7a5e0ee7 3510 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
f5d317a1
DDP
3511 }
3512
35fe9efb
IM
3513 /* The Netlink encoding of datapath flow keys cannot express
3514 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3515 * tag is interpreted as exact match on the fact that there is no
3516 * VLAN. Unless we refactor a lot of code that translates between
3517 * Netlink and struct flow representations, we have to do the same
3518 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3519 if (!match.wc.masks.vlans[0].tci) {
3520 match.wc.masks.vlans[0].tci = htons(0xffff);
3521 }
3522
f5d317a1 3523 /* Must produce a netdev_flow_key for lookup.
beb75a40
JS
3524 * Use the same method as employed to create the key when adding
3525 * the flow to the dpcls to make sure they match. */
3526 netdev_flow_mask_init(&mask, &match);
3527 netdev_flow_key_init_masked(&key, &match.flow, &mask);
f5d317a1
DDP
3528
3529 if (put->pmd_id == PMD_ID_NULL) {
3530 if (cmap_count(&dp->poll_threads) == 0) {
3531 return EINVAL;
3532 }
3533 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3534 struct dpif_flow_stats pmd_stats;
3535 int pmd_error;
3536
3537 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3538 &pmd_stats);
3539 if (pmd_error) {
3540 error = pmd_error;
3541 } else if (put->stats) {
3542 put->stats->n_packets += pmd_stats.n_packets;
3543 put->stats->n_bytes += pmd_stats.n_bytes;
3544 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3545 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3546 }
3547 }
3548 } else {
3549 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3550 if (!pmd) {
3551 return EINVAL;
3552 }
3553 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3554 dp_netdev_pmd_unref(pmd);
3555 }
3556
3557 return error;
3558}
3559
3560static int
3561flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3562 struct dpif_flow_stats *stats,
3563 const struct dpif_flow_del *del)
3564{
3565 struct dp_netdev_flow *netdev_flow;
3566 int error = 0;
3567
1c1e46ed
AW
3568 ovs_mutex_lock(&pmd->flow_mutex);
3569 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3570 del->key_len);
1763b4b8 3571 if (netdev_flow) {
f5d317a1 3572 if (stats) {
a309e4f5 3573 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
feebdea2 3574 }
1c1e46ed 3575 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 3576 } else {
5279f8fd 3577 error = ENOENT;
72865317 3578 }
1c1e46ed 3579 ovs_mutex_unlock(&pmd->flow_mutex);
f5d317a1
DDP
3580
3581 return error;
3582}
3583
3584static int
3585dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3586{
3587 struct dp_netdev *dp = get_dp_netdev(dpif);
3588 struct dp_netdev_pmd_thread *pmd;
3589 int error = 0;
3590
3591 if (del->stats) {
3592 memset(del->stats, 0, sizeof *del->stats);
3593 }
3594
3595 if (del->pmd_id == PMD_ID_NULL) {
3596 if (cmap_count(&dp->poll_threads) == 0) {
3597 return EINVAL;
3598 }
3599 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3600 struct dpif_flow_stats pmd_stats;
3601 int pmd_error;
3602
3603 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3604 if (pmd_error) {
3605 error = pmd_error;
3606 } else if (del->stats) {
3607 del->stats->n_packets += pmd_stats.n_packets;
3608 del->stats->n_bytes += pmd_stats.n_bytes;
3609 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3610 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3611 }
3612 }
3613 } else {
3614 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3615 if (!pmd) {
3616 return EINVAL;
3617 }
3618 error = flow_del_on_pmd(pmd, del->stats, del);
3619 dp_netdev_pmd_unref(pmd);
3620 }
3621
5279f8fd
BP
3622
3623 return error;
72865317
BP
3624}
3625
ac64794a
BP
3626struct dpif_netdev_flow_dump {
3627 struct dpif_flow_dump up;
1c1e46ed
AW
3628 struct cmap_position poll_thread_pos;
3629 struct cmap_position flow_pos;
3630 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
3631 int status;
3632 struct ovs_mutex mutex;
e723fd32
JS
3633};
3634
ac64794a
BP
3635static struct dpif_netdev_flow_dump *
3636dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 3637{
ac64794a 3638 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
3639}
3640
ac64794a 3641static struct dpif_flow_dump *
7e8b7199 3642dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
a692410a 3643 struct dpif_flow_dump_types *types OVS_UNUSED)
e723fd32 3644{
ac64794a 3645 struct dpif_netdev_flow_dump *dump;
e723fd32 3646
1c1e46ed 3647 dump = xzalloc(sizeof *dump);
ac64794a 3648 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 3649 dump->up.terse = terse;
ac64794a
BP
3650 ovs_mutex_init(&dump->mutex);
3651
3652 return &dump->up;
e723fd32
JS
3653}
3654
3655static int
ac64794a 3656dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 3657{
ac64794a 3658 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 3659
ac64794a
BP
3660 ovs_mutex_destroy(&dump->mutex);
3661 free(dump);
704a1e09
BP
3662 return 0;
3663}
3664
ac64794a
BP
3665struct dpif_netdev_flow_dump_thread {
3666 struct dpif_flow_dump_thread up;
3667 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
3668 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3669 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
3670};
3671
3672static struct dpif_netdev_flow_dump_thread *
3673dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3674{
3675 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3676}
3677
3678static struct dpif_flow_dump_thread *
3679dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3680{
3681 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3682 struct dpif_netdev_flow_dump_thread *thread;
3683
3684 thread = xmalloc(sizeof *thread);
3685 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3686 thread->dump = dump;
3687 return &thread->up;
3688}
3689
3690static void
3691dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3692{
3693 struct dpif_netdev_flow_dump_thread *thread
3694 = dpif_netdev_flow_dump_thread_cast(thread_);
3695
3696 free(thread);
3697}
3698
704a1e09 3699static int
ac64794a 3700dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 3701 struct dpif_flow *flows, int max_flows)
ac64794a
BP
3702{
3703 struct dpif_netdev_flow_dump_thread *thread
3704 = dpif_netdev_flow_dump_thread_cast(thread_);
3705 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 3706 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
a309e4f5
OM
3707 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3708 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
8bb113da
RW
3709 int n_flows = 0;
3710 int i;
14608a15 3711
ac64794a 3712 ovs_mutex_lock(&dump->mutex);
8bb113da 3713 if (!dump->status) {
1c1e46ed
AW
3714 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3715 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3716
3717 /* On the first call to dump_next(), extract the first pmd thread.
3718 * If there is no pmd thread, return immediately. */
3719 if (!pmd) {
3720 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3721 if (!pmd) {
3722 ovs_mutex_unlock(&dump->mutex);
3723 return n_flows;
8bb113da 3724
8bb113da 3725 }
d2ad7ef1 3726 }
1c1e46ed
AW
3727
3728 do {
3729 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3730 struct cmap_node *node;
3731
3732 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3733 if (!node) {
3734 break;
3735 }
3736 netdev_flows[n_flows] = CONTAINER_OF(node,
3737 struct dp_netdev_flow,
3738 node);
3739 }
3740 /* When the current pmd thread is finished dumping, move on to
3741 * the next one. */
3742 if (n_flows < flow_limit) {
3743 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3744 dp_netdev_pmd_unref(pmd);
3745 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3746 if (!pmd) {
3747 dump->status = EOF;
3748 break;
3749 }
3750 }
3751 /* Keep the reference for the next caller. */
3752 dump->cur_pmd = pmd;
3753
3754 /* If the current dump is empty, do not exit the loop, since the
3755 * remaining pmds could have flows to be dumped. Just dump again
3756 * on the new 'pmd'. */
3757 } while (!n_flows);
8a4e3a85 3758 }
ac64794a 3759 ovs_mutex_unlock(&dump->mutex);
ac64794a 3760
8bb113da
RW
3761 for (i = 0; i < n_flows; i++) {
3762 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3763 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3764 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3765 struct dpif_flow *f = &flows[i];
7af12bd7 3766 struct ofpbuf key, mask;
8bb113da 3767
7af12bd7
JS
3768 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3769 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
a309e4f5 3770 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
64bb477f 3771 dump->up.terse);
8bb113da 3772 }
feebdea2 3773
8bb113da 3774 return n_flows;
72865317
BP
3775}
3776
3777static int
758c456d 3778dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 3779 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
3780{
3781 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 3782 struct dp_netdev_pmd_thread *pmd;
1895cc8d 3783 struct dp_packet_batch pp;
72865317 3784
cf62fa4c
PS
3785 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3786 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
3787 return EINVAL;
3788 }
3789
65f13b50
AW
3790 /* Tries finding the 'pmd'. If NULL is returned, that means
3791 * the current thread is a non-pmd thread and should use
b19befae 3792 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
3793 pmd = ovsthread_getspecific(dp->per_pmd_key);
3794 if (!pmd) {
b19befae 3795 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
546e57d4
DDP
3796 if (!pmd) {
3797 return EBUSY;
3798 }
65f13b50
AW
3799 }
3800
05267613
AZ
3801 if (execute->probe) {
3802 /* If this is part of a probe, drop the packet, since executing
3803 * the action may actually cause spurious packets be sent into
3804 * the network. */
d1ce9c20
YS
3805 if (pmd->core_id == NON_PMD_CORE_ID) {
3806 dp_netdev_pmd_unref(pmd);
3807 }
05267613
AZ
3808 return 0;
3809 }
3810
65f13b50
AW
3811 /* If the current thread is non-pmd thread, acquires
3812 * the 'non_pmd_mutex'. */
3813 if (pmd->core_id == NON_PMD_CORE_ID) {
3814 ovs_mutex_lock(&dp->non_pmd_mutex);
3815 }
1c1e46ed 3816
2fbadeb6
IM
3817 /* Update current time in PMD context. We don't care about EMC insertion
3818 * probability, because we are on a slow path. */
b010be17
IM
3819 pmd_thread_ctx_time_update(pmd);
3820
36d8de17
DDP
3821 /* The action processing expects the RSS hash to be valid, because
3822 * it's always initialized at the beginning of datapath processing.
3823 * In this case, though, 'execute->packet' may not have gone through
3824 * the datapath at all; it may have been generated by the upper layer
3825 * (OpenFlow packet-out, BFD frame, ...). */
3826 if (!dp_packet_rss_valid(execute->packet)) {
3827 dp_packet_set_rss_hash(execute->packet,
3828 flow_hash_5tuple(execute->flow, 0));
3829 }
3830
72c84bc2 3831 dp_packet_batch_init_packet(&pp, execute->packet);
9f17f104 3832 pp.do_not_steal = true;
66e4ad8a 3833 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
b010be17 3834 execute->actions, execute->actions_len);
c71ea3c4 3835 dp_netdev_pmd_flush_output_packets(pmd, true);
36d8de17 3836
65f13b50
AW
3837 if (pmd->core_id == NON_PMD_CORE_ID) {
3838 ovs_mutex_unlock(&dp->non_pmd_mutex);
e9985d6a 3839 dp_netdev_pmd_unref(pmd);
65f13b50 3840 }
8a4e3a85 3841
758c456d 3842 return 0;
72865317
BP
3843}
3844
1a0c894a 3845static void
57924fc9
SB
3846dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3847 enum dpif_offload_type offload_type OVS_UNUSED)
1a0c894a
BP
3848{
3849 size_t i;
3850
3851 for (i = 0; i < n_ops; i++) {
3852 struct dpif_op *op = ops[i];
3853
3854 switch (op->type) {
3855 case DPIF_OP_FLOW_PUT:
fa37affa 3856 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
1a0c894a
BP
3857 break;
3858
3859 case DPIF_OP_FLOW_DEL:
fa37affa 3860 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
1a0c894a
BP
3861 break;
3862
3863 case DPIF_OP_EXECUTE:
fa37affa 3864 op->error = dpif_netdev_execute(dpif, &op->execute);
1a0c894a 3865 break;
6fe09f8c
JS
3866
3867 case DPIF_OP_FLOW_GET:
fa37affa 3868 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
6fe09f8c 3869 break;
1a0c894a
BP
3870 }
3871 }
3872}
3873
5bf84282
NK
3874/* Enable or disable PMD auto load balancing. */
3875static void
3876set_pmd_auto_lb(struct dp_netdev *dp)
3877{
3878 unsigned int cnt = 0;
3879 struct dp_netdev_pmd_thread *pmd;
3880 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3881
3882 bool enable_alb = false;
3883 bool multi_rxq = false;
3884 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3885
3886 /* Ensure that there are at least two non-isolated PMDs and
3887 * one of them is polling more than one rxq. */
3888 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3889 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3890 continue;
3891 }
3892
3893 if (hmap_count(&pmd->poll_list) > 1) {
3894 multi_rxq = true;
3895 }
3896 if (cnt && multi_rxq) {
3897 enable_alb = true;
3898 break;
3899 }
3900 cnt++;
3901 }
3902
3903 /* Enable auto LB if it is requested and cycle-based assignment is enabled. */
3904 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
3905 pmd_alb->auto_lb_requested;
3906
3907 if (pmd_alb->is_enabled != enable_alb) {
3908 pmd_alb->is_enabled = enable_alb;
3909 if (pmd_alb->is_enabled) {
3910 VLOG_INFO("PMD auto load balance is enabled "
3911 "(with rebalance interval:%"PRIu64" msec)",
3912 pmd_alb->rebalance_intvl);
3913 } else {
3914 pmd_alb->rebalance_poll_timer = 0;
3915 VLOG_INFO("PMD auto load balance is disabled");
3916 }
3917 }
3918
3919}
3920
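/* Sketch (not part of the original file): the enabling condition above
 * reduces to the predicate below, where 'n_nonisolated' and 'any_multi_rxq'
 * summarize the poll-thread walk and the two remaining booleans correspond
 * to the "pmd-auto-lb" and "pmd-rxq-assign=cycles" settings read in
 * dpif_netdev_set_config().  All four parameter names are inventions of this
 * sketch.  E.g. two non-isolated pmds polling {2 rxqs, 1 rxq} enable the
 * balancer; {1 rxq, 1 rxq} or a single pmd keep it disabled. */
#include <stdbool.h>

static inline bool
toy_auto_lb_enabled(int n_nonisolated, bool any_multi_rxq,
                    bool assign_cycles, bool requested)
{
    /* At least two non-isolated pmds, one of them polling more than one
     * rxq, cycle-based assignment and an explicit request. */
    return n_nonisolated >= 2 && any_multi_rxq && assign_cycles && requested;
}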
d4f6865c
DDP
3921/* Applies datapath configuration from the database. Some of the changes are
3922 * actually applied in dpif_netdev_run(). */
f2eee189 3923static int
d4f6865c 3924dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
f2eee189
AW
3925{
3926 struct dp_netdev *dp = get_dp_netdev(dpif);
d4f6865c 3927 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
e77c97b9
KT
3928 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3929 "cycles");
4c30b246
CL
3930 unsigned long long insert_prob =
3931 smap_get_ullong(other_config, "emc-insert-inv-prob",
3932 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3933 uint32_t insert_min, cur_min;
c71ea3c4 3934 uint32_t tx_flush_interval, cur_tx_flush_interval;
5bf84282 3935 uint64_t rebalance_intvl;
c71ea3c4
IM
3936
3937 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3938 DEFAULT_TX_FLUSH_INTERVAL);
3939 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3940 if (tx_flush_interval != cur_tx_flush_interval) {
3941 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3942 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3943 tx_flush_interval);
3944 }
f2eee189 3945
a6a426d6
IM
3946 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3947 free(dp->pmd_cmask);
3948 dp->pmd_cmask = nullable_xstrdup(cmask);
3949 dp_netdev_request_reconfigure(dp);
f2eee189
AW
3950 }
3951
4c30b246
CL
3952 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3953 if (insert_prob <= UINT32_MAX) {
3954 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3955 } else {
3956 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3957 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3958 }
3959
3960 if (insert_min != cur_min) {
3961 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3962 if (insert_min == 0) {
2fbadeb6 3963 VLOG_INFO("EMC insertion probability changed to zero");
4c30b246
CL
3964 } else {
3965 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3966 insert_prob, (100 / (float)insert_prob));
3967 }
3968 }
3969
79f36875
JS
3970 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3971 bool cur_perf_enabled;
3972 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3973 if (perf_enabled != cur_perf_enabled) {
3974 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3975 if (perf_enabled) {
3976 VLOG_INFO("PMD performance metrics collection enabled");
3977 } else {
3978 VLOG_INFO("PMD performance metrics collection disabled");
3979 }
3980 }
3981
60d8ccae
YW
3982 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3983 bool cur_smc;
3984 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3985 if (smc_enable != cur_smc) {
3986 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3987 if (smc_enable) {
3988 VLOG_INFO("SMC cache is enabled");
3989 } else {
3990 VLOG_INFO("SMC cache is disabled");
3991 }
3992 }
e77c97b9
KT
3993
3994 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
3995 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
3996 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
3997 "Defaulting to 'cycles'.");
3998 pmd_rxq_assign_cyc = true;
3999 pmd_rxq_assign = "cycles";
4000 }
4001 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
4002 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
4003 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4004 pmd_rxq_assign);
4005 dp_netdev_request_reconfigure(dp);
4006 }
5bf84282
NK
4007
4008 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4009 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
4010 false);
4011
4012 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
4013 ALB_PMD_REBALANCE_POLL_INTERVAL);
4014
4015 /* Input is in minutes; convert it to msec. */
4016 rebalance_intvl =
4017 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4018
4019 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4020 pmd_alb->rebalance_intvl = rebalance_intvl;
4021 }
4022
4023 set_pmd_auto_lb(dp);
f2eee189
AW
4024 return 0;
4025}
4026
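/* Worked example (standalone sketch, not part of the original file): how the
 * "emc-insert-inv-prob" value handled above turns into the 'emc_insert_min'
 * threshold.  With an inverse probability P the threshold is UINT32_MAX / P,
 * so a uniformly distributed 32-bit value falls at or below it roughly once
 * every P packets, and P == 0 disables insertion entirely.  Only the
 * threshold computation mirrors the code above; how the datapath compares a
 * per-packet value against it is outside this sketch. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    unsigned long long insert_prob = 100;   /* e.g. an inverse probability of 100. */
    uint32_t insert_min = insert_prob ? UINT32_MAX / insert_prob : 0;

    printf("threshold: %"PRIu32"\n", insert_min);        /* 42949672 */
    printf("effective probability: ~%.2f%%\n",
           100.0 * insert_min / 4294967296.0);           /* ~1.00% */
    return 0;
}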
3eb67853
IM
4027/* Parses affinity list and returns result in 'core_ids'. */
4028static int
4029parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4030{
4031 unsigned i;
4032 char *list, *copy, *key, *value;
4033 int error = 0;
4034
4035 for (i = 0; i < n_rxq; i++) {
51c37a56 4036 core_ids[i] = OVS_CORE_UNSPEC;
3eb67853
IM
4037 }
4038
4039 if (!affinity_list) {
4040 return 0;
4041 }
4042
4043 list = copy = xstrdup(affinity_list);
4044
4045 while (ofputil_parse_key_value(&list, &key, &value)) {
4046 int rxq_id, core_id;
4047
4048 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4049 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4050 error = EINVAL;
4051 break;
4052 }
4053
4054 if (rxq_id < n_rxq) {
4055 core_ids[rxq_id] = core_id;
4056 }
4057 }
4058
4059 free(copy);
4060 return error;
4061}
4062
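/* Usage sketch (assumed example values, not part of the original file):
 * parse_affinity_list() consumes the same "rxq:core" list as the
 * "pmd-rxq-affinity" option handled further below, e.g. for a port with
 * three rx queues:
 *
 *     unsigned core_ids[3];
 *     int err = parse_affinity_list("0:3,1:7", core_ids, 3);
 *     // err == 0; core_ids[] == { 3, 7, OVS_CORE_UNSPEC }
 *
 * Entries whose rxq id is >= n_rxq are silently ignored; malformed or
 * negative ids/cores yield EINVAL.  A NULL affinity list leaves every entry
 * at OVS_CORE_UNSPEC. */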
4063/* Parses 'affinity_list' and applies configuration if it is valid. */
4064static int
4065dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4066 const char *affinity_list)
4067{
4068 unsigned *core_ids, i;
4069 int error = 0;
4070
4071 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4072 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4073 error = EINVAL;
4074 goto exit;
4075 }
4076
4077 for (i = 0; i < port->n_rxq; i++) {
4078 port->rxqs[i].core_id = core_ids[i];
4079 }
4080
4081exit:
4082 free(core_ids);
4083 return error;
4084}
4085
2fbadeb6
IM
4086/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4087 * of given PMD thread. */
4088static bool
4089dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4090 struct dp_netdev_port *port)
4091 OVS_EXCLUDED(pmd->port_mutex)
4092{
4093 struct rxq_poll *poll;
4094 bool found = false;
4095
4096 ovs_mutex_lock(&pmd->port_mutex);
4097 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4098 if (port == poll->rxq->port) {
4099 found = true;
4100 break;
4101 }
4102 }
4103 ovs_mutex_unlock(&pmd->port_mutex);
4104 return found;
4105}
4106
4107/* Updates port configuration from the database. The changes are actually
4108 * applied in dpif_netdev_run(). */
3eb67853
IM
4109static int
4110dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4111 const struct smap *cfg)
4112{
4113 struct dp_netdev *dp = get_dp_netdev(dpif);
4114 struct dp_netdev_port *port;
4115 int error = 0;
4116 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
2fbadeb6 4117 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
3eb67853
IM
4118
4119 ovs_mutex_lock(&dp->port_mutex);
4120 error = get_port_by_number(dp, port_no, &port);
2fbadeb6
IM
4121 if (error) {
4122 goto unlock;
4123 }
4124
4125 if (emc_enabled != port->emc_enabled) {
4126 struct dp_netdev_pmd_thread *pmd;
4127 struct ds ds = DS_EMPTY_INITIALIZER;
4128 uint32_t cur_min, insert_prob;
4129
4130 port->emc_enabled = emc_enabled;
4131 /* Mark for reload all the threads that poll this port and request
4132 * reconfiguration for the actual reloading of threads. */
4133 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4134 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4135 pmd->need_reload = true;
4136 }
4137 }
4138 dp_netdev_request_reconfigure(dp);
4139
4140 ds_put_format(&ds, "%s: EMC has been %s.",
4141 netdev_get_name(port->netdev),
4142 (emc_enabled) ? "enabled" : "disabled");
4143 if (emc_enabled) {
4144 ds_put_cstr(&ds, " Current insertion probability is ");
4145 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4146 if (!cur_min) {
4147 ds_put_cstr(&ds, "zero.");
4148 } else {
4149 insert_prob = UINT32_MAX / cur_min;
4150 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4151 insert_prob, 100 / (float) insert_prob);
4152 }
4153 }
4154 VLOG_INFO("%s", ds_cstr(&ds));
4155 ds_destroy(&ds);
4156 }
4157
4158 /* Checking for RXq affinity changes. */
4159 if (!netdev_is_pmd(port->netdev)
3eb67853
IM
4160 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4161 goto unlock;
4162 }
4163
4164 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4165 if (error) {
4166 goto unlock;
4167 }
4168 free(port->rxq_affinity_list);
4169 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4170
4171 dp_netdev_request_reconfigure(dp);
4172unlock:
4173 ovs_mutex_unlock(&dp->port_mutex);
4174 return error;
4175}
4176
5bf93d67
EJ
4177static int
4178dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4179 uint32_t queue_id, uint32_t *priority)
4180{
4181 *priority = queue_id;
4182 return 0;
4183}
4184
72865317 4185\f
9ff55ae2 4186/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
1401f6de 4187 * a copy of the 'size' bytes of the 'actions' input parameter. */
a84cb64a
BP
4188struct dp_netdev_actions *
4189dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4190{
4191 struct dp_netdev_actions *netdev_actions;
4192
9ff55ae2
DDP
4193 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4194 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
4195 netdev_actions->size = size;
4196
4197 return netdev_actions;
4198}
4199
a84cb64a 4200struct dp_netdev_actions *
61e7deb1 4201dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 4202{
61e7deb1 4203 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
4204}
4205
61e7deb1
BP
4206static void
4207dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 4208{
61e7deb1 4209 free(actions);
a84cb64a
BP
4210}
4211\f
a19896ab
JS
4212static void
4213dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4214 enum rxq_cycles_counter_type type,
4215 unsigned long long cycles)
a2ac666d 4216{
a19896ab 4217 atomic_store_relaxed(&rx->cycles[type], cycles);
a2ac666d
CL
4218}
4219
4809891b 4220static void
a19896ab 4221dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4809891b
KT
4222 enum rxq_cycles_counter_type type,
4223 unsigned long long cycles)
4224{
a19896ab 4225 non_atomic_ullong_add(&rx->cycles[type], cycles);
4809891b
KT
4226}
4227
4228static uint64_t
4229dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4230 enum rxq_cycles_counter_type type)
4231{
4232 unsigned long long processing_cycles;
4233 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4234 return processing_cycles;
4235}
4236
4237static void
4238dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4239 unsigned long long cycles)
4240{
4ee87ad3
BP
4241 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4242 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4809891b
KT
4243}
4244
655856ef
KT
4245static uint64_t
4246dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4247{
4248 unsigned long long processing_cycles;
4249 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4250 return processing_cycles;
4251}
4252
79f36875
JS
4253#if ATOMIC_ALWAYS_LOCK_FREE_8B
4254static inline bool
4255pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4256{
4257 bool pmd_perf_enabled;
4258 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4259 return pmd_perf_enabled;
4260}
4261#else
4262/* If stores and reads of 64-bit integers are not atomic, the full PMD
4263 * performance metrics are not available as locked access to 64-bit
4264 * integers would be prohibitively expensive. */
4265static inline bool
4266pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4267{
4268 return false;
4269}
4270#endif
4271
c71ea3c4 4272static int
009e0033
IM
4273dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4274 struct tx_port *p)
4275{
58ed6df0 4276 int i;
009e0033 4277 int tx_qid;
cc4891f3 4278 int output_cnt;
009e0033 4279 bool dynamic_txqs;
58ed6df0
IM
4280 struct cycle_timer timer;
4281 uint64_t cycles;
c71ea3c4 4282 uint32_t tx_flush_interval;
58ed6df0
IM
4283
4284 cycle_timer_start(&pmd->perf_stats, &timer);
009e0033
IM
4285
4286 dynamic_txqs = p->port->dynamic_txqs;
4287 if (dynamic_txqs) {
4288 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4289 } else {
4290 tx_qid = pmd->static_tx_qid;
4291 }
4292
cc4891f3 4293 output_cnt = dp_packet_batch_size(&p->output_pkts);
58ed6df0 4294 ovs_assert(output_cnt > 0);
cc4891f3 4295
b30896c9 4296 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
009e0033 4297 dp_packet_batch_init(&p->output_pkts);
cc4891f3 4298
c71ea3c4
IM
4299 /* Update time of the next flush. */
4300 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4301 p->flush_time = pmd->ctx.now + tx_flush_interval;
4302
4303 ovs_assert(pmd->n_output_batches > 0);
4304 pmd->n_output_batches--;
4305
82a48ead
JS
4306 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4307 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
58ed6df0
IM
4308
4309 /* Distribute send cycles evenly among transmitted packets and assign to
4310 * their respective rx queues. */
4311 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4312 for (i = 0; i < output_cnt; i++) {
4313 if (p->output_pkts_rxqs[i]) {
4314 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4315 RXQ_CYCLES_PROC_CURR, cycles);
4316 }
4317 }
c71ea3c4
IM
4318
4319 return output_cnt;
009e0033
IM
4320}
4321
c71ea3c4
IM
4322static int
4323dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4324 bool force)
009e0033
IM
4325{
4326 struct tx_port *p;
c71ea3c4
IM
4327 int output_cnt = 0;
4328
4329 if (!pmd->n_output_batches) {
4330 return 0;
4331 }
009e0033
IM
4332
4333 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
c71ea3c4
IM
4334 if (!dp_packet_batch_is_empty(&p->output_pkts)
4335 && (force || pmd->ctx.now >= p->flush_time)) {
4336 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
009e0033
IM
4337 }
4338 }
c71ea3c4 4339 return output_cnt;
009e0033
IM
4340}
4341
a2ac666d 4342static int
65f13b50 4343dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
a19896ab 4344 struct dp_netdev_rxq *rxq,
947dc567 4345 odp_port_t port_no)
e4cfed38 4346{
79f36875 4347 struct pmd_perf_stats *s = &pmd->perf_stats;
1895cc8d 4348 struct dp_packet_batch batch;
a19896ab 4349 struct cycle_timer timer;
1895cc8d 4350 int error;
79f36875
JS
4351 int batch_cnt = 0;
4352 int rem_qlen = 0, *qlen_p = NULL;
58ed6df0 4353 uint64_t cycles;
e4cfed38 4354
a19896ab
JS
4355 /* Measure duration for polling and processing rx burst. */
4356 cycle_timer_start(&pmd->perf_stats, &timer);
58ed6df0
IM
4357
4358 pmd->ctx.last_rxq = rxq;
1895cc8d 4359 dp_packet_batch_init(&batch);
58ed6df0 4360
79f36875
JS
4361 /* Fetch the rx queue length only for vhostuser ports. */
4362 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4363 qlen_p = &rem_qlen;
4364 }
4365
4366 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
e4cfed38 4367 if (!error) {
a19896ab 4368 /* At least one packet received. */
3c33f0ff 4369 *recirc_depth_get() = 0;
009e0033 4370 pmd_thread_ctx_time_update(pmd);
940ac2ce 4371 batch_cnt = dp_packet_batch_size(&batch);
79f36875
JS
4372 if (pmd_perf_metrics_enabled(pmd)) {
4373 /* Update batch histogram. */
4374 s->current.batches++;
4375 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4376 /* Update the maximum vhost rx queue fill level. */
4377 if (rxq->is_vhost && rem_qlen >= 0) {
4378 uint32_t qfill = batch_cnt + rem_qlen;
4379 if (qfill > s->current.max_vhost_qfill) {
4380 s->current.max_vhost_qfill = qfill;
4381 }
4382 }
4383 }
4384 /* Process packet batch. */
947dc567 4385 dp_netdev_input(pmd, &batch, port_no);
e4cfed38 4386
a19896ab 4387 /* Assign processing cycles to rx queue. */
58ed6df0 4388 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
a19896ab
JS
4389 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4390
79f36875 4391 dp_netdev_pmd_flush_output_packets(pmd, false);
a19896ab
JS
4392 } else {
4393 /* Discard cycles. */
4394 cycle_timer_stop(&pmd->perf_stats, &timer);
4395 if (error != EAGAIN && error != EOPNOTSUPP) {
4396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4397
4398 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4399 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4400 }
e4cfed38 4401 }
a2ac666d 4402
58ed6df0
IM
4403 pmd->ctx.last_rxq = NULL;
4404
79f36875 4405 return batch_cnt;
e4cfed38
PS
4406}
4407
e32971b8
DDP
4408static struct tx_port *
4409tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4410{
4411 struct tx_port *tx;
4412
4413 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4414 if (tx->port->port_no == port_no) {
4415 return tx;
4416 }
4417 }
4418
4419 return NULL;
4420}
4421
dc36593c
DDP
4422static int
4423port_reconfigure(struct dp_netdev_port *port)
4424{
4425 struct netdev *netdev = port->netdev;
dc36593c
DDP
4426 int i, err;
4427
dc36593c
DDP
4428 /* Closes the existing 'rxq's. */
4429 for (i = 0; i < port->n_rxq; i++) {
947dc567
DDP
4430 netdev_rxq_close(port->rxqs[i].rx);
4431 port->rxqs[i].rx = NULL;
dc36593c 4432 }
4809891b 4433 unsigned last_nrxq = port->n_rxq;
dc36593c
DDP
4434 port->n_rxq = 0;
4435
050c60bf 4436 /* Allows 'netdev' to apply the pending configuration changes. */
606f6650 4437 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
e32971b8
DDP
4438 err = netdev_reconfigure(netdev);
4439 if (err && (err != EOPNOTSUPP)) {
4440 VLOG_ERR("Failed to set interface %s new configuration",
4441 netdev_get_name(netdev));
4442 return err;
4443 }
dc36593c 4444 }
050c60bf 4445 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3eb67853
IM
4446 port->rxqs = xrealloc(port->rxqs,
4447 sizeof *port->rxqs * netdev_n_rxq(netdev));
324c8374
IM
4448 /* Realloc 'used' counters for tx queues. */
4449 free(port->txq_used);
4450 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4451
dc36593c 4452 for (i = 0; i < netdev_n_rxq(netdev); i++) {
38259bd7
BP
4453 bool new_queue = i >= last_nrxq;
4454 if (new_queue) {
4455 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4456 }
4457
947dc567 4458 port->rxqs[i].port = port;
79f36875 4459 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
38259bd7 4460
947dc567 4461 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
dc36593c
DDP
4462 if (err) {
4463 return err;
4464 }
4465 port->n_rxq++;
4466 }
4467
3eb67853
IM
4468 /* Parse affinity list to apply configuration for new queues. */
4469 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4470
606f6650
EC
4471 /* If reconfiguration was successful, mark it as such, so we can use it. */
4472 port->need_reconfigure = false;
4473
dc36593c
DDP
4474 return 0;
4475}
4476
e32971b8
DDP
4477struct rr_numa_list {
4478 struct hmap numas; /* Contains 'struct rr_numa' */
4479};
4480
4481struct rr_numa {
4482 struct hmap_node node;
4483
4484 int numa_id;
4485
4486 /* Non-isolated pmds on numa node 'numa_id'. */
4487 struct dp_netdev_pmd_thread **pmds;
4488 int n_pmds;
4489
4490 int cur_index;
79da1e41 4491 bool idx_inc;
e32971b8
DDP
4492};
4493
4494static struct rr_numa *
4495rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4496{
4497 struct rr_numa *numa;
4498
4499 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4500 if (numa->numa_id == numa_id) {
4501 return numa;
4502 }
4503 }
4504
4505 return NULL;
4506}
4507
c37813fd
BM
4508/* Returns the next node in numa list following 'numa' in round-robin fashion.
4509 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4510 * Returns NULL if 'rr' numa list is empty. */
4511static struct rr_numa *
4512rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4513{
4514 struct hmap_node *node = NULL;
4515
4516 if (numa) {
4517 node = hmap_next(&rr->numas, &numa->node);
4518 }
4519 if (!node) {
4520 node = hmap_first(&rr->numas);
4521 }
4522
4523 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4524}
4525
e32971b8
DDP
4526static void
4527rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4528{
4529 struct dp_netdev_pmd_thread *pmd;
4530 struct rr_numa *numa;
4531
4532 hmap_init(&rr->numas);
4533
4534 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4535 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4536 continue;
4537 }
4538
4539 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4540 if (!numa) {
4541 numa = xzalloc(sizeof *numa);
4542 numa->numa_id = pmd->numa_id;
4543 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4544 }
4545 numa->n_pmds++;
4546 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4547 numa->pmds[numa->n_pmds - 1] = pmd;
79da1e41
KT
4548 /* At least one pmd, so initialise cur_index and idx_inc. */
4549 numa->cur_index = 0;
4550 numa->idx_inc = true;
e32971b8
DDP
4551 }
4552}
4553
e77c97b9
KT
4554/*
4555 * Returns the next pmd from the numa node.
4556 *
4557 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4558 * either an up or down walk, switching between up/down when the first or last
4559 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4560 *
4561 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4562 * core reached. e.g. 1,2,3,1,2,3,1,2...
4563 */
e32971b8 4564static struct dp_netdev_pmd_thread *
e77c97b9 4565rr_numa_get_pmd(struct rr_numa *numa, bool updown)
e32971b8 4566{
79da1e41
KT
4567 int numa_idx = numa->cur_index;
4568
4569 if (numa->idx_inc == true) {
4570 /* Incrementing through list of pmds. */
4571 if (numa->cur_index == numa->n_pmds-1) {
4572 /* Reached the last pmd. */
e77c97b9
KT
4573 if (updown) {
4574 numa->idx_inc = false;
4575 } else {
4576 numa->cur_index = 0;
4577 }
79da1e41
KT
4578 } else {
4579 numa->cur_index++;
4580 }
4581 } else {
4582 /* Decrementing through list of pmds. */
4583 if (numa->cur_index == 0) {
4584 /* Reached the first pmd. */
4585 numa->idx_inc = true;
4586 } else {
4587 numa->cur_index--;
4588 }
4589 }
4590 return numa->pmds[numa_idx];
e32971b8
DDP
4591}
4592
4593static void
4594rr_numa_list_destroy(struct rr_numa_list *rr)
4595{
4596 struct rr_numa *numa;
4597
4598 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4599 free(numa->pmds);
4600 free(numa);
4601 }
4602 hmap_destroy(&rr->numas);
4603}
4604
655856ef
KT
4605/* Sort Rx Queues by the processing cycles they are consuming. */
4606static int
cc131ac1 4607compare_rxq_cycles(const void *a, const void *b)
655856ef 4608{
28080276
KT
4609 struct dp_netdev_rxq *qa;
4610 struct dp_netdev_rxq *qb;
8368866e 4611 uint64_t cycles_qa, cycles_qb;
655856ef
KT
4612
4613 qa = *(struct dp_netdev_rxq **) a;
4614 qb = *(struct dp_netdev_rxq **) b;
4615
8368866e
KT
4616 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4617 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
655856ef 4618
8368866e
KT
4619 if (cycles_qa != cycles_qb) {
4620 return (cycles_qa < cycles_qb) ? 1 : -1;
a130f1a8
KT
4621 } else {
4622 /* Cycles are the same so tiebreak on port/queue id.
4623 * Tiebreaking (as opposed to return 0) ensures consistent
4624 * sort results across multiple OS's. */
f0aa3801
BP
4625 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4626 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4627 if (port_qa != port_qb) {
4628 return port_qa > port_qb ? 1 : -1;
a130f1a8
KT
4629 } else {
4630 return netdev_rxq_get_queue_id(qa->rx)
4631 - netdev_rxq_get_queue_id(qb->rx);
4632 }
655856ef 4633 }
655856ef
KT
4634}
4635
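/* Illustrative note (not part of the original file): compare_rxq_cycles()
 * returns 1 when 'a' consumed fewer cycles than 'b', so the qsort() call in
 * rxq_scheduling() below produces a descending order -- the busiest rx queue
 * ends up first and is handed to a pmd before the lighter ones.  A minimal
 * standalone equivalent with simplified, invented types: */
#include <stdint.h>
#include <stdlib.h>

struct toy_rxq {
    uint64_t hist_cycles;
};

static int
toy_compare_rxq_cycles(const void *a, const void *b)
{
    const struct toy_rxq *qa = *(const struct toy_rxq *const *) a;
    const struct toy_rxq *qb = *(const struct toy_rxq *const *) b;

    if (qa->hist_cycles != qb->hist_cycles) {
        return qa->hist_cycles < qb->hist_cycles ? 1 : -1;
    }
    return 0;   /* The real comparator above tiebreaks on port/queue id. */
}
/* qsort(rxqs, n_rxqs, sizeof rxqs[0], toy_compare_rxq_cycles) would leave the
 * busiest queue in rxqs[0]. */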
e32971b8
DDP
4636/* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4637 * queues and mark the pmds as isolated. Otherwise, assign non-isolated
4638 * pmds to unpinned queues.
4639 *
4640 * The function doesn't touch the pmd threads, it just stores the assignment
4641 * in the 'pmd' member of each rxq. */
4642static void
4643rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4644{
4645 struct dp_netdev_port *port;
4646 struct rr_numa_list rr;
c37813fd 4647 struct rr_numa *non_local_numa = NULL;
655856ef 4648 struct dp_netdev_rxq ** rxqs = NULL;
97bf8f47 4649 int n_rxqs = 0;
655856ef
KT
4650 struct rr_numa *numa = NULL;
4651 int numa_id;
e77c97b9 4652 bool assign_cyc = dp->pmd_rxq_assign_cyc;
e32971b8
DDP
4653
4654 HMAP_FOR_EACH (port, node, &dp->ports) {
e32971b8
DDP
4655 if (!netdev_is_pmd(port->netdev)) {
4656 continue;
4657 }
4658
e32971b8
DDP
4659 for (int qid = 0; qid < port->n_rxq; qid++) {
4660 struct dp_netdev_rxq *q = &port->rxqs[qid];
4661
4662 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4663 struct dp_netdev_pmd_thread *pmd;
4664
4665 pmd = dp_netdev_get_pmd(dp, q->core_id);
4666 if (!pmd) {
4667 VLOG_WARN("There is no PMD thread on core %d. Queue "
4668 "%d on port \'%s\' will not be polled.",
4669 q->core_id, qid, netdev_get_name(port->netdev));
4670 } else {
4671 q->pmd = pmd;
4672 pmd->isolated = true;
433a3fa5
GM
4673 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4674 "rx queue %d.", pmd->core_id, pmd->numa_id,
4675 netdev_rxq_get_name(q->rx),
4676 netdev_rxq_get_queue_id(q->rx));
e32971b8
DDP
4677 dp_netdev_pmd_unref(pmd);
4678 }
4679 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
8368866e
KT
4680 uint64_t cycle_hist = 0;
4681
655856ef
KT
4682 if (n_rxqs == 0) {
4683 rxqs = xmalloc(sizeof *rxqs);
e32971b8 4684 } else {
655856ef 4685 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
e32971b8 4686 }
8368866e 4687
e77c97b9
KT
4688 if (assign_cyc) {
4689 /* Sum the queue intervals and store the cycle history. */
4690 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4691 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4692 }
4693 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4694 cycle_hist);
4695 }
655856ef
KT
4696 /* Store the queue. */
4697 rxqs[n_rxqs++] = q;
e32971b8
DDP
4698 }
4699 }
4700 }
4701
e77c97b9 4702 if (n_rxqs > 1 && assign_cyc) {
655856ef
KT
4703 /* Sort the queues in order of the processing cycles
4704 * they consumed during their last pmd interval. */
cc131ac1 4705 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
655856ef
KT
4706 }
4707
4708 rr_numa_list_populate(dp, &rr);
4709 /* Assign the sorted queues to pmds in round robin. */
97bf8f47 4710 for (int i = 0; i < n_rxqs; i++) {
655856ef
KT
4711 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4712 numa = rr_numa_list_lookup(&rr, numa_id);
4713 if (!numa) {
4714 /* There are no pmds on the queue's local NUMA node.
4715 Round robin on the NUMA nodes that do have pmds. */
4716 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4717 if (!non_local_numa) {
4718 VLOG_ERR("There is no available (non-isolated) pmd "
4719 "thread for port \'%s\' queue %d. This queue "
4720 "will not be polled. Is pmd-cpu-mask set to "
4721 "zero? Or are all PMDs isolated to other "
4722 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4723 netdev_rxq_get_queue_id(rxqs[i]->rx));
4724 continue;
4725 }
e77c97b9 4726 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
655856ef
KT
4727 VLOG_WARN("There's no available (non-isolated) pmd thread "
4728 "on numa node %d. Queue %d on port \'%s\' will "
4729 "be assigned to the pmd on core %d "
4730 "(numa node %d). Expect reduced performance.",
4731 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4732 netdev_rxq_get_name(rxqs[i]->rx),
4733 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4734 } else {
e77c97b9
KT
4735 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4736 if (assign_cyc) {
4737 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4738 "rx queue %d "
4739 "(measured processing cycles %"PRIu64").",
4740 rxqs[i]->pmd->core_id, numa_id,
4741 netdev_rxq_get_name(rxqs[i]->rx),
4742 netdev_rxq_get_queue_id(rxqs[i]->rx),
4743 dp_netdev_rxq_get_cycles(rxqs[i],
4744 RXQ_CYCLES_PROC_HIST));
4745 } else {
4746 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4747 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4748 netdev_rxq_get_name(rxqs[i]->rx),
4749 netdev_rxq_get_queue_id(rxqs[i]->rx));
4750 }
655856ef
KT
4751 }
4752 }
4753
e32971b8 4754 rr_numa_list_destroy(&rr);
655856ef 4755 free(rxqs);
e32971b8
DDP
4756}
4757
140dd699
IM
4758static void
4759reload_affected_pmds(struct dp_netdev *dp)
4760{
4761 struct dp_netdev_pmd_thread *pmd;
4762
4763 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4764 if (pmd->need_reload) {
241bad15 4765 flow_mark_flush(pmd);
140dd699 4766 dp_netdev_reload_pmd__(pmd);
8f077b31
DM
4767 }
4768 }
4769
4770 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4771 if (pmd->need_reload) {
4772 if (pmd->core_id != NON_PMD_CORE_ID) {
4773 bool reload;
4774
4775 do {
4776 atomic_read_explicit(&pmd->reload, &reload,
4777 memory_order_acquire);
4778 } while (reload);
4779 }
140dd699
IM
4780 pmd->need_reload = false;
4781 }
4782 }
4783}
4784
6e3c6fa4
DDP
4785static void
4786reconfigure_pmd_threads(struct dp_netdev *dp)
4787 OVS_REQUIRES(dp->port_mutex)
4788{
e32971b8
DDP
4789 struct dp_netdev_pmd_thread *pmd;
4790 struct ovs_numa_dump *pmd_cores;
140dd699
IM
4791 struct ovs_numa_info_core *core;
4792 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4793 struct hmapx_node *node;
e32971b8 4794 bool changed = false;
140dd699 4795 bool need_to_adjust_static_tx_qids = false;
e32971b8
DDP
4796
4797 /* The pmd threads should be started only if there's a pmd port in the
4798 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4799 * NR_PMD_THREADS per numa node. */
4800 if (!has_pmd_port(dp)) {
4801 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4802 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4803 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4804 } else {
4805 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4806 }
4807
140dd699
IM
4808 /* We need to adjust 'static_tx_qid's only if we're reducing the number of
4809 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4810 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4811 /* Adjustment is required to keep 'static_tx_qid's sequential and
4812 * avoid possible issues, for example, imbalanced tx queue usage
4813 * and unnecessary locking caused by remapping on netdev level. */
4814 need_to_adjust_static_tx_qids = true;
4815 }
4816
4817 /* Check for unwanted pmd threads */
4818 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4819 if (pmd->core_id == NON_PMD_CORE_ID) {
4820 continue;
4821 }
4822 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4823 pmd->core_id)) {
4824 hmapx_add(&to_delete, pmd);
4825 } else if (need_to_adjust_static_tx_qids) {
e2cafa86 4826 atomic_store_relaxed(&pmd->reload_tx_qid, true);
140dd699 4827 pmd->need_reload = true;
e32971b8
DDP
4828 }
4829 }
4830
140dd699
IM
4831 HMAPX_FOR_EACH (node, &to_delete) {
4832 pmd = (struct dp_netdev_pmd_thread *) node->data;
4833 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4834 pmd->numa_id, pmd->core_id);
4835 dp_netdev_del_pmd(dp, pmd);
4836 }
4837 changed = !hmapx_is_empty(&to_delete);
4838 hmapx_destroy(&to_delete);
e32971b8 4839
140dd699
IM
4840 if (need_to_adjust_static_tx_qids) {
4841 /* 'static_tx_qid's are not sequential now.
4842 * Reload remaining threads to fix this. */
4843 reload_affected_pmds(dp);
4844 }
e32971b8 4845
140dd699
IM
4846 /* Check for required new pmd threads */
4847 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4848 pmd = dp_netdev_get_pmd(dp, core->core_id);
4849 if (!pmd) {
8afbf2fa
IM
4850 struct ds name = DS_EMPTY_INITIALIZER;
4851
140dd699 4852 pmd = xzalloc(sizeof *pmd);
e32971b8 4853 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
8afbf2fa
IM
4854
4855 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
4856 pmd->thread = ovs_thread_create(ds_cstr(&name),
4857 pmd_thread_main, pmd);
4858 ds_destroy(&name);
4859
140dd699
IM
4860 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4861 pmd->numa_id, pmd->core_id);
4862 changed = true;
4863 } else {
4864 dp_netdev_pmd_unref(pmd);
e32971b8 4865 }
140dd699
IM
4866 }
4867
4868 if (changed) {
4869 struct ovs_numa_info_numa *numa;
e32971b8
DDP
4870
4871 /* Log the number of pmd threads per numa node. */
4872 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
140dd699 4873 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
e32971b8
DDP
4874 numa->n_cores, numa->numa_id);
4875 }
4876 }
4877
4878 ovs_numa_dump_destroy(pmd_cores);
4879}
4880
e32971b8
DDP
4881static void
4882pmd_remove_stale_ports(struct dp_netdev *dp,
4883 struct dp_netdev_pmd_thread *pmd)
4884 OVS_EXCLUDED(pmd->port_mutex)
4885 OVS_REQUIRES(dp->port_mutex)
4886{
4887 struct rxq_poll *poll, *poll_next;
4888 struct tx_port *tx, *tx_next;
4889
4890 ovs_mutex_lock(&pmd->port_mutex);
4891 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4892 struct dp_netdev_port *port = poll->rxq->port;
4893
4894 if (port->need_reconfigure
4895 || !hmap_contains(&dp->ports, &port->node)) {
4896 dp_netdev_del_rxq_from_pmd(pmd, poll);
4897 }
4898 }
4899 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4900 struct dp_netdev_port *port = tx->port;
4901
4902 if (port->need_reconfigure
4903 || !hmap_contains(&dp->ports, &port->node)) {
4904 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4905 }
4906 }
4907 ovs_mutex_unlock(&pmd->port_mutex);
4908}
4909
4910/* Must be called each time a port is added/removed or the cmask changes.
4911 * This creates and destroys pmd threads, reconfigures ports, opens their
4912 * rxqs and assigns all rxqs/txqs to pmd threads. */
4913static void
4914reconfigure_datapath(struct dp_netdev *dp)
4915 OVS_REQUIRES(dp->port_mutex)
4916{
6d9fead1 4917 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
e32971b8
DDP
4918 struct dp_netdev_pmd_thread *pmd;
4919 struct dp_netdev_port *port;
4920 int wanted_txqs;
6e3c6fa4 4921
a6a426d6
IM
4922 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4923
e32971b8
DDP
4924 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4925 * on the system and the user configuration. */
4926 reconfigure_pmd_threads(dp);
6e3c6fa4 4927
e32971b8 4928 wanted_txqs = cmap_count(&dp->poll_threads);
324c8374 4929
e32971b8
DDP
4930 /* The number of pmd threads might have changed, or a port can be new:
4931 * adjust the txqs. */
4932 HMAP_FOR_EACH (port, node, &dp->ports) {
4933 netdev_set_tx_multiq(port->netdev, wanted_txqs);
324c8374
IM
4934 }
4935
e32971b8
DDP
4936 /* Step 2: Remove from the pmd threads ports that have been removed or
4937 * need reconfiguration. */
4938
4939 /* Check for all the ports that need reconfiguration. We cache this in
85a4f238
IM
4940 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4941 * change at any time. */
e32971b8
DDP
4942 HMAP_FOR_EACH (port, node, &dp->ports) {
4943 if (netdev_is_reconf_required(port->netdev)) {
4944 port->need_reconfigure = true;
4945 }
4946 }
4947
4948 /* Remove from the pmd threads all the ports that have been deleted or
4949 * need reconfiguration. */
4950 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4951 pmd_remove_stale_ports(dp, pmd);
4952 }
4953
4954 /* Reload affected pmd threads. We must wait for the pmd threads before
4955 * reconfiguring the ports, because a port cannot be reconfigured while
4956 * it's being used. */
4957 reload_affected_pmds(dp);
4958
4959 /* Step 3: Reconfigure ports. */
4960
4961 /* We only reconfigure the ports that we determined above, because they're
4962 * not being used by any pmd thread at the moment. If a port fails to
4963 * reconfigure we remove it from the datapath. */
f582b6df
BP
4964 struct dp_netdev_port *next_port;
4965 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
dc36593c 4966 int err;
6e3c6fa4 4967
e32971b8
DDP
4968 if (!port->need_reconfigure) {
4969 continue;
4970 }
4971
dc36593c
DDP
4972 err = port_reconfigure(port);
4973 if (err) {
4974 hmap_remove(&dp->ports, &port->node);
4975 seq_change(dp->port_seq);
4976 port_destroy(port);
324c8374 4977 } else {
e32971b8 4978 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
6e3c6fa4
DDP
4979 }
4980 }
e32971b8
DDP
4981
4982 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4983 * for now, we just update the 'pmd' pointer in each rxq to point to the
4984 * wanted thread according to the scheduling policy. */
4985
4986 /* Reset all the pmd threads to non isolated. */
4987 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4988 pmd->isolated = false;
4989 }
4990
4991 /* Reset all the queues to unassigned */
4992 HMAP_FOR_EACH (port, node, &dp->ports) {
4993 for (int i = 0; i < port->n_rxq; i++) {
4994 port->rxqs[i].pmd = NULL;
4995 }
4996 }
4997
4998 /* Add pinned queues and mark pmd threads isolated. */
4999 rxq_scheduling(dp, true);
5000
5001 /* Add non-pinned queues. */
5002 rxq_scheduling(dp, false);
5003
5004 /* Step 5: Remove queues not compliant with new scheduling. */
6d9fead1
DM
5005
5006 /* Count all the threads that will have at least one queue to poll. */
5007 HMAP_FOR_EACH (port, node, &dp->ports) {
5008 for (int qid = 0; qid < port->n_rxq; qid++) {
5009 struct dp_netdev_rxq *q = &port->rxqs[qid];
5010
5011 if (q->pmd) {
5012 hmapx_add(&busy_threads, q->pmd);
5013 }
5014 }
5015 }
5016
e32971b8
DDP
5017 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5018 struct rxq_poll *poll, *poll_next;
5019
5020 ovs_mutex_lock(&pmd->port_mutex);
5021 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5022 if (poll->rxq->pmd != pmd) {
5023 dp_netdev_del_rxq_from_pmd(pmd, poll);
6d9fead1
DM
5024
5025 /* This pmd might sleep after this step if it has no rxq
5026 * remaining. Tell it to busy wait for new assignment if it
5027 * has at least one scheduled queue. */
5028 if (hmap_count(&pmd->poll_list) == 0 &&
5029 hmapx_contains(&busy_threads, pmd)) {
5030 atomic_store_relaxed(&pmd->wait_for_reload, true);
5031 }
e32971b8
DDP
5032 }
5033 }
5034 ovs_mutex_unlock(&pmd->port_mutex);
5035 }
5036
6d9fead1
DM
5037 hmapx_destroy(&busy_threads);
5038
e32971b8
DDP
5039 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5040 * the old queues before re-adding them, otherwise a queue can be polled by
5041 * two threads at the same time. */
5042 reload_affected_pmds(dp);
5043
5044 /* Step 6: Add queues from scheduling, if they're not there already. */
5045 HMAP_FOR_EACH (port, node, &dp->ports) {
5046 if (!netdev_is_pmd(port->netdev)) {
5047 continue;
5048 }
5049
5050 for (int qid = 0; qid < port->n_rxq; qid++) {
5051 struct dp_netdev_rxq *q = &port->rxqs[qid];
5052
5053 if (q->pmd) {
5054 ovs_mutex_lock(&q->pmd->port_mutex);
5055 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5056 ovs_mutex_unlock(&q->pmd->port_mutex);
5057 }
5058 }
5059 }
5060
5061 /* Add every port to the tx cache of every pmd thread, if it's not
5062 * there already and if this pmd has at least one rxq to poll. */
5063 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5064 ovs_mutex_lock(&pmd->port_mutex);
5065 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5066 HMAP_FOR_EACH (port, node, &dp->ports) {
5067 dp_netdev_add_port_tx_to_pmd(pmd, port);
5068 }
5069 }
5070 ovs_mutex_unlock(&pmd->port_mutex);
5071 }
5072
5073 /* Reload affected pmd threads. */
5074 reload_affected_pmds(dp);
5bf84282
NK
5075
5076 /* Check if PMD Auto LB is to be enabled */
5077 set_pmd_auto_lb(dp);
6e3c6fa4
DDP
5078}
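/* Illustrative note (addition, not part of the upstream source): a typical
 * trigger for the reconfiguration above is a change of the PMD core mask,
 * e.g. "ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x6", which
 * marks the datapath for reconfiguration; the next dpif_netdev_run() call
 * then executes the six steps above: adjust the pmd threads, detach stale
 * ports, reconfigure ports, recompute the rxq scheduling, reload the
 * affected pmds, and rebuild the per-pmd tx caches. */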
5079
050c60bf
DDP
5080/* Returns true if one of the netdevs in 'dp' requires a reconfiguration. */
5081static bool
5082ports_require_restart(const struct dp_netdev *dp)
5083 OVS_REQUIRES(dp->port_mutex)
5084{
5085 struct dp_netdev_port *port;
5086
5087 HMAP_FOR_EACH (port, node, &dp->ports) {
5088 if (netdev_is_reconf_required(port->netdev)) {
5089 return true;
5090 }
5091 }
5092
5093 return false;
5094}
5095
5bf84282
NK
5096/* Calculates variance in the values stored in array 'a'. 'n' is the number
5097 * of elements in the array to be considered for calculating variance.
5098 * Usage example: data array 'a' contains the processing load of each pmd and
5099 * 'n' is the number of PMDs. It returns the variance in processing load of
5100 * PMDs. */
5101static uint64_t
5102variance(uint64_t a[], int n)
5103{
5104 /* Compute mean (average of elements). */
5105 uint64_t sum = 0;
5106 uint64_t mean = 0;
5107 uint64_t sqDiff = 0;
5108
5109 if (!n) {
5110 return 0;
5111 }
5112
5113 for (int i = 0; i < n; i++) {
5114 sum += a[i];
5115 }
5116
5117 if (sum) {
5118 mean = sum / n;
5119
5120 /* Compute sum squared differences with mean. */
5121 for (int i = 0; i < n; i++) {
5122 sqDiff += (a[i] - mean)*(a[i] - mean);
5123 }
5124 }
5125 return (sqDiff ? (sqDiff / n) : 0);
5126}
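/* Illustrative example (addition, not part of the upstream source): for two
 * PMDs with loads a[] = {90, 10}, mean = 50 and
 * variance = ((90-50)^2 + (10-50)^2) / 2 = 1600, while a balanced
 * a[] = {50, 50} yields 0.  The unsigned subtraction may wrap when
 * a[i] < mean, but squaring modulo 2^64 still produces the correct squared
 * difference, and the integer divisions truncate. */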
5127
5128
5129/* Returns the variance in the PMDs usage as part of dry run of rxqs
5130 * assignment to PMDs. */
5131static bool
5132get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5133 uint32_t num_pmds, uint64_t *predicted_variance)
5134 OVS_REQUIRES(dp->port_mutex)
5135{
5136 struct dp_netdev_port *port;
5137 struct dp_netdev_pmd_thread *pmd;
5138 struct dp_netdev_rxq **rxqs = NULL;
5139 struct rr_numa *numa = NULL;
5140 struct rr_numa_list rr;
5141 int n_rxqs = 0;
5142 bool ret = false;
5143 uint64_t *pmd_usage;
5144
5145 if (!predicted_variance) {
5146 return ret;
5147 }
5148
5149 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5150
5151 HMAP_FOR_EACH (port, node, &dp->ports) {
5152 if (!netdev_is_pmd(port->netdev)) {
5153 continue;
5154 }
5155
5156 for (int qid = 0; qid < port->n_rxq; qid++) {
5157 struct dp_netdev_rxq *q = &port->rxqs[qid];
5158 uint64_t cycle_hist = 0;
5159
5160 if (q->pmd->isolated) {
5161 continue;
5162 }
5163
5164 if (n_rxqs == 0) {
5165 rxqs = xmalloc(sizeof *rxqs);
5166 } else {
5167 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5168 }
5169
5170 /* Sum the queue intervals and store the cycle history. */
5171 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5172 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5173 }
5174 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5175 cycle_hist);
5176 /* Store the queue. */
5177 rxqs[n_rxqs++] = q;
5178 }
5179 }
5180 if (n_rxqs > 1) {
5181 /* Sort the queues in order of the processing cycles
5182 * they consumed during their last pmd interval. */
5183 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5184 }
5185 rr_numa_list_populate(dp, &rr);
5186
5187 for (int i = 0; i < n_rxqs; i++) {
5188 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5189 numa = rr_numa_list_lookup(&rr, numa_id);
5190 if (!numa) {
5191 /* Abort if cross NUMA polling. */
5192 VLOG_DBG("PMD auto lb dry run."
5193 " Aborting due to cross-numa polling.");
5194 goto cleanup;
5195 }
5196
5197 pmd = rr_numa_get_pmd(numa, true);
5198 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5199 "to be assigned port \'%s\' rx queue %d "
5200 "(measured processing cycles %"PRIu64").",
5201 pmd->core_id, numa_id,
5202 netdev_rxq_get_name(rxqs[i]->rx),
5203 netdev_rxq_get_queue_id(rxqs[i]->rx),
5204 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5205
5206 for (int id = 0; id < num_pmds; id++) {
5207 if (pmd->core_id == core_list[id]) {
5208 /* Add the processing cycles of rxq to pmd polling it. */
5209 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5210 RXQ_CYCLES_PROC_HIST);
5211 }
5212 }
5213 }
5214
5215 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5216 uint64_t total_cycles = 0;
5217
5218 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5219 continue;
5220 }
5221
5222 /* Get the total pmd cycles for an interval. */
5223 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5224 /* Estimate the cycles to cover all intervals. */
5225 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5226 for (int id = 0; id < num_pmds; id++) {
5227 if (pmd->core_id == core_list[id]) {
5228 if (pmd_usage[id]) {
5229 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5230 }
5231 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5232 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5233 }
5234 }
5235 }
5236 *predicted_variance = variance(pmd_usage, num_pmds);
5237 ret = true;
5238
5239cleanup:
5240 rr_numa_list_destroy(&rr);
5241 free(rxqs);
5242 free(pmd_usage);
5243 return ret;
5244}
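/* Illustrative example (addition, not part of the upstream source): if a
 * candidate PMD ran for intrvl_cycles = 2e9 cycles per interval, the dry run
 * scales that to total_cycles = 2e9 * PMD_RXQ_INTERVAL_MAX.  If the rxqs
 * predicted to land on that core together consumed
 * 1.2e9 * PMD_RXQ_INTERVAL_MAX processing cycles over the same history, its
 * predicted usage becomes (1.2e9 * N * 100) / (2e9 * N) = 60 percent, and
 * these per-core percentages are what variance() above is applied to. */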
5245
5246/* Does the dry run of Rxq assignment to PMDs and returns true if it gives
5247 * better distribution of load on PMDs. */
5248static bool
5249pmd_rebalance_dry_run(struct dp_netdev *dp)
5250 OVS_REQUIRES(dp->port_mutex)
5251{
5252 struct dp_netdev_pmd_thread *pmd;
5253 uint64_t *curr_pmd_usage;
5254
5255 uint64_t curr_variance;
5256 uint64_t new_variance;
5257 uint64_t improvement = 0;
5258 uint32_t num_pmds;
5259 uint32_t *pmd_corelist;
eef85380 5260 struct rxq_poll *poll;
5bf84282
NK
5261 bool ret;
5262
5263 num_pmds = cmap_count(&dp->poll_threads);
5264
5265 if (num_pmds > 1) {
5266 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5267 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5268 } else {
5269 return false;
5270 }
5271
5272 num_pmds = 0;
5273 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5274 uint64_t total_cycles = 0;
5275 uint64_t total_proc = 0;
5276
5277 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5278 continue;
5279 }
5280
5281 /* Get the total pmd cycles for an interval. */
5282 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5283 /* Estimate the cycles to cover all intervals. */
5284 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5285
eef85380
IM
5286 ovs_mutex_lock(&pmd->port_mutex);
5287 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5bf84282 5288 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
eef85380 5289 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5bf84282 5290 }
5bf84282 5291 }
eef85380
IM
5292 ovs_mutex_unlock(&pmd->port_mutex);
5293
5bf84282
NK
5294 if (total_proc) {
5295 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5296 }
5297
5298 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5299 pmd->core_id, curr_pmd_usage[num_pmds]);
5300
5301 if (atomic_count_get(&pmd->pmd_overloaded)) {
5302 atomic_count_set(&pmd->pmd_overloaded, 0);
5303 }
5304
5305 pmd_corelist[num_pmds] = pmd->core_id;
5306 num_pmds++;
5307 }
5308
5309 curr_variance = variance(curr_pmd_usage, num_pmds);
5310 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5311
5312 if (ret) {
5313 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5314 " Predicted PMD variance: %"PRIu64"",
5315 curr_variance, new_variance);
5316
5317 if (new_variance < curr_variance) {
5318 improvement =
5319 ((curr_variance - new_variance) * 100) / curr_variance;
5320 }
5321 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5322 ret = false;
5323 }
5324 }
5325
5326 free(curr_pmd_usage);
5327 free(pmd_corelist);
5328 return ret;
5329}
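/* Illustrative example (addition, not part of the upstream source): if the
 * current per-PMD usage variance is 1600 and the dry run above predicts 400,
 * the improvement is ((1600 - 400) * 100) / 1600 = 75 %, which clears
 * ALB_ACCEPTABLE_IMPROVEMENT and lets dpif_netdev_run() request a datapath
 * reconfigure.  A predicted variance of 1300 would only be an 18 %
 * improvement, so the dry run would return false. */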
5330
5331
a36de779
PS
5332/* Return true if the datapath flows need to be revalidated. */
5333static bool
e4cfed38
PS
5334dpif_netdev_run(struct dpif *dpif)
5335{
5336 struct dp_netdev_port *port;
5337 struct dp_netdev *dp = get_dp_netdev(dpif);
546e57d4 5338 struct dp_netdev_pmd_thread *non_pmd;
a36de779 5339 uint64_t new_tnl_seq;
c71ea3c4 5340 bool need_to_flush = true;
5bf84282
NK
5341 bool pmd_rebalance = false;
5342 long long int now = time_msec();
5343 struct dp_netdev_pmd_thread *pmd;
e4cfed38 5344
e9985d6a 5345 ovs_mutex_lock(&dp->port_mutex);
546e57d4
DDP
5346 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5347 if (non_pmd) {
5348 ovs_mutex_lock(&dp->non_pmd_mutex);
5349 HMAP_FOR_EACH (port, node, &dp->ports) {
5350 if (!netdev_is_pmd(port->netdev)) {
5351 int i;
55c955bd 5352
2fbadeb6
IM
5353 if (port->emc_enabled) {
5354 atomic_read_relaxed(&dp->emc_insert_min,
5355 &non_pmd->ctx.emc_insert_min);
5356 } else {
5357 non_pmd->ctx.emc_insert_min = 0;
5358 }
5359
546e57d4 5360 for (i = 0; i < port->n_rxq; i++) {
35c91567
DM
5361
5362 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5363 continue;
5364 }
5365
c71ea3c4
IM
5366 if (dp_netdev_process_rxq_port(non_pmd,
5367 &port->rxqs[i],
5368 port->port_no)) {
5369 need_to_flush = false;
5370 }
546e57d4 5371 }
55c955bd 5372 }
e4cfed38 5373 }
c71ea3c4
IM
5374 if (need_to_flush) {
5375 /* We didn't receive anything in the process loop.
5376 * Check if we need to send something.
5377 * There was no time updates on current iteration. */
5378 pmd_thread_ctx_time_update(non_pmd);
5379 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5380 }
5381
b010be17 5382 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
546e57d4 5383 ovs_mutex_unlock(&dp->non_pmd_mutex);
6e3c6fa4 5384
546e57d4
DDP
5385 dp_netdev_pmd_unref(non_pmd);
5386 }
1c1e46ed 5387
5bf84282
NK
5388 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5389 if (pmd_alb->is_enabled) {
5390 if (!pmd_alb->rebalance_poll_timer) {
5391 pmd_alb->rebalance_poll_timer = now;
5392 } else if ((pmd_alb->rebalance_poll_timer +
5393 pmd_alb->rebalance_intvl) < now) {
5394 pmd_alb->rebalance_poll_timer = now;
5395 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5396 if (atomic_count_get(&pmd->pmd_overloaded) >=
5397 PMD_RXQ_INTERVAL_MAX) {
5398 pmd_rebalance = true;
5399 break;
5400 }
5401 }
5402
5403 if (pmd_rebalance &&
5404 !dp_netdev_is_reconf_required(dp) &&
5405 !ports_require_restart(dp) &&
5406 pmd_rebalance_dry_run(dp)) {
5407 VLOG_INFO("PMD auto lb dry run."
5408 " requesting datapath reconfigure.");
5409 dp_netdev_request_reconfigure(dp);
5410 }
5411 }
5412 }
5413
a6a426d6 5414 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
e32971b8 5415 reconfigure_datapath(dp);
6e3c6fa4
DDP
5416 }
5417 ovs_mutex_unlock(&dp->port_mutex);
5418
53902038 5419 tnl_neigh_cache_run();
7f9b8504 5420 tnl_port_map_run();
a36de779
PS
5421 new_tnl_seq = seq_read(tnl_conf_seq);
5422
5423 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5424 dp->last_tnl_conf_seq = new_tnl_seq;
5425 return true;
5426 }
5427 return false;
e4cfed38
PS
5428}
5429
5430static void
5431dpif_netdev_wait(struct dpif *dpif)
5432{
5433 struct dp_netdev_port *port;
5434 struct dp_netdev *dp = get_dp_netdev(dpif);
5435
59e6d833 5436 ovs_mutex_lock(&dp_netdev_mutex);
e9985d6a
DDP
5437 ovs_mutex_lock(&dp->port_mutex);
5438 HMAP_FOR_EACH (port, node, &dp->ports) {
050c60bf 5439 netdev_wait_reconf_required(port->netdev);
55c955bd
PS
5440 if (!netdev_is_pmd(port->netdev)) {
5441 int i;
5442
490e82af 5443 for (i = 0; i < port->n_rxq; i++) {
947dc567 5444 netdev_rxq_wait(port->rxqs[i].rx);
55c955bd 5445 }
e4cfed38
PS
5446 }
5447 }
e9985d6a 5448 ovs_mutex_unlock(&dp->port_mutex);
59e6d833 5449 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 5450 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
5451}
5452
d0cca6c3
DDP
5453static void
5454pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5455{
5456 struct tx_port *tx_port_cached;
5457
c71ea3c4
IM
5458 /* Flush all the queued packets. */
5459 dp_netdev_pmd_flush_output_packets(pmd, true);
324c8374 5460 /* Free all used tx queue ids. */
b010be17 5461 dpif_netdev_xps_revalidate_pmd(pmd, true);
324c8374 5462
57eebbb4
DDP
5463 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5464 free(tx_port_cached);
5465 }
5466 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
d0cca6c3
DDP
5467 free(tx_port_cached);
5468 }
5469}
5470
5471/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
899363ed
BB
5472 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5473 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5474 * one txq. */
d0cca6c3
DDP
5475static void
5476pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5477 OVS_REQUIRES(pmd->port_mutex)
5478{
5479 struct tx_port *tx_port, *tx_port_cached;
5480
5481 pmd_free_cached_ports(pmd);
57eebbb4
DDP
5482 hmap_shrink(&pmd->send_port_cache);
5483 hmap_shrink(&pmd->tnl_port_cache);
d0cca6c3
DDP
5484
5485 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
57eebbb4
DDP
5486 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5487 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5488 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5489 hash_port_no(tx_port_cached->port->port_no));
5490 }
5491
5492 if (netdev_n_txq(tx_port->port->netdev)) {
5493 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5494 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5495 hash_port_no(tx_port_cached->port->port_no));
5496 }
d0cca6c3
DDP
5497 }
5498}
5499
140dd699
IM
5500static void
5501pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5502{
5503 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5504 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5505 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5506 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5507 }
5508 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5509
5510 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5511 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5512}
5513
5514static void
5515pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5516{
5517 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5518 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5519 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5520}
5521
e4cfed38 5522static int
d0cca6c3 5523pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
947dc567 5524 struct polled_queue **ppoll_list)
e4cfed38 5525{
947dc567 5526 struct polled_queue *poll_list = *ppoll_list;
ae7ad0a1
IM
5527 struct rxq_poll *poll;
5528 int i;
e4cfed38 5529
d0cca6c3 5530 ovs_mutex_lock(&pmd->port_mutex);
947dc567
DDP
5531 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5532 * sizeof *poll_list);
a1fdee13 5533
ae7ad0a1 5534 i = 0;
947dc567 5535 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
922b28d4 5536 poll_list[i].rxq = poll->rxq;
947dc567 5537 poll_list[i].port_no = poll->rxq->port->port_no;
2fbadeb6 5538 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
35c91567
DM
5539 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5540 poll_list[i].change_seq =
5541 netdev_get_change_seq(poll->rxq->port->netdev);
947dc567 5542 i++;
e4cfed38 5543 }
d0cca6c3
DDP
5544
5545 pmd_load_cached_ports(pmd);
5546
5547 ovs_mutex_unlock(&pmd->port_mutex);
e4cfed38 5548
e4cfed38 5549 *ppoll_list = poll_list;
d42f9307 5550 return i;
e4cfed38
PS
5551}
5552
6c3eee82 5553static void *
e4cfed38 5554pmd_thread_main(void *f_)
6c3eee82 5555{
65f13b50 5556 struct dp_netdev_pmd_thread *pmd = f_;
82a48ead 5557 struct pmd_perf_stats *s = &pmd->perf_stats;
e4cfed38 5558 unsigned int lc = 0;
947dc567 5559 struct polled_queue *poll_list;
6d9fead1 5560 bool wait_for_reload = false;
e2cafa86 5561 bool reload_tx_qid;
d42f9307 5562 bool exiting;
6d9fead1 5563 bool reload;
e4cfed38
PS
5564 int poll_cnt;
5565 int i;
a2ac666d 5566 int process_packets = 0;
6c3eee82 5567
e4cfed38
PS
5568 poll_list = NULL;
5569
65f13b50
AW
5570 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5571 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6930c7e0
DDP
5572 ovs_numa_thread_setaffinity_core(pmd->core_id);
5573 dpdk_set_lcore_id(pmd->core_id);
d0cca6c3 5574 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
60d8ccae 5575 dfc_cache_init(&pmd->flow_cache);
140dd699 5576 pmd_alloc_static_tx_qid(pmd);
ae7ad0a1 5577
e2cafa86 5578reload:
5bf84282
NK
5579 atomic_count_init(&pmd->pmd_overloaded, 0);
5580
7dd671f0
MK
5581 /* List port/core affinity */
5582 for (i = 0; i < poll_cnt; i++) {
ce179f11 5583 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
922b28d4
KT
5584 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5585 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
4f5d13e2
KT
5586 /* Reset the rxq current cycles counter. */
5587 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
7dd671f0
MK
5588 }
5589
2788a1b1 5590 if (!poll_cnt) {
6d9fead1
DM
5591 if (wait_for_reload) {
5592 /* Don't sleep, control thread will ask for a reload shortly. */
5593 do {
5594 atomic_read_explicit(&pmd->reload, &reload,
5595 memory_order_acquire);
5596 } while (!reload);
5597 } else {
5598 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5599 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5600 poll_block();
5601 }
2788a1b1 5602 }
2788a1b1
DDP
5603 }
5604
2a2c67b4
KT
5605 pmd->intrvl_tsc_prev = 0;
5606 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
a19896ab 5607 cycles_counter_update(s);
79f36875
JS
5608 /* Protect pmd stats from external clearing while polling. */
5609 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
e4cfed38 5610 for (;;) {
79f36875 5611 uint64_t rx_packets = 0, tx_packets = 0;
c71ea3c4 5612
a19896ab 5613 pmd_perf_start_iteration(s);
79f36875 5614
e4cfed38 5615 for (i = 0; i < poll_cnt; i++) {
2fbadeb6 5616
35c91567
DM
5617 if (!poll_list[i].rxq_enabled) {
5618 continue;
5619 }
5620
2fbadeb6
IM
5621 if (poll_list[i].emc_enabled) {
5622 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5623 &pmd->ctx.emc_insert_min);
5624 } else {
5625 pmd->ctx.emc_insert_min = 0;
5626 }
5627
a2ac666d 5628 process_packets =
a19896ab 5629 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
a2ac666d 5630 poll_list[i].port_no);
79f36875 5631 rx_packets += process_packets;
e4cfed38
PS
5632 }
5633
79f36875 5634 if (!rx_packets) {
c71ea3c4
IM
5635 /* We didn't receive anything in the process loop.
5636 * Check if we need to send something.
5637 * There was no time updates on current iteration. */
5638 pmd_thread_ctx_time_update(pmd);
79f36875 5639 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
c71ea3c4
IM
5640 }
5641
e4cfed38 5642 if (lc++ > 1024) {
e4cfed38 5643 lc = 0;
84067a4c 5644
fbe0962b 5645 coverage_try_clear();
4809891b 5646 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
9dede5cf 5647 if (!ovsrcu_try_quiesce()) {
60d8ccae 5648 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
9dede5cf 5649 }
84067a4c 5650
35c91567
DM
5651 for (i = 0; i < poll_cnt; i++) {
5652 uint64_t current_seq =
5653 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5654 if (poll_list[i].change_seq != current_seq) {
5655 poll_list[i].change_seq = current_seq;
5656 poll_list[i].rxq_enabled =
5657 netdev_rxq_enabled(poll_list[i].rxq->rx);
5658 }
5659 }
6c3eee82 5660 }
68a0625b
DM
5661
5662 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5663 if (OVS_UNLIKELY(reload)) {
5664 break;
5665 }
5666
79f36875
JS
5667 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5668 pmd_perf_metrics_enabled(pmd));
e4cfed38 5669 }
79f36875 5670 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6c3eee82 5671
d0cca6c3 5672 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6d9fead1 5673 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
e2cafa86 5674 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
299c8d61 5675 atomic_read_relaxed(&pmd->exit, &exiting);
d42f9307
DDP
5676 /* Signal here to make sure the pmd finishes
5677 * reloading the updated configuration. */
5678 dp_netdev_pmd_reload_done(pmd);
5679
e2cafa86
DM
5680 if (reload_tx_qid) {
5681 pmd_free_static_tx_qid(pmd);
5682 pmd_alloc_static_tx_qid(pmd);
5683 }
9bbf1c3d 5684
d42f9307 5685 if (!exiting) {
e4cfed38
PS
5686 goto reload;
5687 }
6c3eee82 5688
e2cafa86 5689 pmd_free_static_tx_qid(pmd);
60d8ccae 5690 dfc_cache_uninit(&pmd->flow_cache);
e4cfed38 5691 free(poll_list);
d0cca6c3 5692 pmd_free_cached_ports(pmd);
6c3eee82
BP
5693 return NULL;
5694}
5695
6b31e073
RW
5696static void
5697dp_netdev_disable_upcall(struct dp_netdev *dp)
5698 OVS_ACQUIRES(dp->upcall_rwlock)
5699{
5700 fat_rwlock_wrlock(&dp->upcall_rwlock);
5701}
5702
5dddf960
JR
5703\f
5704/* Meters */
5705static void
5706dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5707 struct ofputil_meter_features *features)
5708{
4b27db64
JR
5709 features->max_meters = MAX_METERS;
5710 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5711 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5712 features->max_bands = MAX_BANDS;
5dddf960
JR
5713 features->max_color = 0;
5714}
5715
425a7b9e
JP
5716/* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5717 * that exceed a band are dropped in-place. */
4b27db64
JR
5718static void
5719dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5720 uint32_t meter_id, long long int now)
5721{
5722 struct dp_meter *meter;
5723 struct dp_meter_band *band;
79c81260 5724 struct dp_packet *packet;
4b27db64
JR
5725 long long int long_delta_t; /* msec */
5726 uint32_t delta_t; /* msec */
79c81260 5727 const size_t cnt = dp_packet_batch_size(packets_);
4b27db64
JR
5728 uint32_t bytes, volume;
5729 int exceeded_band[NETDEV_MAX_BURST];
5730 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5731 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5732
5733 if (meter_id >= MAX_METERS) {
5734 return;
5735 }
5736
5737 meter_lock(dp, meter_id);
5738 meter = dp->meters[meter_id];
5739 if (!meter) {
5740 goto out;
5741 }
5742
5743 /* Initialize as negative values. */
5744 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5745 /* Initialize as zeroes. */
5746 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5747
5748 /* All packets will hit the meter at the same time. */
42697ca7 5749 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
4b27db64 5750
acc5df0e
IM
5751 if (long_delta_t < 0) {
5752 /* This condition means that we have several threads fighting for a
5753 meter lock, and the one who received the packets a bit later wins.
5754 Assuming that all racing threads received packets at the same time
5755 to avoid overflow. */
5756 long_delta_t = 0;
5757 }
5758
4b27db64
JR
5759 /* Make sure delta_t will not be too large, so that bucket will not
5760 * wrap around below. */
5761 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5762 ? meter->max_delta_t : (uint32_t)long_delta_t;
5763
5764 /* Update meter stats. */
5765 meter->used = now;
5766 meter->packet_count += cnt;
5767 bytes = 0;
e883448e 5768 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
79c81260 5769 bytes += dp_packet_size(packet);
4b27db64
JR
5770 }
5771 meter->byte_count += bytes;
5772
5773 /* Meters can operate in terms of packets per second or kilobits per
5774 * second. */
5775 if (meter->flags & OFPMF13_PKTPS) {
5776 /* Rate in packets/second, bucket 1/1000 packets. */
5777 /* msec * packets/sec = 1/1000 packets. */
5778 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5779 } else {
5780 /* Rate in kbps, bucket in bits. */
5781 /* msec * kbps = bits */
5782 volume = bytes * 8;
5783 }
5784
5785 /* Update all bands and find the one hit with the highest rate for each
5786 * packet (if any). */
5787 for (int m = 0; m < meter->n_bands; ++m) {
5788 band = &meter->bands[m];
5789
5790 /* Update band's bucket. */
5791 band->bucket += delta_t * band->up.rate;
5792 if (band->bucket > band->up.burst_size) {
5793 band->bucket = band->up.burst_size;
5794 }
5795
5796 /* Drain the bucket for all the packets, if possible. */
5797 if (band->bucket >= volume) {
5798 band->bucket -= volume;
5799 } else {
5800 int band_exceeded_pkt;
5801
5802 /* Band limit hit, must process packet-by-packet. */
5803 if (meter->flags & OFPMF13_PKTPS) {
5804 band_exceeded_pkt = band->bucket / 1000;
5805 band->bucket %= 1000; /* Remainder stays in bucket. */
5806
5807 /* Update the exceeding band for each exceeding packet.
5808 * (Only one band will be fired by a packet, and that
5809 * can be different for each packet.) */
e883448e 5810 for (int i = band_exceeded_pkt; i < cnt; i++) {
4b27db64
JR
5811 if (band->up.rate > exceeded_rate[i]) {
5812 exceeded_rate[i] = band->up.rate;
5813 exceeded_band[i] = m;
5814 }
5815 }
5816 } else {
5817 /* Packet sizes differ, must process one-by-one. */
5818 band_exceeded_pkt = cnt;
e883448e 5819 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
79c81260 5820 uint32_t bits = dp_packet_size(packet) * 8;
4b27db64
JR
5821
5822 if (band->bucket >= bits) {
5823 band->bucket -= bits;
5824 } else {
5825 if (i < band_exceeded_pkt) {
5826 band_exceeded_pkt = i;
5827 }
5828 /* Update the exceeding band for the exceeding packet.
5829 * (Only one band will be fired by a packet, and that
5830 * can be different for each packet.) */
5831 if (band->up.rate > exceeded_rate[i]) {
5832 exceeded_rate[i] = band->up.rate;
5833 exceeded_band[i] = m;
5834 }
5835 }
5836 }
5837 }
5838 /* Remember the first exceeding packet. */
5839 if (exceeded_pkt > band_exceeded_pkt) {
5840 exceeded_pkt = band_exceeded_pkt;
5841 }
5842 }
5843 }
5844
425a7b9e
JP
5845 /* Fire the highest rate band exceeded by each packet, and drop
5846 * packets if needed. */
4b27db64 5847 size_t j;
79c81260 5848 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
4b27db64
JR
5849 if (exceeded_band[j] >= 0) {
5850 /* Meter drop packet. */
5851 band = &meter->bands[exceeded_band[j]];
5852 band->packet_count += 1;
5853 band->byte_count += dp_packet_size(packet);
a13a0209 5854 COVERAGE_INC(datapath_drop_meter);
4b27db64
JR
5855 dp_packet_delete(packet);
5856 } else {
5857 /* Meter accepts packet. */
5858 dp_packet_batch_refill(packets_, packet, j);
5859 }
5860 }
5861 out:
5862 meter_unlock(dp, meter_id);
5863}
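/* Illustrative example (addition, not part of the upstream source), for the
 * kbps case above: with band->up.rate = 1000 (kbps) and delta_t = 16 (msec)
 * the bucket gains 16 * 1000 = 16000 bits, capped at up.burst_size.  A batch
 * of two 1500-byte packets needs volume = 3000 * 8 = 24000 bits, so the band
 * is drained packet by packet: the first packet takes 12000 bits, the second
 * finds only 4000 bits left, exceeds the band and is dropped (assuming the
 * bucket started empty and no higher-rate band also fired). */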
5864
5865/* Meter set/get/del processing is still single-threaded. */
5dddf960 5866static int
8101f03f 5867dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
4b27db64 5868 struct ofputil_meter_config *config)
5dddf960 5869{
4b27db64 5870 struct dp_netdev *dp = get_dp_netdev(dpif);
8101f03f 5871 uint32_t mid = meter_id.uint32;
4b27db64
JR
5872 struct dp_meter *meter;
5873 int i;
5874
4b27db64
JR
5875 if (mid >= MAX_METERS) {
5876 return EFBIG; /* Meter_id out of range. */
5877 }
5878
6508c845 5879 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
4b27db64
JR
5880 return EBADF; /* Unsupported flags set */
5881 }
2029ce9a 5882
6508c845
JP
5883 if (config->n_bands > MAX_BANDS) {
5884 return EINVAL;
2029ce9a
AVA
5885 }
5886
4b27db64
JR
5887 for (i = 0; i < config->n_bands; ++i) {
5888 switch (config->bands[i].type) {
5889 case OFPMBT13_DROP:
5890 break;
5891 default:
5892 return ENODEV; /* Unsupported band type */
5893 }
5894 }
5895
5896 /* Allocate meter */
5897 meter = xzalloc(sizeof *meter
5898 + config->n_bands * sizeof(struct dp_meter_band));
4b27db64 5899
d0db81ea
JP
5900 meter->flags = config->flags;
5901 meter->n_bands = config->n_bands;
5902 meter->max_delta_t = 0;
5903 meter->used = time_usec();
4b27db64 5904
d0db81ea
JP
5905 /* set up bands */
5906 for (i = 0; i < config->n_bands; ++i) {
5907 uint32_t band_max_delta_t;
4b27db64 5908
d0db81ea
JP
5909 /* Set burst size to a workable value if none specified. */
5910 if (config->bands[i].burst_size == 0) {
5911 config->bands[i].burst_size = config->bands[i].rate;
5912 }
5913
5914 meter->bands[i].up = config->bands[i];
5915 /* Convert burst size to the bucket units: */
5916 /* pkts => 1/1000 packets, kilobits => bits. */
5917 meter->bands[i].up.burst_size *= 1000;
5918 /* Initialize bucket to empty. */
5919 meter->bands[i].bucket = 0;
5920
5921 /* Figure out max delta_t that is enough to fill any bucket. */
5922 band_max_delta_t
5923 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5924 if (band_max_delta_t > meter->max_delta_t) {
5925 meter->max_delta_t = band_max_delta_t;
5926 }
4b27db64 5927 }
d0db81ea
JP
5928
5929 meter_lock(dp, mid);
5930 dp_delete_meter(dp, mid); /* Free existing meter, if any */
5931 dp->meters[mid] = meter;
5932 meter_unlock(dp, mid);
5933
5934 return 0;
5dddf960
JR
5935}
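/* Illustrative example (addition, not part of the upstream source): for a
 * kbps band with rate = 2000 and no burst size given, burst_size defaults to
 * 2000 and is then scaled to 2000 * 1000 = 2000000 bits of bucket capacity.
 * band_max_delta_t = 2000000 / 2000 = 1000 msec, i.e. after one second of
 * idleness the bucket is already full, which is why dp_netdev_run_meter()
 * clamps delta_t to meter->max_delta_t. */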
5936
5937static int
4b27db64
JR
5938dpif_netdev_meter_get(const struct dpif *dpif,
5939 ofproto_meter_id meter_id_,
5940 struct ofputil_meter_stats *stats, uint16_t n_bands)
5dddf960 5941{
4b27db64 5942 const struct dp_netdev *dp = get_dp_netdev(dpif);
4b27db64 5943 uint32_t meter_id = meter_id_.uint32;
866bc756 5944 int retval = 0;
4b27db64
JR
5945
5946 if (meter_id >= MAX_METERS) {
5947 return EFBIG;
5948 }
866bc756
JP
5949
5950 meter_lock(dp, meter_id);
5951 const struct dp_meter *meter = dp->meters[meter_id];
4b27db64 5952 if (!meter) {
866bc756
JP
5953 retval = ENOENT;
5954 goto done;
4b27db64
JR
5955 }
5956 if (stats) {
5957 int i = 0;
5958
4b27db64
JR
5959 stats->packet_in_count = meter->packet_count;
5960 stats->byte_in_count = meter->byte_count;
5961
5962 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5963 stats->bands[i].packet_count = meter->bands[i].packet_count;
5964 stats->bands[i].byte_count = meter->bands[i].byte_count;
5965 }
4b27db64
JR
5966
5967 stats->n_bands = i;
5968 }
866bc756
JP
5969
5970done:
5971 meter_unlock(dp, meter_id);
5972 return retval;
5dddf960
JR
5973}
5974
5975static int
4b27db64
JR
5976dpif_netdev_meter_del(struct dpif *dpif,
5977 ofproto_meter_id meter_id_,
5978 struct ofputil_meter_stats *stats, uint16_t n_bands)
5dddf960 5979{
4b27db64
JR
5980 struct dp_netdev *dp = get_dp_netdev(dpif);
5981 int error;
5982
5983 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5984 if (!error) {
5985 uint32_t meter_id = meter_id_.uint32;
5986
5987 meter_lock(dp, meter_id);
5988 dp_delete_meter(dp, meter_id);
5989 meter_unlock(dp, meter_id);
4b27db64
JR
5990 }
5991 return error;
5dddf960
JR
5992}
5993
5994\f
6b31e073
RW
5995static void
5996dpif_netdev_disable_upcall(struct dpif *dpif)
5997 OVS_NO_THREAD_SAFETY_ANALYSIS
5998{
5999 struct dp_netdev *dp = get_dp_netdev(dpif);
6000 dp_netdev_disable_upcall(dp);
6001}
6002
6003static void
6004dp_netdev_enable_upcall(struct dp_netdev *dp)
6005 OVS_RELEASES(dp->upcall_rwlock)
6006{
6007 fat_rwlock_unlock(&dp->upcall_rwlock);
6008}
6009
6010static void
6011dpif_netdev_enable_upcall(struct dpif *dpif)
6012 OVS_NO_THREAD_SAFETY_ANALYSIS
6013{
6014 struct dp_netdev *dp = get_dp_netdev(dpif);
6015 dp_netdev_enable_upcall(dp);
6016}
6017
ae7ad0a1 6018static void
accf8626
AW
6019dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6020{
6d9fead1 6021 atomic_store_relaxed(&pmd->wait_for_reload, false);
e2cafa86 6022 atomic_store_relaxed(&pmd->reload_tx_qid, false);
2788a1b1 6023 pmd->last_reload_seq = seq_read(pmd->reload_seq);
8f077b31 6024 atomic_store_explicit(&pmd->reload, false, memory_order_release);
accf8626
AW
6025}
6026
1c1e46ed 6027/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
546e57d4
DDP
6028 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
6029 * 'core_id' is NON_PMD_CORE_ID).
1c1e46ed
AW
6030 *
6031 * Caller must unref the returned reference. */
65f13b50 6032static struct dp_netdev_pmd_thread *
bd5131ba 6033dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
6034{
6035 struct dp_netdev_pmd_thread *pmd;
55847abe 6036 const struct cmap_node *pnode;
65f13b50 6037
b19befae 6038 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
6039 if (!pnode) {
6040 return NULL;
6041 }
65f13b50
AW
6042 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6043
1c1e46ed 6044 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
6045}
6046
f2eee189
AW
6047/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6048static void
6049dp_netdev_set_nonpmd(struct dp_netdev *dp)
e9985d6a 6050 OVS_REQUIRES(dp->port_mutex)
f2eee189
AW
6051{
6052 struct dp_netdev_pmd_thread *non_pmd;
6053
6054 non_pmd = xzalloc(sizeof *non_pmd);
00873463 6055 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
f2eee189
AW
6056}
6057
1c1e46ed
AW
6058/* Caller must have valid pointer to 'pmd'. */
6059static bool
6060dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6061{
6062 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6063}
6064
6065static void
6066dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6067{
6068 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6069 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6070 }
6071}
6072
6073/* Given cmap position 'pos', tries to ref the next node. If try_ref()
6074 * fails, keeps checking for the next node until reaching the end of the cmap.
6075 *
6076 * Caller must unref the returned reference. */
6077static struct dp_netdev_pmd_thread *
6078dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6079{
6080 struct dp_netdev_pmd_thread *next;
6081
6082 do {
6083 struct cmap_node *node;
6084
6085 node = cmap_next_position(&dp->poll_threads, pos);
6086 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6087 : NULL;
6088 } while (next && !dp_netdev_pmd_try_ref(next));
6089
6090 return next;
6091}
6092
65f13b50 6093/* Configures the 'pmd' based on the input argument. */
6c3eee82 6094static void
65f13b50 6095dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
00873463 6096 unsigned core_id, int numa_id)
65f13b50
AW
6097{
6098 pmd->dp = dp;
65f13b50
AW
6099 pmd->core_id = core_id;
6100 pmd->numa_id = numa_id;
e32971b8 6101 pmd->need_reload = false;
c71ea3c4 6102 pmd->n_output_batches = 0;
1c1e46ed
AW
6103
6104 ovs_refcount_init(&pmd->ref_cnt);
299c8d61 6105 atomic_init(&pmd->exit, false);
2788a1b1
DDP
6106 pmd->reload_seq = seq_create();
6107 pmd->last_reload_seq = seq_read(pmd->reload_seq);
14e3e12a 6108 atomic_init(&pmd->reload, false);
1c1e46ed 6109 ovs_mutex_init(&pmd->flow_mutex);
d0cca6c3 6110 ovs_mutex_init(&pmd->port_mutex);
1c1e46ed 6111 cmap_init(&pmd->flow_table);
3453b4d6 6112 cmap_init(&pmd->classifiers);
58ed6df0 6113 pmd->ctx.last_rxq = NULL;
b010be17
IM
6114 pmd_thread_ctx_time_update(pmd);
6115 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6116 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
947dc567 6117 hmap_init(&pmd->poll_list);
d0cca6c3 6118 hmap_init(&pmd->tx_ports);
57eebbb4
DDP
6119 hmap_init(&pmd->tnl_port_cache);
6120 hmap_init(&pmd->send_port_cache);
65f13b50
AW
6121 /* init the 'flow_cache' since there is no
6122 * actual thread created for NON_PMD_CORE_ID. */
6123 if (core_id == NON_PMD_CORE_ID) {
60d8ccae 6124 dfc_cache_init(&pmd->flow_cache);
140dd699 6125 pmd_alloc_static_tx_qid(pmd);
65f13b50 6126 }
82a48ead 6127 pmd_perf_stats_init(&pmd->perf_stats);
65f13b50
AW
6128 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6129 hash_int(core_id, 0));
6130}
6131
1c1e46ed
AW
6132static void
6133dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6134{
3453b4d6
JS
6135 struct dpcls *cls;
6136
1c1e46ed 6137 dp_netdev_pmd_flow_flush(pmd);
57eebbb4
DDP
6138 hmap_destroy(&pmd->send_port_cache);
6139 hmap_destroy(&pmd->tnl_port_cache);
d0cca6c3 6140 hmap_destroy(&pmd->tx_ports);
947dc567 6141 hmap_destroy(&pmd->poll_list);
3453b4d6
JS
6142 /* All flows (including their dpcls_rules) have been deleted already */
6143 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6144 dpcls_destroy(cls);
7c269972 6145 ovsrcu_postpone(free, cls);
3453b4d6
JS
6146 }
6147 cmap_destroy(&pmd->classifiers);
1c1e46ed
AW
6148 cmap_destroy(&pmd->flow_table);
6149 ovs_mutex_destroy(&pmd->flow_mutex);
2788a1b1 6150 seq_destroy(pmd->reload_seq);
d0cca6c3 6151 ovs_mutex_destroy(&pmd->port_mutex);
1c1e46ed
AW
6152 free(pmd);
6153}
6154
6155/* Stops the pmd thread, removes it from the 'dp->poll_threads',
6156 * and unrefs the struct. */
65f13b50 6157static void
e4e74c3a 6158dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6c3eee82 6159{
d0cca6c3
DDP
6160 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6161 * but extra cleanup is necessary */
65f13b50 6162 if (pmd->core_id == NON_PMD_CORE_ID) {
febf4a7a 6163 ovs_mutex_lock(&dp->non_pmd_mutex);
60d8ccae 6164 dfc_cache_uninit(&pmd->flow_cache);
d0cca6c3 6165 pmd_free_cached_ports(pmd);
140dd699 6166 pmd_free_static_tx_qid(pmd);
febf4a7a 6167 ovs_mutex_unlock(&dp->non_pmd_mutex);
65f13b50 6168 } else {
299c8d61 6169 atomic_store_relaxed(&pmd->exit, true);
65f13b50 6170 dp_netdev_reload_pmd__(pmd);
65f13b50
AW
6171 xpthread_join(pmd->thread, NULL);
6172 }
ae7ad0a1 6173
d0cca6c3 6174 dp_netdev_pmd_clear_ports(pmd);
ae7ad0a1 6175
e4e74c3a
AW
6176 /* Purges the 'pmd''s flows after stopping the thread, but before
6177 * destroying the flows, so that the flow stats can be collected. */
6178 if (dp->dp_purge_cb) {
6179 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6180 }
65f13b50 6181 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 6182 dp_netdev_pmd_unref(pmd);
65f13b50 6183}
6c3eee82 6184
e32971b8
DDP
6185/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6186 * thread. */
65f13b50 6187static void
e32971b8 6188dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
65f13b50
AW
6189{
6190 struct dp_netdev_pmd_thread *pmd;
d916785c
DDP
6191 struct dp_netdev_pmd_thread **pmd_list;
6192 size_t k = 0, n_pmds;
6193
e32971b8 6194 n_pmds = cmap_count(&dp->poll_threads);
d916785c 6195 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
65f13b50
AW
6196
6197 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
e32971b8 6198 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
b9584f21
DDP
6199 continue;
6200 }
d916785c
DDP
6201 /* We cannot call dp_netdev_del_pmd(), since it alters
6202 * 'dp->poll_threads' (while we're iterating it) and it
6203 * might quiesce. */
6204 ovs_assert(k < n_pmds);
6205 pmd_list[k++] = pmd;
6c3eee82 6206 }
d916785c
DDP
6207
6208 for (size_t i = 0; i < k; i++) {
6209 dp_netdev_del_pmd(dp, pmd_list[i]);
6210 }
6211 free(pmd_list);
65f13b50 6212}
6c3eee82 6213
d0cca6c3
DDP
6214/* Deletes all rx queues from pmd->poll_list and all the ports from
6215 * pmd->tx_ports. */
cc245ce8 6216static void
d0cca6c3 6217dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
6218{
6219 struct rxq_poll *poll;
d0cca6c3 6220 struct tx_port *port;
cc245ce8 6221
d0cca6c3 6222 ovs_mutex_lock(&pmd->port_mutex);
947dc567 6223 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
cc245ce8
IM
6224 free(poll);
6225 }
d0cca6c3
DDP
6226 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6227 free(port);
6228 }
6229 ovs_mutex_unlock(&pmd->port_mutex);
cc245ce8
IM
6230}
6231
e32971b8 6232/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
b68872d8 6233static void
e32971b8
DDP
6234dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6235 struct dp_netdev_rxq *rxq)
6236 OVS_REQUIRES(pmd->port_mutex)
b68872d8 6237{
e32971b8
DDP
6238 int qid = netdev_rxq_get_queue_id(rxq->rx);
6239 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6240 struct rxq_poll *poll;
b68872d8 6241
e32971b8
DDP
6242 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6243 if (poll->rxq == rxq) {
6244 /* 'rxq' is already polled by this thread. Do nothing. */
6245 return;
d0cca6c3 6246 }
cc245ce8 6247 }
cc245ce8 6248
e32971b8
DDP
6249 poll = xmalloc(sizeof *poll);
6250 poll->rxq = rxq;
6251 hmap_insert(&pmd->poll_list, &poll->node, hash);
b68872d8 6252
e32971b8 6253 pmd->need_reload = true;
ae7ad0a1
IM
6254}
6255
e32971b8 6256/* Delete 'poll' from poll_list of PMD thread. */
ae7ad0a1 6257static void
e32971b8
DDP
6258dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6259 struct rxq_poll *poll)
d0cca6c3 6260 OVS_REQUIRES(pmd->port_mutex)
ae7ad0a1 6261{
e32971b8
DDP
6262 hmap_remove(&pmd->poll_list, &poll->node);
6263 free(poll);
ae7ad0a1 6264
e32971b8 6265 pmd->need_reload = true;
ae7ad0a1
IM
6266}
6267
d0cca6c3
DDP
6268/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6269 * changes to take effect. */
cc245ce8 6270static void
d0cca6c3
DDP
6271dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6272 struct dp_netdev_port *port)
e32971b8 6273 OVS_REQUIRES(pmd->port_mutex)
d0cca6c3 6274{
57eebbb4
DDP
6275 struct tx_port *tx;
6276
e32971b8
DDP
6277 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6278 if (tx) {
6279 /* 'port' is already on this thread tx cache. Do nothing. */
6280 return;
6281 }
6282
57eebbb4 6283 tx = xzalloc(sizeof *tx);
d0cca6c3 6284
324c8374
IM
6285 tx->port = port;
6286 tx->qid = -1;
c71ea3c4 6287 tx->flush_time = 0LL;
009e0033 6288 dp_packet_batch_init(&tx->output_pkts);
d0cca6c3 6289
324c8374 6290 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
e32971b8 6291 pmd->need_reload = true;
d0cca6c3
DDP
6292}
6293
e32971b8
DDP
6294/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6295 * changes to take effect. */
b9584f21 6296static void
e32971b8
DDP
6297dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6298 struct tx_port *tx)
6299 OVS_REQUIRES(pmd->port_mutex)
b9584f21 6300{
e32971b8
DDP
6301 hmap_remove(&pmd->tx_ports, &tx->node);
6302 free(tx);
6303 pmd->need_reload = true;
6c3eee82
BP
6304}
6305\f
b5cbbcf6
AZ
6306static char *
6307dpif_netdev_get_datapath_version(void)
6308{
6309 return xstrdup("<built-in>");
6310}
6311
72865317 6312static void
1c1e46ed 6313dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 6314 uint16_t tcp_flags, long long now)
72865317 6315{
eb94da30 6316 uint16_t flags;
72865317 6317
eb94da30
DDP
6318 atomic_store_relaxed(&netdev_flow->stats.used, now);
6319 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6320 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6321 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6322 flags |= tcp_flags;
6323 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
6324}
6325
623540e4 6326static int
e14deea0 6327dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 6328 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
6329 enum dpif_upcall_type type, const struct nlattr *userdata,
6330 struct ofpbuf *actions, struct ofpbuf *put_actions)
6331{
1c1e46ed 6332 struct dp_netdev *dp = pmd->dp;
623540e4 6333
623540e4
EJ
6334 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6335 return ENODEV;
6336 }
6337
6338 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6339 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 6340 char *packet_str;
cf62fa4c 6341 struct ofpbuf key;
5262eea1
JG
6342 struct odp_flow_key_parms odp_parms = {
6343 .flow = flow,
1dea1435 6344 .mask = wc ? &wc->masks : NULL,
2494ccd7 6345 .support = dp_netdev_support,
5262eea1 6346 };
623540e4
EJ
6347
6348 ofpbuf_init(&key, 0);
5262eea1 6349 odp_flow_key_from_flow(&odp_parms, &key);
2482b0b0 6350 packet_str = ofp_dp_packet_to_string(packet_);
623540e4 6351
6fd6ed71 6352 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
6353
6354 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6355 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6356
6357 ofpbuf_uninit(&key);
6358 free(packet_str);
6fd6ed71 6359
623540e4
EJ
6360 ds_destroy(&ds);
6361 }
6362
8d8ab6c2
JG
6363 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6364 actions, wc, put_actions, dp->upcall_aux);
623540e4
EJ
6365}
6366
bde94613
FA
6367static inline uint32_t
6368dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6369 const struct miniflow *mf)
6370{
6371 uint32_t hash;
6372
6373 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6374 hash = dp_packet_get_rss_hash(packet);
6375 } else {
6376 hash = miniflow_hash_5tuple(mf, 0);
6377 dp_packet_set_rss_hash(packet, hash);
6378 }
6379
6380 return hash;
6381}
6382
9bbf1c3d 6383static inline uint32_t
048963aa
DDP
6384dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6385 const struct miniflow *mf)
9bbf1c3d 6386{
048963aa 6387 uint32_t hash, recirc_depth;
9bbf1c3d 6388
f2f44f5d
DDP
6389 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6390 hash = dp_packet_get_rss_hash(packet);
6391 } else {
9bbf1c3d 6392 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 6393 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 6394 }
048963aa
DDP
6395
6396 /* The RSS hash must account for the recirculation depth to avoid
6397 * collisions in the exact match cache */
6398 recirc_depth = *recirc_depth_get_unsafe();
6399 if (OVS_UNLIKELY(recirc_depth)) {
6400 hash = hash_finish(hash, recirc_depth);
048963aa 6401 }
9bbf1c3d
DDP
6402 return hash;
6403}
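/* Illustrative note (addition, not part of the upstream source): without the
 * hash_finish() step above, a packet recirculated by e.g. a tunnel pop or
 * conntrack action would keep the same RSS hash before and after
 * recirculation, so its pre- and post-recirculation flows would compete for
 * the same exact match cache slots.  Folding in the recirculation depth
 * gives each pass a distinct hash. */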
6404
f7ce4811 6405struct packet_batch_per_flow {
8cbf4f47
DDP
6406 unsigned int byte_count;
6407 uint16_t tcp_flags;
8cbf4f47
DDP
6408 struct dp_netdev_flow *flow;
6409
1895cc8d 6410 struct dp_packet_batch array;
8cbf4f47
DDP
6411};
6412
6413static inline void
f7ce4811
PS
6414packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6415 struct dp_packet *packet,
aab96ec4 6416 uint16_t tcp_flags)
8cbf4f47 6417{
cf62fa4c 6418 batch->byte_count += dp_packet_size(packet);
aab96ec4 6419 batch->tcp_flags |= tcp_flags;
940ac2ce 6420 dp_packet_batch_add(&batch->array, packet);
8cbf4f47
DDP
6421}
6422
6423static inline void
f7ce4811
PS
6424packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6425 struct dp_netdev_flow *flow)
8cbf4f47 6426{
11e5cf1f 6427 flow->batch = batch;
8cbf4f47 6428
11e5cf1f 6429 batch->flow = flow;
1895cc8d 6430 dp_packet_batch_init(&batch->array);
8cbf4f47
DDP
6431 batch->byte_count = 0;
6432 batch->tcp_flags = 0;
8cbf4f47
DDP
6433}
6434
6435static inline void
f7ce4811 6436packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
b010be17 6437 struct dp_netdev_pmd_thread *pmd)
8cbf4f47
DDP
6438{
6439 struct dp_netdev_actions *actions;
6440 struct dp_netdev_flow *flow = batch->flow;
6441
940ac2ce
PC
6442 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6443 batch->byte_count,
05f9e707 6444 batch->tcp_flags, pmd->ctx.now / 1000);
8cbf4f47
DDP
6445
6446 actions = dp_netdev_flow_get_actions(flow);
6447
66e4ad8a 6448 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
b010be17 6449 actions->actions, actions->size);
8cbf4f47
DDP
6450}
6451
8aaa125d 6452static inline void
e14deea0 6453dp_netdev_queue_batches(struct dp_packet *pkt,
aab96ec4 6454 struct dp_netdev_flow *flow, uint16_t tcp_flags,
47a45d86
KT
6455 struct packet_batch_per_flow *batches,
6456 size_t *n_batches)
9bbf1c3d 6457{
f7ce4811 6458 struct packet_batch_per_flow *batch = flow->batch;
11e5cf1f 6459
f9fe365b
AZ
6460 if (OVS_UNLIKELY(!batch)) {
6461 batch = &batches[(*n_batches)++];
f7ce4811 6462 packet_batch_per_flow_init(batch, flow);
9bbf1c3d
DDP
6463 }
6464
aab96ec4 6465 packet_batch_per_flow_update(batch, pkt, tcp_flags);
9bbf1c3d
DDP
6466}
6467
9b4f08cd
VDA
6468static inline void
6469packet_enqueue_to_flow_map(struct dp_packet *packet,
6470 struct dp_netdev_flow *flow,
6471 uint16_t tcp_flags,
6472 struct dp_packet_flow_map *flow_map,
6473 size_t index)
6474{
6475 struct dp_packet_flow_map *map = &flow_map[index];
6476 map->flow = flow;
6477 map->packet = packet;
6478 map->tcp_flags = tcp_flags;
6479}
6480
60d8ccae
YW
6481/* SMC lookup function for a batch of packets.
6482 * By batching the SMC lookups, we can use prefetching
6483 * to hide the memory access latency.
6484 */
6485static inline void
6486smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6487 struct netdev_flow_key *keys,
6488 struct netdev_flow_key **missed_keys,
6489 struct dp_packet_batch *packets_,
9b4f08cd
VDA
6490 const int cnt,
6491 struct dp_packet_flow_map *flow_map,
6492 uint8_t *index_map)
60d8ccae
YW
6493{
6494 int i;
6495 struct dp_packet *packet;
6496 size_t n_smc_hit = 0, n_missed = 0;
6497 struct dfc_cache *cache = &pmd->flow_cache;
6498 struct smc_cache *smc_cache = &cache->smc_cache;
6499 const struct cmap_node *flow_node;
9b4f08cd
VDA
6500 int recv_idx;
6501 uint16_t tcp_flags;
60d8ccae
YW
6502
6503 /* Prefetch buckets for all packets */
6504 for (i = 0; i < cnt; i++) {
6505 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6506 }
6507
6508 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6509 struct dp_netdev_flow *flow = NULL;
6510 flow_node = smc_entry_get(pmd, keys[i].hash);
6511 bool hit = false;
9b4f08cd
VDA
6512 /* Get the original order of this packet in received batch. */
6513 recv_idx = index_map[i];
60d8ccae
YW
6514
6515 if (OVS_LIKELY(flow_node != NULL)) {
6516 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6517 /* Since we don't have a per-port megaflow to check the port
6518 * number, we need to verify that the input ports match. */
6519 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6520 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
9b4f08cd
VDA
6521 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6522
60d8ccae 6523 /* SMC hit and EMC miss; insert the flow into the EMC. */
60d8ccae
YW
6524 keys[i].len =
6525 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
18e08953 6526 emc_probabilistic_insert(pmd, &keys[i], flow);
9b4f08cd
VDA
6527 /* Add these packets into the flow map in the same order
6528 * as received.
6529 */
6530 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6531 flow_map, recv_idx);
60d8ccae
YW
6532 n_smc_hit++;
6533 hit = true;
6534 break;
6535 }
6536 }
6537 if (hit) {
6538 continue;
6539 }
6540 }
6541
6542 /* SMC missed. Group missed packets together at
6543 * the beginning of the 'packets' array. */
6544 dp_packet_batch_refill(packets_, packet, i);
9b4f08cd
VDA
6545
6546 /* Preserve the order of packet for flow batching. */
6547 index_map[n_missed] = recv_idx;
6548
60d8ccae
YW
6549 /* Put the missed keys into the pointer array returned to the caller. */
6550 missed_keys[n_missed++] = &keys[i];
6551 }
6552
6553 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6554}
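/* Illustrative note (addition, not part of the upstream source): the
 * two-pass shape of smc_lookup_batch() above is what makes the prefetch
 * worthwhile: the first loop issues OVS_PREFETCH() for every
 * signature-match-cache bucket touched by the batch, and by the time the
 * second loop dereferences those buckets the cache lines are (ideally)
 * already in flight, so the per-packet memory latency is overlapped with
 * work on the rest of the batch instead of being paid serially. */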
6555
6556/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
a90ed026 6557 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8aaa125d 6558 * miniflow is copied into 'keys' and the packet pointer is moved to the
60d8ccae
YW
6559 * beginning of the 'packets' array. The pointers of missed keys are put in the
6560 * missed_keys pointer array for future processing.
9bbf1c3d
DDP
6561 *
6562 * The function returns the number of packets that needs to be processed in the
6563 * 'packets' array (they have been moved to the beginning of the vector).
a90ed026 6564 *
02305520
FA
6565 * For performance reasons a caller may choose not to initialize the metadata
6566 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6567 * is not valid and must be initialized by this function using 'port_no'.
6568 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6569 * will be ignored.
9bbf1c3d
DDP
6570 */
6571static inline size_t
60d8ccae 6572dfc_processing(struct dp_netdev_pmd_thread *pmd,
72c84bc2 6573 struct dp_packet_batch *packets_,
1895cc8d 6574 struct netdev_flow_key *keys,
60d8ccae 6575 struct netdev_flow_key **missed_keys,
f7ce4811 6576 struct packet_batch_per_flow batches[], size_t *n_batches,
9b4f08cd
VDA
6577 struct dp_packet_flow_map *flow_map,
6578 size_t *n_flows, uint8_t *index_map,
a90ed026 6579 bool md_is_valid, odp_port_t port_no)
72865317 6580{
b89c678b 6581 struct netdev_flow_key *key = &keys[0];
60d8ccae
YW
6582 size_t n_missed = 0, n_emc_hit = 0;
6583 struct dfc_cache *cache = &pmd->flow_cache;
72c84bc2 6584 struct dp_packet *packet;
45df9fef 6585 const size_t cnt = dp_packet_batch_size(packets_);
2fbadeb6 6586 uint32_t cur_min = pmd->ctx.emc_insert_min;
72c84bc2 6587 int i;
aab96ec4 6588 uint16_t tcp_flags;
60d8ccae 6589 bool smc_enable_db;
9b4f08cd
VDA
6590 size_t map_cnt = 0;
6591 bool batch_enable = true;
8cbf4f47 6592
60d8ccae 6593 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
82a48ead
JS
6594 pmd_perf_update_counter(&pmd->perf_stats,
6595 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6596 cnt);
f79b1ddb 6597
45df9fef 6598 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
9bbf1c3d 6599 struct dp_netdev_flow *flow;
aab96ec4 6600 uint32_t mark;
9bbf1c3d 6601
5a2fed48
AZ
6602 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6603 dp_packet_delete(packet);
a13a0209 6604 COVERAGE_INC(datapath_drop_rx_invalid_packet);
84d6d5eb
EJ
6605 continue;
6606 }
8cbf4f47 6607
45df9fef 6608 if (i != cnt - 1) {
72c84bc2 6609 struct dp_packet **packets = packets_->packets;
a90ed026 6610 /* Prefetch next packet data and metadata. */
72a5e2b8 6611 OVS_PREFETCH(dp_packet_data(packets[i+1]));
a90ed026 6612 pkt_metadata_prefetch_init(&packets[i+1]->md);
72a5e2b8
DDP
6613 }
6614
a90ed026
DDP
6615 if (!md_is_valid) {
6616 pkt_metadata_init(&packet->md, port_no);
6617 }
aab96ec4
YL
6618
6619 if ((*recirc_depth_get() == 0) &&
6620 dp_packet_has_flow_mark(packet, &mark)) {
6621 flow = mark_to_flow_find(pmd, mark);
9b4f08cd 6622 if (OVS_LIKELY(flow)) {
aab96ec4 6623 tcp_flags = parse_tcp_flags(packet);
9b4f08cd
VDA
6624 if (OVS_LIKELY(batch_enable)) {
6625 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6626 n_batches);
6627 } else {
6628 /* Flow batching should be performed only after fast-path
6629 * processing has also completed for the packets that missed
6630 * the EMC; otherwise packets belonging to the same datapath
6631 * flow could be reordered. */
6632 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6633 flow_map, map_cnt++);
6634 }
aab96ec4
YL
6635 continue;
6636 }
6637 }
6638
5a2fed48 6639 miniflow_extract(packet, &key->mf);
d262ac2c 6640 key->len = 0; /* Not computed yet. */
b137383e
IM
6641 key->hash =
6642 (md_is_valid == false)
6643 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6644 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6645
6646 /* If the EMC is disabled, skip emc_lookup(). */
6647 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
8aaa125d 6648 if (OVS_LIKELY(flow)) {
aab96ec4 6649 tcp_flags = miniflow_get_tcp_flags(&key->mf);
60d8ccae 6650 n_emc_hit++;
9b4f08cd
VDA
6651 if (OVS_LIKELY(batch_enable)) {
6652 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6653 n_batches);
6654 } else {
6655 /* Flow batching should be performed only after fast-path
6656 * processing has also completed for the packets that missed
6657 * the EMC; otherwise packets belonging to the same datapath
6658 * flow could be reordered. */
6659 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6660 flow_map, map_cnt++);
6661 }
8aaa125d 6662 } else {
d1aa0b94 6663 /* Exact match cache missed. Group missed packets together at
72c84bc2
AZ
6664 * the beginning of the 'packets' array. */
6665 dp_packet_batch_refill(packets_, packet, i);
9b4f08cd
VDA
6666
6667 /* Preserve the order of packet for flow batching. */
6668 index_map[n_missed] = map_cnt;
6669 flow_map[map_cnt++].flow = NULL;
6670
400486f7 6671 /* 'key[n_missed]' contains the key of the current packet and it
60d8ccae
YW
6672 * will be passed to the SMC lookup. The next key should be extracted
6673 * into 'keys[n_missed + 1]'.
6674 * We also maintain a pointer array for the keys that missed both the
6675 * EMC and the SMC; it is returned to the caller for further processing. */
6676 missed_keys[n_missed] = key;
400486f7 6677 key = &keys[++n_missed];
9b4f08cd
VDA
6678
6679 /* Skip batching for subsequent packets to avoid reordering. */
6680 batch_enable = false;
9bbf1c3d
DDP
6681 }
6682 }
9b4f08cd
VDA
6683 /* Count of packets that are not flow batched. */
6684 *n_flows = map_cnt;
9bbf1c3d 6685
60d8ccae
YW
6686 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6687
6688 if (!smc_enable_db) {
6689 return dp_packet_batch_size(packets_);
6690 }
6691
6692 /* Packets that missed the EMC are looked up in the SMC in one batch, if enabled. */
9b4f08cd
VDA
6693 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6694 n_missed, flow_map, index_map);
4f150744 6695
72c84bc2 6696 return dp_packet_batch_size(packets_);
9bbf1c3d
DDP
6697}
6698
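/* Editorial sketch (illustration only, not part of dpif-netdev.c; all names
 * below are hypothetical).  It mirrors the batch-refill pattern used by
 * dfc_processing() above: cache hits are handled immediately, while misses
 * are compacted to the front of the same array and their original positions
 * recorded in 'index_map' so the received order can be restored later.
 * Bursts are small (at most NETDEV_MAX_BURST packets), so 8-bit indices
 * are enough. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static size_t
example_compact_misses(int *items, size_t cnt, uint8_t *index_map,
                       bool (*hit)(int item))
{
    size_t n_missed = 0;

    for (size_t i = 0; i < cnt; i++) {
        if (hit(items[i])) {
            continue;                   /* Hit path: handled right away. */
        }
        index_map[n_missed] = i;        /* Remember the original position. */
        items[n_missed++] = items[i];   /* Move the miss to the front. */
    }
    return n_missed;                    /* Length of the compacted prefix. */
}
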
82a48ead 6699static inline int
47a45d86
KT
6700handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6701 struct dp_packet *packet,
a260d966 6702 const struct netdev_flow_key *key,
82a48ead 6703 struct ofpbuf *actions, struct ofpbuf *put_actions)
a260d966
PS
6704{
6705 struct ofpbuf *add_actions;
6706 struct dp_packet_batch b;
6707 struct match match;
6708 ovs_u128 ufid;
6709 int error;
79f36875 6710 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
a260d966
PS
6711
6712 match.tun_md.valid = false;
6713 miniflow_expand(&key->mf, &match.flow);
c98eedf9 6714 memset(&match.wc, 0, sizeof match.wc);
a260d966
PS
6715
6716 ofpbuf_clear(actions);
6717 ofpbuf_clear(put_actions);
6718
7a5e0ee7 6719 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
a260d966
PS
6720 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6721 &ufid, DPIF_UC_MISS, NULL, actions,
6722 put_actions);
6723 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6724 dp_packet_delete(packet);
a13a0209 6725 COVERAGE_INC(datapath_drop_upcall_error);
82a48ead 6726 return error;
a260d966
PS
6727 }
6728
6729 /* The Netlink encoding of datapath flow keys cannot express
6730 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6731 * tag is interpreted as exact match on the fact that there is no
6732 * VLAN. Unless we refactor a lot of code that translates between
6733 * Netlink and struct flow representations, we have to do the same
35fe9efb 6734 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
f0fb825a
EG
6735 if (!match.wc.masks.vlans[0].tci) {
6736 match.wc.masks.vlans[0].tci = htons(0xffff);
a260d966
PS
6737 }
6738
6739 /* We can't allow the packet batching in the next loop to execute
6740 * the actions. Otherwise, if there are any slow path actions,
6741 * we'll send the packet up twice. */
72c84bc2 6742 dp_packet_batch_init_packet(&b, packet);
66e4ad8a 6743 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
b010be17 6744 actions->data, actions->size);
a260d966
PS
6745
6746 add_actions = put_actions->size ? put_actions : actions;
6747 if (OVS_LIKELY(error != ENOSPC)) {
6748 struct dp_netdev_flow *netdev_flow;
6749
6750 /* XXX: There's a race window where a flow covering this packet
6751 * could have already been installed since we last did the flow
6752 * lookup before upcall. This could be solved by moving the
6753 * mutex lock outside the loop, but that's an awful long time
af741ca3 6754 * to be locking revalidators out of making flow modifications. */
a260d966 6755 ovs_mutex_lock(&pmd->flow_mutex);
3453b4d6 6756 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
a260d966
PS
6757 if (OVS_LIKELY(!netdev_flow)) {
6758 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6759 add_actions->data,
6760 add_actions->size);
6761 }
6762 ovs_mutex_unlock(&pmd->flow_mutex);
60d8ccae
YW
6763 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6764 smc_insert(pmd, key, hash);
4c30b246 6765 emc_probabilistic_insert(pmd, key, netdev_flow);
a260d966 6766 }
79f36875
JS
6767 if (pmd_perf_metrics_enabled(pmd)) {
6768 /* Update upcall stats. */
6769 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6770 struct pmd_perf_stats *s = &pmd->perf_stats;
6771 s->current.upcalls++;
6772 s->current.upcall_cycles += cycles;
6773 histogram_add_sample(&s->cycles_per_upcall, cycles);
6774 }
82a48ead 6775 return error;
a260d966
PS
6776}
6777
9bbf1c3d 6778static inline void
65f13b50 6779fast_path_processing(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6780 struct dp_packet_batch *packets_,
60d8ccae 6781 struct netdev_flow_key **keys,
9b4f08cd
VDA
6782 struct dp_packet_flow_map *flow_map,
6783 uint8_t *index_map,
b010be17 6784 odp_port_t in_port)
9bbf1c3d 6785{
31c82130 6786 const size_t cnt = dp_packet_batch_size(packets_);
1a0d5831 6787#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
6788 const size_t PKT_ARRAY_SIZE = cnt;
6789#else
1a0d5831 6790 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 6791 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 6792#endif
31c82130 6793 struct dp_packet *packet;
3453b4d6 6794 struct dpcls *cls;
0de8783a 6795 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50 6796 struct dp_netdev *dp = pmd->dp;
82a48ead 6797 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
3453b4d6 6798 int lookup_cnt = 0, add_lookup_cnt;
9bbf1c3d
DDP
6799 bool any_miss;
6800
e883448e 6801 for (size_t i = 0; i < cnt; i++) {
0de8783a 6802 /* Key length is needed in all the cases, hash computed on demand. */
60d8ccae 6803 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
9bbf1c3d 6804 }
3453b4d6
JS
6805 /* Get the classifier for the in_port */
6806 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6807 if (OVS_LIKELY(cls)) {
60d8ccae
YW
6808 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6809 rules, cnt, &lookup_cnt);
3453b4d6
JS
6810 } else {
6811 any_miss = true;
6812 memset(rules, 0, sizeof(rules));
6813 }
623540e4
EJ
6814 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6815 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6816 struct ofpbuf actions, put_actions;
623540e4
EJ
6817
6818 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6819 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6820
e883448e 6821 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
0de8783a 6822 struct dp_netdev_flow *netdev_flow;
623540e4 6823
0de8783a 6824 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
6825 continue;
6826 }
6827
6828 /* It's possible that an earlier slow path execution installed
0de8783a 6829 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 6830 * to catch it here than execute a miss. */
60d8ccae 6831 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
3453b4d6 6832 &add_lookup_cnt);
623540e4 6833 if (netdev_flow) {
3453b4d6 6834 lookup_cnt += add_lookup_cnt;
0de8783a 6835 rules[i] = &netdev_flow->cr;
623540e4
EJ
6836 continue;
6837 }
6838
60d8ccae 6839 int error = handle_packet_upcall(pmd, packet, keys[i],
82a48ead
JS
6840 &actions, &put_actions);
6841
6842 if (OVS_UNLIKELY(error)) {
6843 upcall_fail_cnt++;
6844 } else {
6845 upcall_ok_cnt++;
6846 }
623540e4
EJ
6847 }
6848
6849 ofpbuf_uninit(&actions);
6850 ofpbuf_uninit(&put_actions);
6851 fat_rwlock_unlock(&dp->upcall_rwlock);
ac8c2081 6852 } else if (OVS_UNLIKELY(any_miss)) {
e883448e 6853 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
0de8783a 6854 if (OVS_UNLIKELY(!rules[i])) {
31c82130 6855 dp_packet_delete(packet);
a13a0209 6856 COVERAGE_INC(datapath_drop_lock_error);
82a48ead 6857 upcall_fail_cnt++;
ac8c2081
DDP
6858 }
6859 }
623540e4 6860 }
84d6d5eb 6861
e883448e 6862 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
84d6d5eb 6863 struct dp_netdev_flow *flow;
9b4f08cd
VDA
6864 /* Get the original order of this packet in received batch. */
6865 int recv_idx = index_map[i];
6866 uint16_t tcp_flags;
8cbf4f47 6867
0de8783a 6868 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
6869 continue;
6870 }
6871
84d6d5eb 6872 flow = dp_netdev_flow_cast(rules[i]);
60d8ccae
YW
6873 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
6874 smc_insert(pmd, keys[i], hash);
0de8783a 6875
60d8ccae 6876 emc_probabilistic_insert(pmd, keys[i], flow);
9b4f08cd
VDA
6877 /* Add these packets into the flow map in the same order
6878 * as received.
6879 */
6880 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6881 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6882 flow_map, recv_idx);
8cbf4f47
DDP
6883 }
6884
82a48ead
JS
6885 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6886 cnt - upcall_ok_cnt - upcall_fail_cnt);
6887 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6888 lookup_cnt);
6889 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6890 upcall_ok_cnt);
6891 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6892 upcall_fail_cnt);
72865317
BP
6893}
6894
a90ed026
DDP
6895/* Packets enter the datapath from a port (or from recirculation) here.
6896 *
02305520
FA
6897 * When 'md_is_valid' is true, the metadata in 'packets' is already valid.
6898 * When false, the metadata in 'packets' needs to be initialized. */
adcf00ba 6899static void
a90ed026 6900dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6901 struct dp_packet_batch *packets,
a90ed026 6902 bool md_is_valid, odp_port_t port_no)
9bbf1c3d 6903{
1a0d5831 6904#if !defined(__CHECKER__) && !defined(_WIN32)
37eabc70 6905 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
9bbf1c3d 6906#else
1a0d5831 6907 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 6908 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 6909#endif
47a45d86
KT
6910 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6911 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
60d8ccae 6912 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
f7ce4811 6913 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
72c84bc2 6914 size_t n_batches;
9b4f08cd
VDA
6915 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6916 uint8_t index_map[PKT_ARRAY_SIZE];
6917 size_t n_flows, i;
6918
3453b4d6 6919 odp_port_t in_port;
9bbf1c3d 6920
8aaa125d 6921 n_batches = 0;
60d8ccae 6922 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
9b4f08cd
VDA
6923 flow_map, &n_flows, index_map, md_is_valid, port_no);
6924
72c84bc2 6925 if (!dp_packet_batch_is_empty(packets)) {
3453b4d6
JS
6926 /* Get ingress port from first packet's metadata. */
6927 in_port = packets->packets[0]->md.in_port.odp_port;
60d8ccae 6928 fast_path_processing(pmd, packets, missed_keys,
9b4f08cd 6929 flow_map, index_map, in_port);
8aaa125d
DDP
6930 }
6931
9b4f08cd
VDA
6932 /* Batch rest of packets which are in flow map. */
6933 for (i = 0; i < n_flows; i++) {
6934 struct dp_packet_flow_map *map = &flow_map[i];
6935
6936 if (OVS_UNLIKELY(!map->flow)) {
6937 continue;
6938 }
6939 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6940 batches, &n_batches);
6941 }
6942
ad9f0581
BB
6943 /* All the flow batches need to be reset before any call to
6944 * packet_batch_per_flow_execute() as it could potentially trigger
6945 * recirculation. When a packet matching flow 'j' happens to be
6946 * recirculated, the nested call to dp_netdev_input__() could potentially
6947 * classify the packet as matching another flow - say 'k'. It could happen
6948 * that in the previous call to dp_netdev_input__() that same flow 'k'
6949 * already had its own batches[k] still waiting to be served. So if its
6950 * 'batch' member is not reset, the recirculated packet would be wrongly
6951 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
603f2ce0
EJ
6952 for (i = 0; i < n_batches; i++) {
6953 batches[i].flow->batch = NULL;
6954 }
6955
8aaa125d 6956 for (i = 0; i < n_batches; i++) {
b010be17 6957 packet_batch_per_flow_execute(&batches[i], pmd);
9bbf1c3d
DDP
6958 }
6959}
6960
a90ed026
DDP
6961static void
6962dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6963 struct dp_packet_batch *packets,
a90ed026
DDP
6964 odp_port_t port_no)
6965{
3453b4d6 6966 dp_netdev_input__(pmd, packets, false, port_no);
a90ed026
DDP
6967}
6968
6969static void
6970dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
1895cc8d 6971 struct dp_packet_batch *packets)
a90ed026 6972{
3453b4d6 6973 dp_netdev_input__(pmd, packets, true, 0);
a90ed026
DDP
6974}
6975
9080a111 6976struct dp_netdev_execute_aux {
65f13b50 6977 struct dp_netdev_pmd_thread *pmd;
66e4ad8a 6978 const struct flow *flow;
9080a111
JR
6979};
6980
e4e74c3a
AW
6981static void
6982dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6983 void *aux)
6984{
6985 struct dp_netdev *dp = get_dp_netdev(dpif);
6986 dp->dp_purge_aux = aux;
6987 dp->dp_purge_cb = cb;
6988}
6989
6b31e073 6990static void
623540e4
EJ
6991dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6992 void *aux)
6b31e073
RW
6993{
6994 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 6995 dp->upcall_aux = aux;
6b31e073
RW
6996 dp->upcall_cb = cb;
6997}
6998
324c8374
IM
6999static void
7000dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
b010be17 7001 bool purge)
324c8374
IM
7002{
7003 struct tx_port *tx;
7004 struct dp_netdev_port *port;
7005 long long interval;
7006
57eebbb4 7007 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
9f7a3035 7008 if (!tx->port->dynamic_txqs) {
324c8374
IM
7009 continue;
7010 }
b010be17 7011 interval = pmd->ctx.now - tx->last_used;
05f9e707 7012 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
324c8374
IM
7013 port = tx->port;
7014 ovs_mutex_lock(&port->txq_used_mutex);
7015 port->txq_used[tx->qid]--;
7016 ovs_mutex_unlock(&port->txq_used_mutex);
7017 tx->qid = -1;
7018 }
7019 }
7020}
7021
7022static int
7023dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
b010be17 7024 struct tx_port *tx)
324c8374
IM
7025{
7026 struct dp_netdev_port *port;
7027 long long interval;
7028 int i, min_cnt, min_qid;
7029
b010be17
IM
7030 interval = pmd->ctx.now - tx->last_used;
7031 tx->last_used = pmd->ctx.now;
324c8374 7032
05f9e707 7033 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
324c8374
IM
7034 return tx->qid;
7035 }
7036
7037 port = tx->port;
7038
7039 ovs_mutex_lock(&port->txq_used_mutex);
7040 if (tx->qid >= 0) {
7041 port->txq_used[tx->qid]--;
7042 tx->qid = -1;
7043 }
7044
7045 min_cnt = -1;
7046 min_qid = 0;
7047 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7048 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7049 min_cnt = port->txq_used[i];
7050 min_qid = i;
7051 }
7052 }
7053
7054 port->txq_used[min_qid]++;
7055 tx->qid = min_qid;
7056
7057 ovs_mutex_unlock(&port->txq_used_mutex);
7058
b010be17 7059 dpif_netdev_xps_revalidate_pmd(pmd, false);
324c8374
IM
7060
7061 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7062 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7063 return min_qid;
7064}
7065
d0cca6c3 7066static struct tx_port *
57eebbb4
DDP
7067pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7068 odp_port_t port_no)
7069{
7070 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7071}
7072
7073static struct tx_port *
7074pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7075 odp_port_t port_no)
d0cca6c3 7076{
57eebbb4 7077 return tx_port_lookup(&pmd->send_port_cache, port_no);
d0cca6c3
DDP
7078}
7079
a36de779 7080static int
d0cca6c3 7081push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
1895cc8d
PS
7082 const struct nlattr *attr,
7083 struct dp_packet_batch *batch)
a36de779 7084{
d0cca6c3 7085 struct tx_port *tun_port;
a36de779 7086 const struct ovs_action_push_tnl *data;
4c742796 7087 int err;
a36de779
PS
7088
7089 data = nl_attr_get(attr);
7090
81765c00 7091 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
a36de779 7092 if (!tun_port) {
4c742796
PS
7093 err = -EINVAL;
7094 goto error;
a36de779 7095 }
324c8374 7096 err = netdev_push_header(tun_port->port->netdev, batch, data);
4c742796
PS
7097 if (!err) {
7098 return 0;
7099 }
7100error:
7101 dp_packet_delete_batch(batch, true);
7102 return err;
a36de779
PS
7103}
7104
66525ef3
PS
7105static void
7106dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7d7ded7a 7107 struct dp_packet *packet, bool should_steal,
66525ef3
PS
7108 struct flow *flow, ovs_u128 *ufid,
7109 struct ofpbuf *actions,
b010be17 7110 const struct nlattr *userdata)
66525ef3
PS
7111{
7112 struct dp_packet_batch b;
7113 int error;
7114
7115 ofpbuf_clear(actions);
7116
7117 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7118 DPIF_UC_ACTION, userdata, actions,
7119 NULL);
7120 if (!error || error == ENOSPC) {
72c84bc2 7121 dp_packet_batch_init_packet(&b, packet);
7d7ded7a 7122 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
b010be17 7123 actions->data, actions->size);
7d7ded7a 7124 } else if (should_steal) {
66525ef3 7125 dp_packet_delete(packet);
a13a0209 7126 COVERAGE_INC(datapath_drop_userspace_action_error);
66525ef3
PS
7127 }
7128}
7129
a36de779 7130static void
1895cc8d 7131dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7d7ded7a 7132 const struct nlattr *a, bool should_steal)
4b27db64 7133 OVS_NO_THREAD_SAFETY_ANALYSIS
9080a111
JR
7134{
7135 struct dp_netdev_execute_aux *aux = aux_;
623540e4 7136 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
7137 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7138 struct dp_netdev *dp = pmd->dp;
09f9da0b 7139 int type = nl_attr_type(a);
d0cca6c3 7140 struct tx_port *p;
a13a0209 7141 uint32_t packet_count, packets_dropped;
9080a111 7142
09f9da0b
JR
7143 switch ((enum ovs_action_attr)type) {
7144 case OVS_ACTION_ATTR_OUTPUT:
57eebbb4 7145 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
26a5075b 7146 if (OVS_LIKELY(p)) {
009e0033
IM
7147 struct dp_packet *packet;
7148 struct dp_packet_batch out;
347ba9bb 7149
7d7ded7a 7150 if (!should_steal) {
009e0033
IM
7151 dp_packet_batch_clone(&out, packets_);
7152 dp_packet_batch_reset_cutlen(packets_);
7153 packets_ = &out;
324c8374 7154 }
009e0033 7155 dp_packet_batch_apply_cutlen(packets_);
347ba9bb 7156
009e0033
IM
7157#ifdef DPDK_NETDEV
7158 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7159 && packets_->packets[0]->source
7160 != p->output_pkts.packets[0]->source)) {
7161 /* XXX: netdev-dpdk assumes that all packets in a single
7162 * output batch have the same source. Flush here to
7163 * avoid memory access issues. */
7164 dp_netdev_pmd_flush_output_on_port(pmd, p);
7165 }
7166#endif
c71ea3c4
IM
7167 if (dp_packet_batch_size(&p->output_pkts)
7168 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7169 /* Flush here to avoid overflow. */
009e0033
IM
7170 dp_netdev_pmd_flush_output_on_port(pmd, p);
7171 }
c71ea3c4
IM
7172
7173 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7174 pmd->n_output_batches++;
7175 }
7176
e883448e 7177 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
58ed6df0
IM
7178 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7179 pmd->ctx.last_rxq;
009e0033
IM
7180 dp_packet_batch_add(&p->output_pkts, packet);
7181 }
ac8c2081 7182 return;
a13a0209
AT
7183 } else {
7184 COVERAGE_ADD(datapath_drop_invalid_port,
7185 dp_packet_batch_size(packets_));
8a4e3a85 7186 }
09f9da0b
JR
7187 break;
7188
a36de779 7189 case OVS_ACTION_ATTR_TUNNEL_PUSH:
47e1b3b6
IM
7190 if (should_steal) {
7191 /* We're requested to push the tunnel header, but we also need to take
7192 * ownership of these packets. Thus, we can avoid performing
7193 * the action, because the caller will not use the result anyway.
7194 * Just break to free the batch. */
7195 break;
a36de779 7196 }
47e1b3b6 7197 dp_packet_batch_apply_cutlen(packets_);
a13a0209
AT
7198 packet_count = dp_packet_batch_size(packets_);
7199 if (push_tnl_action(pmd, a, packets_)) {
7200 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7201 packet_count);
7202 }
47e1b3b6 7203 return;
a36de779
PS
7204
7205 case OVS_ACTION_ATTR_TUNNEL_POP:
7206 if (*depth < MAX_RECIRC_DEPTH) {
aaca4fe0 7207 struct dp_packet_batch *orig_packets_ = packets_;
8611f9a4 7208 odp_port_t portno = nl_attr_get_odp_port(a);
a36de779 7209
57eebbb4 7210 p = pmd_tnl_port_cache_lookup(pmd, portno);
a36de779 7211 if (p) {
1895cc8d 7212 struct dp_packet_batch tnl_pkt;
a36de779 7213
7d7ded7a 7214 if (!should_steal) {
aaca4fe0
WT
7215 dp_packet_batch_clone(&tnl_pkt, packets_);
7216 packets_ = &tnl_pkt;
7217 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
7218 }
7219
aaca4fe0
WT
7220 dp_packet_batch_apply_cutlen(packets_);
7221
a13a0209 7222 packet_count = dp_packet_batch_size(packets_);
324c8374 7223 netdev_pop_header(p->port->netdev, packets_);
a13a0209
AT
7224 packets_dropped =
7225 packet_count - dp_packet_batch_size(packets_);
7226 if (packets_dropped) {
7227 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7228 packets_dropped);
7229 }
72c84bc2 7230 if (dp_packet_batch_is_empty(packets_)) {
1c8f98d9
PS
7231 return;
7232 }
9235b479 7233
72c84bc2 7234 struct dp_packet *packet;
e883448e 7235 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7236 packet->md.in_port.odp_port = portno;
a36de779 7237 }
9235b479
PS
7238
7239 (*depth)++;
7240 dp_netdev_recirculate(pmd, packets_);
7241 (*depth)--;
a36de779
PS
7242 return;
7243 }
a13a0209
AT
7244 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7245 dp_packet_batch_size(packets_));
7246 } else {
7247 COVERAGE_ADD(datapath_drop_recirc_error,
7248 dp_packet_batch_size(packets_));
a36de779
PS
7249 }
7250 break;
7251
623540e4
EJ
7252 case OVS_ACTION_ATTR_USERSPACE:
7253 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
aaca4fe0 7254 struct dp_packet_batch *orig_packets_ = packets_;
623540e4 7255 const struct nlattr *userdata;
aaca4fe0 7256 struct dp_packet_batch usr_pkt;
623540e4
EJ
7257 struct ofpbuf actions;
7258 struct flow flow;
7af12bd7 7259 ovs_u128 ufid;
aaca4fe0 7260 bool clone = false;
4fc65926 7261
623540e4
EJ
7262 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7263 ofpbuf_init(&actions, 0);
8cbf4f47 7264
aaca4fe0 7265 if (packets_->trunc) {
7d7ded7a 7266 if (!should_steal) {
aaca4fe0
WT
7267 dp_packet_batch_clone(&usr_pkt, packets_);
7268 packets_ = &usr_pkt;
aaca4fe0
WT
7269 clone = true;
7270 dp_packet_batch_reset_cutlen(orig_packets_);
7271 }
7272
7273 dp_packet_batch_apply_cutlen(packets_);
7274 }
7275
72c84bc2 7276 struct dp_packet *packet;
e883448e 7277 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7278 flow_extract(packet, &flow);
7a5e0ee7 7279 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7d7ded7a 7280 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
b010be17 7281 &ufid, &actions, userdata);
db73f716 7282 }
aaca4fe0
WT
7283
7284 if (clone) {
7285 dp_packet_delete_batch(packets_, true);
7286 }
7287
623540e4
EJ
7288 ofpbuf_uninit(&actions);
7289 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 7290
ac8c2081
DDP
7291 return;
7292 }
a13a0209
AT
7293 COVERAGE_ADD(datapath_drop_lock_error,
7294 dp_packet_batch_size(packets_));
09f9da0b 7295 break;
572f732a 7296
adcf00ba
AZ
7297 case OVS_ACTION_ATTR_RECIRC:
7298 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 7299 struct dp_packet_batch recirc_pkts;
572f732a 7300
7d7ded7a 7301 if (!should_steal) {
1895cc8d
PS
7302 dp_packet_batch_clone(&recirc_pkts, packets_);
7303 packets_ = &recirc_pkts;
28e2fa02 7304 }
8cbf4f47 7305
72c84bc2 7306 struct dp_packet *packet;
e883448e 7307 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
72c84bc2 7308 packet->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 7309 }
28e2fa02
DDP
7310
7311 (*depth)++;
1895cc8d 7312 dp_netdev_recirculate(pmd, packets_);
adcf00ba
AZ
7313 (*depth)--;
7314
ac8c2081 7315 return;
adcf00ba 7316 }
ac8c2081 7317
a13a0209
AT
7318 COVERAGE_ADD(datapath_drop_recirc_error,
7319 dp_packet_batch_size(packets_));
ac8c2081 7320 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 7321 break;
572f732a 7322
5cf3edb3
DDP
7323 case OVS_ACTION_ATTR_CT: {
7324 const struct nlattr *b;
a76a37ef 7325 bool force = false;
5cf3edb3
DDP
7326 bool commit = false;
7327 unsigned int left;
7328 uint16_t zone = 0;
7329 const char *helper = NULL;
7330 const uint32_t *setmark = NULL;
7331 const struct ovs_key_ct_labels *setlabel = NULL;
4cddb1f0
DB
7332 struct nat_action_info_t nat_action_info;
7333 struct nat_action_info_t *nat_action_info_ref = NULL;
7334 bool nat_config = false;
5cf3edb3
DDP
7335
7336 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7337 nl_attr_get_size(a)) {
7338 enum ovs_ct_attr sub_type = nl_attr_type(b);
7339
7340 switch(sub_type) {
b80e259f 7341 case OVS_CT_ATTR_FORCE_COMMIT:
a76a37ef
JR
7342 force = true;
7343 /* fall through. */
5cf3edb3
DDP
7344 case OVS_CT_ATTR_COMMIT:
7345 commit = true;
7346 break;
7347 case OVS_CT_ATTR_ZONE:
7348 zone = nl_attr_get_u16(b);
7349 break;
7350 case OVS_CT_ATTR_HELPER:
7351 helper = nl_attr_get_string(b);
7352 break;
7353 case OVS_CT_ATTR_MARK:
7354 setmark = nl_attr_get(b);
7355 break;
7356 case OVS_CT_ATTR_LABELS:
7357 setlabel = nl_attr_get(b);
7358 break;
8e83854c
JR
7359 case OVS_CT_ATTR_EVENTMASK:
7360 /* Silently ignored, as userspace datapath does not generate
7361 * netlink events. */
7362 break;
ebe62ec1
YHW
7363 case OVS_CT_ATTR_TIMEOUT:
7364 /* Userspace datapath does not support customized timeout
7365 * policy yet. */
7366 break;
4cddb1f0
DB
7367 case OVS_CT_ATTR_NAT: {
7368 const struct nlattr *b_nest;
7369 unsigned int left_nest;
7370 bool ip_min_specified = false;
7371 bool proto_num_min_specified = false;
7372 bool ip_max_specified = false;
7373 bool proto_num_max_specified = false;
7374 memset(&nat_action_info, 0, sizeof nat_action_info);
7375 nat_action_info_ref = &nat_action_info;
7376
7377 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7378 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7379
7380 switch (sub_type_nest) {
7381 case OVS_NAT_ATTR_SRC:
7382 case OVS_NAT_ATTR_DST:
7383 nat_config = true;
7384 nat_action_info.nat_action |=
7385 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7386 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7387 break;
7388 case OVS_NAT_ATTR_IP_MIN:
7389 memcpy(&nat_action_info.min_addr,
7390 nl_attr_get(b_nest),
7391 nl_attr_get_size(b_nest));
7392 ip_min_specified = true;
7393 break;
7394 case OVS_NAT_ATTR_IP_MAX:
7395 memcpy(&nat_action_info.max_addr,
7396 nl_attr_get(b_nest),
7397 nl_attr_get_size(b_nest));
7398 ip_max_specified = true;
7399 break;
7400 case OVS_NAT_ATTR_PROTO_MIN:
7401 nat_action_info.min_port =
7402 nl_attr_get_u16(b_nest);
7403 proto_num_min_specified = true;
7404 break;
7405 case OVS_NAT_ATTR_PROTO_MAX:
7406 nat_action_info.max_port =
7407 nl_attr_get_u16(b_nest);
7408 proto_num_max_specified = true;
7409 break;
7410 case OVS_NAT_ATTR_PERSISTENT:
7411 case OVS_NAT_ATTR_PROTO_HASH:
7412 case OVS_NAT_ATTR_PROTO_RANDOM:
7413 break;
7414 case OVS_NAT_ATTR_UNSPEC:
7415 case __OVS_NAT_ATTR_MAX:
7416 OVS_NOT_REACHED();
7417 }
7418 }
7419
7420 if (ip_min_specified && !ip_max_specified) {
7421 nat_action_info.max_addr = nat_action_info.min_addr;
7422 }
7423 if (proto_num_min_specified && !proto_num_max_specified) {
7424 nat_action_info.max_port = nat_action_info.min_port;
7425 }
7426 if (proto_num_min_specified || proto_num_max_specified) {
7427 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7428 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7429 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7430 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7431 }
7432 }
7433 break;
7434 }
5cf3edb3
DDP
7435 case OVS_CT_ATTR_UNSPEC:
7436 case __OVS_CT_ATTR_MAX:
7437 OVS_NOT_REACHED();
7438 }
7439 }
7440
4cddb1f0
DB
7441 /* We won't be able to function properly in this case, hence
7442 * complain loudly. */
7443 if (nat_config && !commit) {
7444 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7445 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7446 }
7447
57593fd2 7448 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
bd7d93f8 7449 commit, zone, setmark, setlabel, aux->flow->tp_src,
b010be17 7450 aux->flow->tp_dst, helper, nat_action_info_ref,
05f9e707 7451 pmd->ctx.now / 1000);
07659514 7452 break;
5cf3edb3 7453 }
07659514 7454
5dddf960 7455 case OVS_ACTION_ATTR_METER:
4b27db64 7456 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
b010be17 7457 pmd->ctx.now);
4b27db64
JR
7458 break;
7459
09f9da0b
JR
7460 case OVS_ACTION_ATTR_PUSH_VLAN:
7461 case OVS_ACTION_ATTR_POP_VLAN:
7462 case OVS_ACTION_ATTR_PUSH_MPLS:
7463 case OVS_ACTION_ATTR_POP_MPLS:
7464 case OVS_ACTION_ATTR_SET:
6d670e7f 7465 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 7466 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 7467 case OVS_ACTION_ATTR_HASH:
09f9da0b 7468 case OVS_ACTION_ATTR_UNSPEC:
aaca4fe0 7469 case OVS_ACTION_ATTR_TRUNC:
6fcecb85
YY
7470 case OVS_ACTION_ATTR_PUSH_ETH:
7471 case OVS_ACTION_ATTR_POP_ETH:
535e3acf 7472 case OVS_ACTION_ATTR_CLONE:
f59cb331
YY
7473 case OVS_ACTION_ATTR_PUSH_NSH:
7474 case OVS_ACTION_ATTR_POP_NSH:
1fe178d2 7475 case OVS_ACTION_ATTR_CT_CLEAR:
5b34f8fc 7476 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
a13a0209 7477 case OVS_ACTION_ATTR_DROP:
09f9da0b
JR
7478 case __OVS_ACTION_ATTR_MAX:
7479 OVS_NOT_REACHED();
da546e07 7480 }
ac8c2081 7481
7d7ded7a 7482 dp_packet_delete_batch(packets_, should_steal);
98403001
BP
7483}
7484
4edb9ae9 7485static void
65f13b50 7486dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 7487 struct dp_packet_batch *packets,
7d7ded7a 7488 bool should_steal, const struct flow *flow,
b010be17 7489 const struct nlattr *actions, size_t actions_len)
72865317 7490{
b010be17 7491 struct dp_netdev_execute_aux aux = { pmd, flow };
9080a111 7492
7d7ded7a 7493 odp_execute_actions(&aux, packets, should_steal, actions,
8cbf4f47 7494 actions_len, dp_execute_cb);
72865317
BP
7495}
7496
4d4e68ed
DDP
7497struct dp_netdev_ct_dump {
7498 struct ct_dpif_dump_state up;
7499 struct conntrack_dump dump;
7500 struct conntrack *ct;
7501 struct dp_netdev *dp;
7502};
7503
7504static int
7505dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
ded30c74 7506 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
7507{
7508 struct dp_netdev *dp = get_dp_netdev(dpif);
7509 struct dp_netdev_ct_dump *dump;
7510
7511 dump = xzalloc(sizeof *dump);
7512 dump->dp = dp;
57593fd2 7513 dump->ct = dp->conntrack;
4d4e68ed 7514
57593fd2 7515 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
4d4e68ed
DDP
7516
7517 *dump_ = &dump->up;
7518
7519 return 0;
7520}
7521
7522static int
7523dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7524 struct ct_dpif_dump_state *dump_,
7525 struct ct_dpif_entry *entry)
7526{
7527 struct dp_netdev_ct_dump *dump;
7528
7529 INIT_CONTAINER(dump, dump_, up);
7530
7531 return conntrack_dump_next(&dump->dump, entry);
7532}
7533
7534static int
7535dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7536 struct ct_dpif_dump_state *dump_)
7537{
7538 struct dp_netdev_ct_dump *dump;
7539 int err;
7540
7541 INIT_CONTAINER(dump, dump_, up);
7542
7543 err = conntrack_dump_done(&dump->dump);
7544
7545 free(dump);
7546
7547 return err;
7548}
7549
5d9cbb4c 7550static int
817a7657
YHW
7551dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7552 const struct ct_dpif_tuple *tuple)
5d9cbb4c
DDP
7553{
7554 struct dp_netdev *dp = get_dp_netdev(dpif);
7555
817a7657 7556 if (tuple) {
57593fd2 7557 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
817a7657 7558 }
57593fd2 7559 return conntrack_flush(dp->conntrack, zone);
5d9cbb4c
DDP
7560}
7561
c92339ad
DB
7562static int
7563dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7564{
7565 struct dp_netdev *dp = get_dp_netdev(dpif);
7566
57593fd2 7567 return conntrack_set_maxconns(dp->conntrack, maxconns);
c92339ad
DB
7568}
7569
7570static int
7571dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7572{
7573 struct dp_netdev *dp = get_dp_netdev(dpif);
7574
57593fd2 7575 return conntrack_get_maxconns(dp->conntrack, maxconns);
c92339ad
DB
7576}
7577
875075b3
DB
7578static int
7579dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7580{
7581 struct dp_netdev *dp = get_dp_netdev(dpif);
7582
57593fd2 7583 return conntrack_get_nconns(dp->conntrack, nconns);
875075b3
DB
7584}
7585
64207120
DB
7586static int
7587dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
7588{
7589 struct dp_netdev *dp = get_dp_netdev(dpif);
7590
7591 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
7592}
7593
7594static int
7595dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
7596{
7597 struct dp_netdev *dp = get_dp_netdev(dpif);
7598 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
7599 return 0;
7600}
7601
a7f33fdb
DB
7602static int
7603dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
7604 const uint32_t *default_limits,
7605 const struct ovs_list *zone_limits)
7606{
7607 int err = 0;
7608 struct dp_netdev *dp = get_dp_netdev(dpif);
7609 if (default_limits) {
7610 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
7611 if (err != 0) {
7612 return err;
7613 }
7614 }
7615
7616 struct ct_dpif_zone_limit *zone_limit;
7617 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7618 err = zone_limit_update(dp->conntrack, zone_limit->zone,
7619 zone_limit->limit);
7620 if (err != 0) {
7621 break;
7622 }
7623 }
7624 return err;
7625}
7626
7627static int
7628dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
7629 uint32_t *default_limit,
7630 const struct ovs_list *zone_limits_request,
7631 struct ovs_list *zone_limits_reply)
7632{
7633 struct dp_netdev *dp = get_dp_netdev(dpif);
7634 struct conntrack_zone_limit czl;
7635
7636 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
7637 if (czl.zone == DEFAULT_ZONE) {
7638 *default_limit = czl.limit;
7639 } else {
7640 return EINVAL;
7641 }
7642
7643 if (!ovs_list_is_empty(zone_limits_request)) {
7644 struct ct_dpif_zone_limit *zone_limit;
7645 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
7646 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
7647 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
7648 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
7649 czl.limit, czl.count);
7650 } else {
7651 return EINVAL;
7652 }
7653 }
7654 } else {
7655 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
7656 czl = zone_limit_get(dp->conntrack, z);
7657 if (czl.zone == z) {
7658 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
7659 czl.count);
7660 }
7661 }
7662 }
7663
7664 return 0;
7665}
7666
7667static int
7668dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
7669 const struct ovs_list *zone_limits)
7670{
7671 int err = 0;
7672 struct dp_netdev *dp = get_dp_netdev(dpif);
7673 struct ct_dpif_zone_limit *zone_limit;
7674 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7675 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
7676 if (err != 0) {
7677 break;
7678 }
7679 }
7680
7681 return err;
7682}
7683
4ea96698
DB
7684static int
7685dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
7686{
7687 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7688 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
4ea96698
DB
7689}
7690
7691static int
7692dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
7693{
7694 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7695 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
4ea96698
DB
7696}
7697
7698static int
7699dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
7700{
7701 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7702 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
4ea96698
DB
7703}
7704
7705/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
7706 * diverge. */
7707static int
7708dpif_netdev_ipf_get_status(struct dpif *dpif,
7709 struct dpif_ipf_status *dpif_ipf_status)
7710{
7711 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7712 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
4ea96698
DB
7713 (struct ipf_status *) dpif_ipf_status);
7714 return 0;
7715}
7716
7717static int
7718dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
7719 struct ipf_dump_ctx **ipf_dump_ctx)
7720{
7721 return ipf_dump_start(ipf_dump_ctx);
7722}
7723
7724static int
7725dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
7726{
7727 struct dp_netdev *dp = get_dp_netdev(dpif);
57593fd2 7728 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
4ea96698
DB
7729 dump);
7730}
7731
7732static int
7733dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
7734{
7735 return ipf_dump_done(ipf_dump_ctx);
7736
7737}
7738
72865317 7739const struct dpif_class dpif_netdev_class = {
72865317 7740 "netdev",
f87c1357 7741 true, /* cleanup_required */
6553d06b 7742 dpif_netdev_init,
2197d7ab 7743 dpif_netdev_enumerate,
0aeaabc8 7744 dpif_netdev_port_open_type,
72865317
BP
7745 dpif_netdev_open,
7746 dpif_netdev_close,
7dab847a 7747 dpif_netdev_destroy,
e4cfed38
PS
7748 dpif_netdev_run,
7749 dpif_netdev_wait,
72865317 7750 dpif_netdev_get_stats,
dcdcad68 7751 NULL, /* set_features */
72865317
BP
7752 dpif_netdev_port_add,
7753 dpif_netdev_port_del,
3eb67853 7754 dpif_netdev_port_set_config,
72865317
BP
7755 dpif_netdev_port_query_by_number,
7756 dpif_netdev_port_query_by_name,
98403001 7757 NULL, /* port_get_pid */
b0ec0f27
BP
7758 dpif_netdev_port_dump_start,
7759 dpif_netdev_port_dump_next,
7760 dpif_netdev_port_dump_done,
72865317
BP
7761 dpif_netdev_port_poll,
7762 dpif_netdev_port_poll_wait,
72865317 7763 dpif_netdev_flow_flush,
ac64794a
BP
7764 dpif_netdev_flow_dump_create,
7765 dpif_netdev_flow_dump_destroy,
7766 dpif_netdev_flow_dump_thread_create,
7767 dpif_netdev_flow_dump_thread_destroy,
704a1e09 7768 dpif_netdev_flow_dump_next,
1a0c894a 7769 dpif_netdev_operate,
6b31e073
RW
7770 NULL, /* recv_set */
7771 NULL, /* handlers_set */
d4f6865c 7772 dpif_netdev_set_config,
5bf93d67 7773 dpif_netdev_queue_to_priority,
6b31e073
RW
7774 NULL, /* recv */
7775 NULL, /* recv_wait */
7776 NULL, /* recv_purge */
e4e74c3a 7777 dpif_netdev_register_dp_purge_cb,
6b31e073
RW
7778 dpif_netdev_register_upcall_cb,
7779 dpif_netdev_enable_upcall,
7780 dpif_netdev_disable_upcall,
b5cbbcf6 7781 dpif_netdev_get_datapath_version,
4d4e68ed
DDP
7782 dpif_netdev_ct_dump_start,
7783 dpif_netdev_ct_dump_next,
7784 dpif_netdev_ct_dump_done,
5d9cbb4c 7785 dpif_netdev_ct_flush,
c92339ad
DB
7786 dpif_netdev_ct_set_maxconns,
7787 dpif_netdev_ct_get_maxconns,
875075b3 7788 dpif_netdev_ct_get_nconns,
64207120
DB
7789 dpif_netdev_ct_set_tcp_seq_chk,
7790 dpif_netdev_ct_get_tcp_seq_chk,
a7f33fdb
DB
7791 dpif_netdev_ct_set_limits,
7792 dpif_netdev_ct_get_limits,
7793 dpif_netdev_ct_del_limits,
1f161318
YHW
7794 NULL, /* ct_set_timeout_policy */
7795 NULL, /* ct_get_timeout_policy */
7796 NULL, /* ct_del_timeout_policy */
7797 NULL, /* ct_timeout_policy_dump_start */
7798 NULL, /* ct_timeout_policy_dump_next */
7799 NULL, /* ct_timeout_policy_dump_done */
187bb41f 7800 NULL, /* ct_get_timeout_policy_name */
4ea96698
DB
7801 dpif_netdev_ipf_set_enabled,
7802 dpif_netdev_ipf_set_min_frag,
7803 dpif_netdev_ipf_set_max_nfrags,
7804 dpif_netdev_ipf_get_status,
7805 dpif_netdev_ipf_dump_start,
7806 dpif_netdev_ipf_dump_next,
7807 dpif_netdev_ipf_dump_done,
5dddf960
JR
7808 dpif_netdev_meter_get_features,
7809 dpif_netdev_meter_set,
7810 dpif_netdev_meter_get,
7811 dpif_netdev_meter_del,
72865317 7812};
614c4892 7813
74cc3969
BP
7814static void
7815dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
7816 const char *argv[], void *aux OVS_UNUSED)
7817{
e9985d6a 7818 struct dp_netdev_port *port;
74cc3969 7819 struct dp_netdev *dp;
ff073a71 7820 odp_port_t port_no;
74cc3969 7821
8a4e3a85 7822 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
7823 dp = shash_find_data(&dp_netdevs, argv[1]);
7824 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 7825 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
7826 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
7827 return;
7828 }
8a4e3a85
BP
7829 ovs_refcount_ref(&dp->ref_cnt);
7830 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 7831
59e6d833 7832 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 7833 if (get_port_by_name(dp, argv[2], &port)) {
74cc3969 7834 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 7835 goto exit;
74cc3969
BP
7836 }
7837
ff073a71
BP
7838 port_no = u32_to_odp(atoi(argv[3]));
7839 if (!port_no || port_no == ODPP_NONE) {
74cc3969 7840 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 7841 goto exit;
74cc3969 7842 }
ff073a71 7843 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 7844 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 7845 goto exit;
74cc3969 7846 }
59e6d833 7847
e9985d6a
DDP
7848 /* Remove port. */
7849 hmap_remove(&dp->ports, &port->node);
e32971b8 7850 reconfigure_datapath(dp);
59e6d833 7851
e9985d6a
DDP
7852 /* Reinsert with new port number. */
7853 port->port_no = port_no;
7854 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
e32971b8 7855 reconfigure_datapath(dp);
59e6d833 7856
d33ed218 7857 seq_change(dp->port_seq);
74cc3969 7858 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
7859
7860exit:
59e6d833 7861 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 7862 dp_netdev_unref(dp);
74cc3969
BP
7863}
7864
0cbfe35d
BP
7865static void
7866dpif_dummy_register__(const char *type)
7867{
7868 struct dpif_class *class;
7869
7870 class = xmalloc(sizeof *class);
7871 *class = dpif_netdev_class;
7872 class->type = xstrdup(type);
7873 dp_register_provider(class);
7874}
7875
8420c7ad
BP
7876static void
7877dpif_dummy_override(const char *type)
7878{
65d43fdc
YT
7879 int error;
7880
7881 /*
7882 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
7883 * a userland-only build. It's useful for the testsuite.
7884 */
7885 error = dp_unregister_provider(type);
7886 if (error == 0 || error == EAFNOSUPPORT) {
8420c7ad
BP
7887 dpif_dummy_register__(type);
7888 }
7889}
7890
614c4892 7891void
8420c7ad 7892dpif_dummy_register(enum dummy_level level)
614c4892 7893{
8420c7ad 7894 if (level == DUMMY_OVERRIDE_ALL) {
0cbfe35d
BP
7895 struct sset types;
7896 const char *type;
7897
7898 sset_init(&types);
7899 dp_enumerate_types(&types);
7900 SSET_FOR_EACH (type, &types) {
8420c7ad 7901 dpif_dummy_override(type);
0cbfe35d
BP
7902 }
7903 sset_destroy(&types);
8420c7ad
BP
7904 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7905 dpif_dummy_override("system");
614c4892 7906 }
0cbfe35d
BP
7907
7908 dpif_dummy_register__("dummy");
74cc3969
BP
7909
7910 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 7911 "dp port new-number",
74cc3969 7912 3, 3, dpif_dummy_change_port_number, NULL);
614c4892 7913}
0de8783a
JR
7914\f
7915/* Datapath Classifier. */
7916
0fcf0776
ZF
7917static void
7918dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
7919{
7920 cmap_destroy(&subtable->rules);
a0b36b39 7921 ovsrcu_postpone(free, subtable->mf_masks);
0fcf0776
ZF
7922 ovsrcu_postpone(free, subtable);
7923}
7924
0de8783a
JR
7925/* Initializes 'cls' as a classifier that initially contains no classification
7926 * rules. */
7927static void
7928dpcls_init(struct dpcls *cls)
7929{
7930 cmap_init(&cls->subtables_map);
da9cfca6 7931 pvector_init(&cls->subtables);
0de8783a
JR
7932}
7933
7934static void
7935dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7936{
3453b4d6 7937 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
da9cfca6 7938 pvector_remove(&cls->subtables, subtable);
0de8783a
JR
7939 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7940 subtable->mask.hash);
0fcf0776 7941 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
0de8783a
JR
7942}
7943
7944/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
7945 * caller's responsibility.
7946 * May only be called after all the readers have been terminated. */
7947static void
7948dpcls_destroy(struct dpcls *cls)
7949{
7950 if (cls) {
7951 struct dpcls_subtable *subtable;
7952
7953 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
361d808d 7954 ovs_assert(cmap_count(&subtable->rules) == 0);
0de8783a
JR
7955 dpcls_destroy_subtable(cls, subtable);
7956 }
7957 cmap_destroy(&cls->subtables_map);
da9cfca6 7958 pvector_destroy(&cls->subtables);
0de8783a
JR
7959 }
7960}
7961
7962static struct dpcls_subtable *
7963dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7964{
7965 struct dpcls_subtable *subtable;
7966
7967 /* Need to add one. */
caeb4906
JR
7968 subtable = xmalloc(sizeof *subtable
7969 - sizeof subtable->mask.mf + mask->len);
0de8783a 7970 cmap_init(&subtable->rules);
3453b4d6 7971 subtable->hit_cnt = 0;
0de8783a 7972 netdev_flow_key_clone(&subtable->mask, mask);
aadede3d 7973
a0b36b39
HH
7974 /* The count of bits in the mask defines the space required for masks.
7975 * Then call gen_masks() to create the appropriate masks, avoiding the cost
7976 * of doing runtime calculations. */
7977 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
7978 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
7979 subtable->mf_bits_set_unit0 = unit0;
7980 subtable->mf_bits_set_unit1 = unit1;
7981 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
7982 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
7983
f54d8f00
HH
7984 /* Probe for a specialized generic lookup function. */
7985 subtable->lookup_func = dpcls_subtable_generic_probe(unit0, unit1);
7986
7987 /* If not set, assign generic lookup. Generic works for any miniflow. */
7988 if (!subtable->lookup_func) {
7989 subtable->lookup_func = dpcls_subtable_lookup_generic;
7990 }
aadede3d 7991
0de8783a 7992 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
3453b4d6 7993 /* Add the new subtable at the end of the pvector (with no hits yet) */
da9cfca6 7994 pvector_insert(&cls->subtables, subtable, 0);
84dbfb2b 7995 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
3453b4d6 7996 cmap_count(&cls->subtables_map), subtable, cls->in_port);
da9cfca6 7997 pvector_publish(&cls->subtables);
0de8783a
JR
7998
7999 return subtable;
8000}
8001
8002static inline struct dpcls_subtable *
8003dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8004{
8005 struct dpcls_subtable *subtable;
8006
8007 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
8008 &cls->subtables_map) {
8009 if (netdev_flow_key_equal(&subtable->mask, mask)) {
8010 return subtable;
8011 }
8012 }
8013 return dpcls_create_subtable(cls, mask);
8014}
8015
3453b4d6
JS
8016
8017/* Periodically sort the dpcls subtable vectors according to hit counts */
8018static void
8019dpcls_sort_subtable_vector(struct dpcls *cls)
8020{
8021 struct pvector *pvec = &cls->subtables;
8022 struct dpcls_subtable *subtable;
8023
8024 PVECTOR_FOR_EACH (subtable, pvec) {
8025 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8026 subtable->hit_cnt = 0;
8027 }
8028 pvector_publish(pvec);
8029}
8030
8031static inline void
4809891b
KT
8032dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8033 struct polled_queue *poll_list, int poll_cnt)
3453b4d6
JS
8034{
8035 struct dpcls *cls;
5bf84282
NK
8036 uint64_t tot_idle = 0, tot_proc = 0;
8037 unsigned int pmd_load = 0;
3453b4d6 8038
b010be17 8039 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
2a2c67b4 8040 uint64_t curr_tsc;
5bf84282
NK
8041 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8042 if (pmd_alb->is_enabled && !pmd->isolated
8043 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8044 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8045 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8046 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8047 {
8048 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8049 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8050 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8051 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8052
8053 if (tot_proc) {
8054 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8055 }
8056
8057 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8058 atomic_count_inc(&pmd->pmd_overloaded);
8059 } else {
8060 atomic_count_set(&pmd->pmd_overloaded, 0);
8061 }
8062 }
8063
8064 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8065 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8066 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8067 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8068
4809891b
KT
8069 /* Get the cycles that were used to process each queue and store. */
8070 for (unsigned i = 0; i < poll_cnt; i++) {
8071 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8072 RXQ_CYCLES_PROC_CURR);
8073 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8074 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8075 0);
8076 }
2a2c67b4
KT
8077 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8078 if (pmd->intrvl_tsc_prev) {
8079 /* There is a prev timestamp, store a new intrvl cycle count. */
8080 atomic_store_relaxed(&pmd->intrvl_cycles,
8081 curr_tsc - pmd->intrvl_tsc_prev);
8082 }
8083 pmd->intrvl_tsc_prev = curr_tsc;
4809891b 8084 /* Start new measuring interval */
b010be17 8085 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
4809891b
KT
8086 }
8087
b010be17 8088 if (pmd->ctx.now > pmd->next_optimization) {
3453b4d6
JS
8089 /* Try to obtain the flow lock to block out revalidator threads.
8090 * If not possible, just try next time. */
8091 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8092 /* Optimize each classifier */
8093 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8094 dpcls_sort_subtable_vector(cls);
8095 }
8096 ovs_mutex_unlock(&pmd->flow_mutex);
8097 /* Start new measuring interval */
b010be17
IM
8098 pmd->next_optimization = pmd->ctx.now
8099 + DPCLS_OPTIMIZATION_INTERVAL;
3453b4d6
JS
8100 }
8101 }
8102}
8103
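/* Editorial note (worked arithmetic under assumed numbers, not upstream
 * code): if a measurement interval accumulated tot_proc = 9,500,000 busy
 * cycles and tot_idle = 500,000 idle cycles, then
 *     pmd_load = (9,500,000 * 100) / (500,000 + 9,500,000) = 95.
 * If that value meets or exceeds ALB_PMD_LOAD_THRESHOLD, pmd->pmd_overloaded
 * is incremented for this interval; any interval below the threshold resets
 * the counter to zero, so only sustained overload accumulates toward the
 * auto-load-balance decision made elsewhere. */
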
0de8783a
JR
8104/* Insert 'rule' into 'cls'. */
8105static void
8106dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8107 const struct netdev_flow_key *mask)
8108{
8109 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8110
3453b4d6 8111 /* Refer to subtable's mask, also for later removal. */
0de8783a
JR
8112 rule->mask = &subtable->mask;
8113 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8114}
8115
8116/* Removes 'rule' from 'cls', also destructing the 'rule'. */
8117static void
8118dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8119{
8120 struct dpcls_subtable *subtable;
8121
8122 ovs_assert(rule->mask);
8123
3453b4d6 8124 /* Get subtable from reference in rule->mask. */
0de8783a 8125 INIT_CONTAINER(subtable, rule->mask, mask);
0de8783a
JR
8126 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8127 == 0) {
3453b4d6 8128 /* Delete empty subtable. */
0de8783a 8129 dpcls_destroy_subtable(cls, subtable);
da9cfca6 8130 pvector_publish(&cls->subtables);
0de8783a
JR
8131 }
8132}
8133
a0b36b39
HH
8134/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
8135static inline void
8136netdev_flow_key_gen_mask_unit(uint64_t iter,
8137 const uint64_t count,
8138 uint64_t *mf_masks)
8139{
8140 int i;
8141 for (i = 0; i < count; i++) {
8142 uint64_t lowest_bit = (iter & -iter);
8143 iter &= ~lowest_bit;
8144 mf_masks[i] = (lowest_bit - 1);
8145 }
8146 /* Checks that count has covered all bits in the iter bitmap. */
8147 ovs_assert(iter == 0);
8148}
8149
8150/* Generate a mask for each block in the miniflow, based on the bits set. This
8151 * allows easily masking packets with the generated array here, without
8152 * calculations. This replaces runtime-calculating the masks.
8153 * @param tbl The table to generate the mf_masks for
8154 * @param mf_masks Pointer to a u64 array of at least (mf_bits_u0 + mf_bits_u1) in size
8155 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
8156 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow
8157 */
8158void
8159netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8160 uint64_t *mf_masks,
8161 const uint32_t mf_bits_u0,
8162 const uint32_t mf_bits_u1)
8163{
8164 uint64_t iter_u0 = tbl->mf.map.bits[0];
8165 uint64_t iter_u1 = tbl->mf.map.bits[1];
8166
8167 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8168 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8169}
8170
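/* Editorial sketch (illustration only, not part of the upstream file; names
 * are hypothetical): a self-contained copy of the per-unit mask generation
 * above, with a worked example.  For iter = 0x28 (bits 3 and 5 set,
 * count = 2) the lowest set bit is peeled off first, giving masks 0x07 and
 * 0x1f; each mask selects exactly the bits below the corresponding set bit. */
#include <assert.h>
#include <stdint.h>

static void
example_gen_mask_unit(uint64_t iter, int count, uint64_t *masks)
{
    for (int i = 0; i < count; i++) {
        uint64_t lowest_bit = iter & -iter;   /* Isolate the lowest set bit. */
        iter &= ~lowest_bit;                  /* Clear it for the next round. */
        masks[i] = lowest_bit - 1;            /* All bits below it. */
    }
    assert(!iter);                            /* 'count' covered every bit. */
}

/* example_gen_mask_unit(0x28, 2, m) leaves m[0] == 0x07 and m[1] == 0x1f. */
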
361d808d
JR
8171/* Returns true if 'target' satisfies 'key' in 'mask', that is, if for each
8172 * 1-bit in 'mask' the values in 'key' and 'target' are the same. */
f5ace7cd 8173bool
0de8783a
JR
8174dpcls_rule_matches_key(const struct dpcls_rule *rule,
8175 const struct netdev_flow_key *target)
8176{
09b0fa9c
JR
8177 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8178 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5fcff47b 8179 uint64_t value;
0de8783a 8180
5fcff47b
JR
8181 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8182 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
0de8783a
JR
8183 return false;
8184 }
8185 }
8186 return true;
8187}
8188
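/* Editorial sketch (illustration only; hypothetical names): the masked
 * comparison above in plain scalar form.  A dpcls rule stores its flow
 * values already ANDed with its mask, so a packet matches when masking each
 * of the packet's blocks reproduces the stored block. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool
example_masked_match(const uint64_t *rule_values, const uint64_t *masks,
                     const uint64_t *pkt_values, size_t n_blocks)
{
    for (size_t i = 0; i < n_blocks; i++) {
        if ((pkt_values[i] & masks[i]) != rule_values[i]) {
            return false;    /* A block the mask cares about differs. */
        }
    }
    return true;             /* Every masked block agrees with the rule. */
}
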
5b1c9c78
FA
8189/* For each miniflow in 'keys', performs a classifier lookup, writing the result
8190 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
0de8783a
JR
8191 * NULL it is skipped.
8192 *
8193 * This function is optimized for use in the userspace datapath and therefore
8194 * does not implement a lot of features available in the standard
8195 * classifier_lookup() function. Specifically, it does not implement
8196 * priorities, instead returning any rule which matches the flow.
8197 *
5b1c9c78 8198 * Returns true if all miniflows found a corresponding rule. */
0de8783a 8199static bool
60d8ccae 8200dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
3453b4d6
JS
8201 struct dpcls_rule **rules, const size_t cnt,
8202 int *num_lookups_p)
0de8783a 8203{
5b1c9c78 8204 /* The received 'cnt' miniflows are the search-keys that will be processed
63906f18
BB
8205 * to find a matching entry into the available subtables.
8206 * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
aadede3d 8207#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
63906f18 8208 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
0de8783a 8209
0de8783a 8210 struct dpcls_subtable *subtable;
aadede3d 8211 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
63906f18
BB
8212
8213 if (cnt != MAP_BITS) {
8214 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
0de8783a
JR
8215 }
8216 memset(rules, 0, cnt * sizeof *rules);
8217
3453b4d6 8218 int lookups_match = 0, subtable_pos = 1;
aadede3d 8219 uint32_t found_map;
3453b4d6 8220
5b1c9c78
FA
8221 /* The Datapath classifier - aka dpcls - is composed of subtables.
8222 * Subtables are dynamically created as needed when new rules are inserted.
8223 * Each subtable collects rules with matches on a specific subset of packet
8224 * fields as defined by the subtable's mask. We proceed to process every
8225 * search-key against each subtable, but when a match is found for a
8226 * search-key, the search for that key can stop because the rules are
8227 * non-overlapping. */
da9cfca6 8228 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
aadede3d
HH
8229 /* Call the subtable specific lookup function. */
8230 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
63906f18 8231
aadede3d
HH
8232 /* Count the number of subtables searched for this packet match. This
8233 * estimates the "spread" of subtables looked at per matched packet. */
8234 uint32_t pkts_matched = count_1bits(found_map);
8235 lookups_match += pkts_matched * subtable_pos;
63906f18 8236
aadede3d
HH
8237 /* Clear the found rules, and return early if all packets are found. */
8238 keys_map &= ~found_map;
63906f18 8239 if (!keys_map) {
3453b4d6
JS
8240 if (num_lookups_p) {
8241 *num_lookups_p = lookups_match;
8242 }
aadede3d 8243 return true;
0de8783a 8244 }
3453b4d6
JS
8245 subtable_pos++;
8246 }
aadede3d 8247
3453b4d6
JS
8248 if (num_lookups_p) {
8249 *num_lookups_p = lookups_match;
0de8783a 8250 }
aadede3d 8251 return false;
0de8783a 8252}
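
/* Editorial sketch (illustration only; the callback and names are
 * hypothetical): the keys_map/found_map bookkeeping of dpcls_lookup() in
 * miniature.  One bit per packet of the burst is set initially; each
 * subtable lookup reports the packets it matched as a bit mask, those bits
 * are cleared, and the scan stops early once every packet has found a rule. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool
example_batch_lookup(size_t n_packets, size_t n_subtables,
                     uint32_t (*lookup)(size_t subtable, uint32_t remaining))
{
    uint32_t remaining = n_packets < 32
                         ? (UINT32_C(1) << n_packets) - 1
                         : UINT32_MAX;          /* One bit per packet. */

    for (size_t i = 0; i < n_subtables; i++) {
        uint32_t found = lookup(i, remaining);  /* Bit set per matched pkt. */
        remaining &= ~found;                    /* Forget matched packets. */
        if (!remaining) {
            return true;                        /* All packets matched. */
        }
    }
    return false;            /* At least one packet missed every subtable. */
}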