/* lib/dpif-netdev.c (git blame view; commit "dpif-netdev: Add reconfiguration request to dp_netdev.") */
/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "dpif-netdev.h"

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>

#include "bitmap.h"
#include "cmap.h"
#include "coverage.h"
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
#include "flow.h"
#include "hmapx.h"
#include "latch.h"
#include "netdev.h"
#include "netdev-dpdk.h"
#include "netdev-vport.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/ofpbuf.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "poll-loop.h"
#include "pvector.h"
#include "random.h"
#include "seq.h"
#include "openvswitch/shash.h"
#include "sset.h"
#include "timeval.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "unixctl.h"
#include "util.h"

VLOG_DEFINE_THIS_MODULE(dpif_netdev);

#define FLOW_DUMP_MAX_BATCH 50
/* Use per thread recirc_depth to prevent recirculation loop. */
#define MAX_RECIRC_DEPTH 5
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)

/* Configuration parameters. */
enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */

/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dp_netdev's. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
    = SHASH_INITIALIZER(&dp_netdevs);

static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);

static struct odp_support dp_netdev_support = {
    .max_mpls_depth = SIZE_MAX,
    .recirc = true,
};

/* Stores a miniflow with inline values */

struct netdev_flow_key {
    uint32_t hash;       /* Hash function differs for different users. */
    uint32_t len;        /* Length of the following miniflow (incl. map). */
    struct miniflow mf;
    uint64_t buf[FLOW_MAX_PACKET_U64S];
};

/* Exact match cache for frequently used flows
 *
 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 * search its entries for a miniflow that matches exactly the miniflow of the
 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
 *
 * A cache entry holds a reference to its 'dp_netdev_flow'.
 *
 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 * value is the index of a cache entry where the miniflow could be.
 *
 *
 * Thread-safety
 * =============
 *
 * Each pmd_thread has its own private exact match cache.
 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 */

#define EM_FLOW_HASH_SHIFT 13
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2

struct emc_entry {
    struct dp_netdev_flow *flow;
    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
};

struct emc_cache {
    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
    int sweep_idx;                /* For emc_cache_slow_sweep(). */
};

/* Iterate in the exact match cache through every entry that might contain a
 * miniflow with hash 'HASH'. */
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
         i__ < EM_FLOW_HASH_SEGS;                                            \
         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
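
/* Illustrative sketch, not part of the upstream file: how the 32-bit hash
 * described above is split into EM_FLOW_HASH_SEGS candidate slot indexes.
 * This mirrors what EMC_FOR_EACH_POS_WITH_HASH() iterates over; the helper
 * name 'emc_example_positions' is hypothetical. */
static inline void
emc_example_positions(uint32_t hash, uint32_t pos[EM_FLOW_HASH_SEGS])
{
    for (int seg = 0; seg < EM_FLOW_HASH_SEGS; seg++) {
        pos[seg] = hash & EM_FLOW_HASH_MASK;   /* Index into 'entries'. */
        hash >>= EM_FLOW_HASH_SHIFT;           /* Next EM_FLOW_HASH_SHIFT bits. */
    }
}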

/* Simple non-wildcarding single-priority classifier. */

struct dpcls {
    struct cmap subtables_map;
    struct pvector subtables;
};

/* A rule to be inserted to the classifier. */
struct dpcls_rule {
    struct cmap_node cmap_node;   /* Within struct dpcls_subtable 'rules'. */
    struct netdev_flow_key *mask; /* Subtable's mask. */
    struct netdev_flow_key flow;  /* Matching key. */
    /* 'flow' must be the last field, additional space is allocated here. */
};

static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                         const struct netdev_flow_key *mask);
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
static bool dpcls_lookup(const struct dpcls *cls,
                         const struct netdev_flow_key keys[],
                         struct dpcls_rule **rules, size_t cnt);

/* Datapath based on the network device interface from netdev.h.
 *
 *
 * Thread-safety
 * =============
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 *
 * Acquisition order is, from outermost to innermost:
 *
 *    dp_netdev_mutex (global)
 *    port_mutex
 *    non_pmd_mutex
 */
struct dp_netdev {
    const struct dpif_class *const class;
    const char *const name;
    struct dpif *dpif;
    struct ovs_refcount ref_cnt;
    atomic_flag destroyed;

    /* Ports.
     *
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
     * through 'ports' requires taking 'port_mutex'. */
    struct ovs_mutex port_mutex;
    struct hmap ports;
    struct seq *port_seq;       /* Incremented whenever a port changes. */

    /* Protects access to ofproto-dpif-upcall interface during revalidator
     * thread synchronization. */
    struct fat_rwlock upcall_rwlock;
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
    void *upcall_aux;

    /* Callback function for notifying the purging of dp flows (during
     * pmd reset or deletion). */
    dp_purge_callback *dp_purge_cb;
    void *dp_purge_aux;

    /* Stores all 'struct dp_netdev_pmd_thread's. */
    struct cmap poll_threads;

    /* Protects the access of the 'struct dp_netdev_pmd_thread'
     * instance for non-pmd thread. */
    struct ovs_mutex non_pmd_mutex;

    /* Each pmd thread will store its pointer to
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
    ovsthread_key_t per_pmd_key;

    struct seq *reconfigure_seq;
    uint64_t last_reconfigure_seq;

    /* Cpu mask for pin of pmd threads. */
    char *pmd_cmask;

    uint64_t last_tnl_conf_seq;
};
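
/* Illustrative sketch, not part of the upstream file: the lock acquisition
 * order documented above, taken from outermost to innermost.  The helper
 * name 'dp_netdev_lock_order_example' is hypothetical. */
static inline void
dp_netdev_lock_order_example(struct dp_netdev *dp)
{
    ovs_mutex_lock(&dp_netdev_mutex);    /* Global, outermost. */
    ovs_mutex_lock(&dp->port_mutex);     /* Per-datapath port state. */
    ovs_mutex_lock(&dp->non_pmd_mutex);  /* Innermost. */
    /* ... access 'dp->ports' and the non-pmd thread state here ... */
    ovs_mutex_unlock(&dp->non_pmd_mutex);
    ovs_mutex_unlock(&dp->port_mutex);
    ovs_mutex_unlock(&dp_netdev_mutex);
}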

static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                    odp_port_t)
    OVS_REQUIRES(dp->port_mutex);

enum dp_stat_type {
    DP_STAT_EXACT_HIT,          /* Packets that had an exact match (emc). */
    DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
    DP_STAT_MISS,               /* Packets that did not match. */
    DP_STAT_LOST,               /* Packets not passed up to the client. */
    DP_N_STATS
};

enum pmd_cycles_counter_type {
    PMD_CYCLES_POLLING,         /* Cycles spent polling NICs. */
    PMD_CYCLES_PROCESSING,      /* Cycles spent processing packets. */
    PMD_N_CYCLES
};

#define XPS_TIMEOUT_MS 500LL

/* A port in a netdev-based datapath. */
struct dp_netdev_port {
    odp_port_t port_no;
    struct netdev *netdev;
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
    struct netdev_saved_flags *sf;
    unsigned n_rxq;             /* Number of elements in 'rxq'. */
    struct netdev_rxq **rxq;
    bool dynamic_txqs;          /* If true XPS will be used. */
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
    struct ovs_mutex txq_used_mutex;
    char *type;                 /* Port type as requested by user. */
};

/* Contained by struct dp_netdev_flow's 'stats' member.  */
struct dp_netdev_flow_stats {
    atomic_llong used;             /* Last used time, in monotonic msecs. */
    atomic_ullong packet_count;    /* Number of packets matched. */
    atomic_ullong byte_count;      /* Number of bytes matched. */
    atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
};

/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 *
 *
 * Thread-safety
 * =============
 *
 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 *
 * Motivation
 * ----------
 *
 * The thread safety rules described here for "struct dp_netdev_flow" are
 * motivated by two goals:
 *
 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 *      reading bad data due to changes by some thread concurrently modifying
 *      those members.
 *
 *    - Prevent two threads making changes to members of a given "struct
 *      dp_netdev_flow" from interfering with each other.
 *
 *
 * Rules
 * -----
 *
 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 * grace period.  Code that needs to hold onto a flow for a while
 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 *
 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 * from modification.
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 */
struct dp_netdev_flow {
    const struct flow flow;      /* Unmasked flow that created this entry. */
    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
                                 /* 'flow_table'. */
    const ovs_u128 ufid;         /* Unique flow identifier. */
    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
                                 /* flow. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    bool dead;

    /* Statistics. */
    struct dp_netdev_flow_stats stats;

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;

    /* While processing a group of input packets, the datapath uses the next
     * member to store a pointer to the output batch for the flow.  It is
     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
    struct packet_batch_per_flow *batch;

    /* Packet classification. */
    struct dpcls_rule cr;        /* In owning dp_netdev's 'cls'. */
    /* 'cr' must be the last member. */
};

static void dp_netdev_flow_unref(struct dp_netdev_flow *);
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
                                         struct flow *);
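
/* Illustrative sketch, not part of the upstream file: the reference pattern
 * that the thread-safety comment above describes.  Inside an RCU read
 * section a flow may be dereferenced freely; to keep it past the grace
 * period the caller takes its own reference and later releases it with
 * dp_netdev_flow_unref().  'flow_hold_example' is a hypothetical name. */
static inline struct dp_netdev_flow *
flow_hold_example(struct dp_netdev_flow *flow)
{
    if (flow && !flow->dead && dp_netdev_flow_ref(flow)) {
        return flow;    /* Caller owns a reference now. */
    }
    return NULL;        /* Flow is going away; do not hold it. */
}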

/* A set of datapath actions within a "struct dp_netdev_flow".
 *
 *
 * Thread-safety
 * =============
 *
 * A struct dp_netdev_actions 'actions' is protected with RCU. */
struct dp_netdev_actions {
    /* These members are immutable: they do not change during the struct's
     * lifetime.  */
    unsigned int size;          /* Size of 'actions', in bytes. */
    struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
};

struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
                                                   size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
    const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);

/* Contained by struct dp_netdev_pmd_thread's 'stats' member.  */
struct dp_netdev_pmd_stats {
    /* Indexed by DP_STAT_*. */
    atomic_ullong n[DP_N_STATS];
};

/* Contained by struct dp_netdev_pmd_thread's 'cycle' member.  */
struct dp_netdev_pmd_cycles {
    /* Indexed by PMD_CYCLES_*. */
    atomic_ullong n[PMD_N_CYCLES];
};

/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
struct rxq_poll {
    struct dp_netdev_port *port;
    struct netdev_rxq *rx;
    struct ovs_list node;
};

/* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
struct tx_port {
    struct dp_netdev_port *port;
    int qid;
    long long last_used;
    struct hmap_node node;
};

/* PMD: Poll mode drivers.  PMD accesses devices via polling to eliminate
 * the performance overhead of interrupt processing.  Therefore netdev can
 * not implement rx-wait for these devices.  dpif-netdev needs to poll
 * these devices to check for recv buffer.  pmd-thread does polling for
 * devices assigned to itself.
 *
 * DPDK uses PMD for accessing NIC.
 *
 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
 * I/O of all non-pmd threads.  There will be no actual thread created
 * for the instance.
 *
 * Each struct has its own flow table and classifier.  Packets received
 * from managed ports are looked up in the corresponding pmd thread's
 * flow table, and are executed with the found actions.
 */
struct dp_netdev_pmd_thread {
    struct dp_netdev *dp;
    struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
    struct cmap_node node;          /* In 'dp->poll_threads'. */

    pthread_cond_t cond;            /* For synchronizing pmd thread reload. */
    struct ovs_mutex cond_mutex;    /* Mutex for condition variable. */

    /* Per thread exact-match cache.  Note, the instance for cpu core
     * NON_PMD_CORE_ID can be accessed by multiple threads, and thus needs
     * to be protected by 'non_pmd_mutex'.  Every other instance
     * will only be accessed by its own pmd thread. */
    struct emc_cache flow_cache;

    /* Classifier and Flow-Table.
     *
     * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
     * changes to 'cls' must be made while still holding the 'flow_mutex'.
     */
    struct ovs_mutex flow_mutex;
    struct dpcls cls;
    struct cmap flow_table OVS_GUARDED; /* Flow table. */

    /* Statistics. */
    struct dp_netdev_pmd_stats stats;

    /* Cycles counters. */
    struct dp_netdev_pmd_cycles cycles;

    /* Used to count cycles.  See 'cycles_counter_end()'. */
    unsigned long long last_cycles;

    struct latch exit_latch;        /* For terminating the pmd thread. */
    atomic_uint change_seq;         /* For reloading pmd ports. */
    pthread_t thread;
    unsigned core_id;               /* CPU core id of this pmd thread. */
    int numa_id;                    /* numa node id of this pmd thread. */

    /* Queue id used by this pmd thread to send packets on all netdevs if
     * XPS disabled for this netdev.  All static_tx_qid's are unique and less
     * than 'ovs_numa_get_n_cores() + 1'. */
    atomic_int static_tx_qid;

    struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
    /* List of rx queues to poll. */
    struct ovs_list poll_list OVS_GUARDED;
    /* Number of elements in 'poll_list'. */
    int poll_cnt;
    /* Map of 'tx_port's used for transmission.  Written by the main thread,
     * read by the pmd thread. */
    struct hmap tx_ports OVS_GUARDED;

    /* Map of 'tx_port' used in the fast path.  This is a thread-local copy of
     * 'tx_ports'.  The instance for cpu core NON_PMD_CORE_ID can be accessed
     * by multiple threads, and thus needs to be protected by 'non_pmd_mutex'.
     * Every other instance will only be accessed by its own pmd thread. */
    struct hmap port_cache;

    /* Only a pmd thread can write on its own 'cycles' and 'stats'.
     * The main thread keeps 'stats_zero' and 'cycles_zero' as base
     * values and subtracts them from 'stats' and 'cycles' before
     * reporting to the user. */
    unsigned long long stats_zero[DP_N_STATS];
    uint64_t cycles_zero[PMD_N_CYCLES];
};

#define PMD_INITIAL_SEQ 1

/* Interface to netdev-based datapath. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;
    uint64_t last_port_seq;
};

static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                              struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
                            struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_free(struct dp_netdev *)
    OVS_REQUIRES(dp_netdev_mutex);
static int do_add_port(struct dp_netdev *dp, const char *devname,
                       const char *type, odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
    OVS_REQUIRES(dp->port_mutex);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                      struct dp_packet_batch *,
                                      bool may_steal,
                                      const struct nlattr *actions,
                                      size_t actions_len,
                                      long long now);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                  struct dp_packet_batch *);

static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, unsigned core_id,
                                    int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);

static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                      unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
    OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
                                             struct dp_netdev_port *port);
static void dp_netdev_add_port_to_pmds(struct dp_netdev *dp,
                                       struct dp_netdev_port *port);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct dp_netdev_port *port);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                     struct dp_netdev_port *port,
                                     struct netdev_rxq *rx);
static struct dp_netdev_pmd_thread *
dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex);

static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               long long now, bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                      struct tx_port *tx, long long now);

static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);

static void
emc_cache_init(struct emc_cache *flow_cache)
{
    int i;

    flow_cache->sweep_idx = 0;
    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        flow_cache->entries[i].flow = NULL;
        flow_cache->entries[i].key.hash = 0;
        flow_cache->entries[i].key.len = sizeof(struct miniflow);
        flowmap_init(&flow_cache->entries[i].key.mf.map);
    }
}

static void
emc_cache_uninit(struct emc_cache *flow_cache)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        emc_clear_entry(&flow_cache->entries[i]);
    }
}

/* Check and clear dead flow references slowly (one entry at each
 * invocation).  */
static void
emc_cache_slow_sweep(struct emc_cache *flow_cache)
{
    struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];

    if (!emc_entry_alive(entry)) {
        emc_clear_entry(entry);
    }
    flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
}
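
/* Illustrative sketch, not part of the upstream file: the slow sweep above is
 * meant to be driven from the pmd polling loop every so many iterations, so
 * stale EMC references are reclaimed with negligible per-iteration cost.  The
 * interval constant and helper name are hypothetical. */
#define EMC_SWEEP_INTERVAL_EXAMPLE 1024

static inline void
emc_maybe_sweep_example(struct emc_cache *flow_cache, unsigned long long lc)
{
    if (!(lc % EMC_SWEEP_INTERVAL_EXAMPLE)) {
        emc_cache_slow_sweep(flow_cache);
    }
}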

/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
bool
dpif_is_netdev(const struct dpif *dpif)
{
    return dpif->dpif_class->open == dpif_netdev_open;
}

static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
{
    ovs_assert(dpif_is_netdev(dpif));
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
}

static struct dp_netdev *
get_dp_netdev(const struct dpif *dpif)
{
    return dpif_netdev_cast(dpif)->dp;
}
619\f
620enum pmd_info_type {
ce179f11
IM
621 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
622 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
623 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
6553d06b
DDP
624};
625
626static void
627pmd_info_show_stats(struct ds *reply,
628 struct dp_netdev_pmd_thread *pmd,
629 unsigned long long stats[DP_N_STATS],
630 uint64_t cycles[PMD_N_CYCLES])
631{
632 unsigned long long total_packets = 0;
633 uint64_t total_cycles = 0;
634 int i;
635
 636 /* These loops subtract reference values ('*_zero') from the counters.
637 * Since loads and stores are relaxed, it might be possible for a '*_zero'
638 * value to be more recent than the current value we're reading from the
639 * counter. This is not a big problem, since these numbers are not
640 * supposed to be too accurate, but we should at least make sure that
641 * the result is not negative. */
642 for (i = 0; i < DP_N_STATS; i++) {
643 if (stats[i] > pmd->stats_zero[i]) {
644 stats[i] -= pmd->stats_zero[i];
645 } else {
646 stats[i] = 0;
647 }
648
649 if (i != DP_STAT_LOST) {
650 /* Lost packets are already included in DP_STAT_MISS */
651 total_packets += stats[i];
652 }
653 }
654
655 for (i = 0; i < PMD_N_CYCLES; i++) {
656 if (cycles[i] > pmd->cycles_zero[i]) {
657 cycles[i] -= pmd->cycles_zero[i];
658 } else {
659 cycles[i] = 0;
660 }
661
662 total_cycles += cycles[i];
663 }
664
665 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
666 ? "main thread" : "pmd thread");
667
668 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
669 ds_put_format(reply, " numa_id %d", pmd->numa_id);
670 }
d5c199ea 671 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
bd5131ba 672 ds_put_format(reply, " core_id %u", pmd->core_id);
6553d06b
DDP
673 }
674 ds_put_cstr(reply, ":\n");
675
676 ds_put_format(reply,
677 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
678 "\tmiss:%llu\n\tlost:%llu\n",
679 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
680 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
681
682 if (total_cycles == 0) {
683 return;
684 }
685
686 ds_put_format(reply,
687 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
688 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
689 cycles[PMD_CYCLES_POLLING],
690 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
691 cycles[PMD_CYCLES_PROCESSING],
692 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
693
694 if (total_packets == 0) {
695 return;
696 }
697
698 ds_put_format(reply,
699 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
700 total_cycles / (double)total_packets,
701 total_cycles, total_packets);
702
703 ds_put_format(reply,
704 "\tavg processing cycles per packet: "
705 "%.02f (%"PRIu64"/%llu)\n",
706 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
707 cycles[PMD_CYCLES_PROCESSING], total_packets);
708}
709
710static void
711pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
712 struct dp_netdev_pmd_thread *pmd,
713 unsigned long long stats[DP_N_STATS],
714 uint64_t cycles[PMD_N_CYCLES])
715{
716 int i;
717
718 /* We cannot write 'stats' and 'cycles' (because they're written by other
719 * threads) and we shouldn't change 'stats' (because they're used to count
720 * datapath stats, which must not be cleared here). Instead, we save the
721 * current values and subtract them from the values to be displayed in the
722 * future */
723 for (i = 0; i < DP_N_STATS; i++) {
724 pmd->stats_zero[i] = stats[i];
725 }
726 for (i = 0; i < PMD_N_CYCLES; i++) {
727 pmd->cycles_zero[i] = cycles[i];
728 }
729}
730
ce179f11
IM
731static void
732pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
733{
734 if (pmd->core_id != NON_PMD_CORE_ID) {
735 struct rxq_poll *poll;
736 const char *prev_name = NULL;
737
738 ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n",
739 pmd->numa_id, pmd->core_id);
740
d0cca6c3 741 ovs_mutex_lock(&pmd->port_mutex);
ce179f11
IM
742 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
743 const char *name = netdev_get_name(poll->port->netdev);
744
745 if (!prev_name || strcmp(name, prev_name)) {
746 if (prev_name) {
747 ds_put_cstr(reply, "\n");
748 }
749 ds_put_format(reply, "\tport: %s\tqueue-id:",
750 netdev_get_name(poll->port->netdev));
751 }
752 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
753 prev_name = name;
754 }
d0cca6c3 755 ovs_mutex_unlock(&pmd->port_mutex);
ce179f11
IM
756 ds_put_cstr(reply, "\n");
757 }
758}
759
6553d06b
DDP
760static void
761dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
762 void *aux)
763{
764 struct ds reply = DS_EMPTY_INITIALIZER;
765 struct dp_netdev_pmd_thread *pmd;
766 struct dp_netdev *dp = NULL;
767 enum pmd_info_type type = *(enum pmd_info_type *) aux;
768
769 ovs_mutex_lock(&dp_netdev_mutex);
770
771 if (argc == 2) {
772 dp = shash_find_data(&dp_netdevs, argv[1]);
773 } else if (shash_count(&dp_netdevs) == 1) {
774 /* There's only one datapath */
775 dp = shash_first(&dp_netdevs)->data;
776 }
777
778 if (!dp) {
779 ovs_mutex_unlock(&dp_netdev_mutex);
780 unixctl_command_reply_error(conn,
781 "please specify an existing datapath");
782 return;
783 }
784
785 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
ce179f11
IM
786 if (type == PMD_INFO_SHOW_RXQ) {
787 pmd_info_show_rxq(&reply, pmd);
788 } else {
789 unsigned long long stats[DP_N_STATS];
790 uint64_t cycles[PMD_N_CYCLES];
791 int i;
6553d06b 792
ce179f11
IM
793 /* Read current stats and cycle counters */
794 for (i = 0; i < ARRAY_SIZE(stats); i++) {
795 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
796 }
797 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
798 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
799 }
6553d06b 800
ce179f11
IM
801 if (type == PMD_INFO_CLEAR_STATS) {
802 pmd_info_clear_stats(&reply, pmd, stats, cycles);
803 } else if (type == PMD_INFO_SHOW_STATS) {
804 pmd_info_show_stats(&reply, pmd, stats, cycles);
805 }
6553d06b
DDP
806 }
807 }
808
809 ovs_mutex_unlock(&dp_netdev_mutex);
810
811 unixctl_command_reply(conn, ds_cstr(&reply));
812 ds_destroy(&reply);
813}
814\f
815static int
816dpif_netdev_init(void)
817{
818 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
ce179f11
IM
819 clear_aux = PMD_INFO_CLEAR_STATS,
820 poll_aux = PMD_INFO_SHOW_RXQ;
6553d06b
DDP
821
822 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
823 0, 1, dpif_netdev_pmd_info,
824 (void *)&show_aux);
825 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
826 0, 1, dpif_netdev_pmd_info,
827 (void *)&clear_aux);
ce179f11
IM
828 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
829 0, 1, dpif_netdev_pmd_info,
830 (void *)&poll_aux);
6553d06b
DDP
831 return 0;
832}
72865317 833
2197d7ab 834static int
2240af25
DDP
835dpif_netdev_enumerate(struct sset *all_dps,
836 const struct dpif_class *dpif_class)
2197d7ab
GL
837{
838 struct shash_node *node;
839
97be1538 840 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 841 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
842 struct dp_netdev *dp = node->data;
843 if (dpif_class != dp->class) {
844 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
845 * If the class doesn't match, skip this dpif. */
846 continue;
847 }
2197d7ab
GL
848 sset_add(all_dps, node->name);
849 }
97be1538 850 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 851
2197d7ab
GL
852 return 0;
853}
854
add90f6f
EJ
855static bool
856dpif_netdev_class_is_dummy(const struct dpif_class *class)
857{
858 return class != &dpif_netdev_class;
859}
860
0aeaabc8
JP
861static const char *
862dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
863{
864 return strcmp(type, "internal") ? type
add90f6f 865 : dpif_netdev_class_is_dummy(class) ? "dummy"
0aeaabc8
JP
866 : "tap";
867}
868
72865317
BP
869static struct dpif *
870create_dpif_netdev(struct dp_netdev *dp)
871{
462278db 872 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 873 struct dpif_netdev *dpif;
72865317 874
6a8267c5 875 ovs_refcount_ref(&dp->ref_cnt);
72865317 876
72865317 877 dpif = xmalloc(sizeof *dpif);
614c4892 878 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 879 dpif->dp = dp;
d33ed218 880 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
881
882 return &dpif->dpif;
883}
884
4e022ec0
AW
885/* Choose an unused, non-zero port number and return it on success.
886 * Return ODPP_NONE on failure. */
887static odp_port_t
e44768b7 888choose_port(struct dp_netdev *dp, const char *name)
59e6d833 889 OVS_REQUIRES(dp->port_mutex)
e44768b7 890{
4e022ec0 891 uint32_t port_no;
e44768b7
JP
892
893 if (dp->class != &dpif_netdev_class) {
894 const char *p;
895 int start_no = 0;
896
897 /* If the port name begins with "br", start the number search at
898 * 100 to make writing tests easier. */
899 if (!strncmp(name, "br", 2)) {
900 start_no = 100;
901 }
902
903 /* If the port name contains a number, try to assign that port number.
904 * This can make writing unit tests easier because port numbers are
905 * predictable. */
906 for (p = name; *p != '\0'; p++) {
907 if (isdigit((unsigned char) *p)) {
908 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
909 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
910 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 911 return u32_to_odp(port_no);
e44768b7
JP
912 }
913 break;
914 }
915 }
916 }
917
ff073a71
BP
918 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
919 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 920 return u32_to_odp(port_no);
e44768b7
JP
921 }
922 }
923
4e022ec0 924 return ODPP_NONE;
e44768b7
JP
925}
926
72865317 927static int
614c4892
BP
928create_dp_netdev(const char *name, const struct dpif_class *class,
929 struct dp_netdev **dpp)
8a4e3a85 930 OVS_REQUIRES(dp_netdev_mutex)
72865317
BP
931{
932 struct dp_netdev *dp;
933 int error;
72865317 934
462278db 935 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
936 shash_add(&dp_netdevs, name, dp);
937
938 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
939 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 940 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 941 atomic_flag_clear(&dp->destroyed);
8a4e3a85 942
59e6d833 943 ovs_mutex_init(&dp->port_mutex);
e9985d6a 944 hmap_init(&dp->ports);
d33ed218 945 dp->port_seq = seq_create();
6b31e073
RW
946 fat_rwlock_init(&dp->upcall_rwlock);
947
a6a426d6
IM
948 dp->reconfigure_seq = seq_create();
949 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
950
6b31e073
RW
951 /* Disable upcalls by default. */
952 dp_netdev_disable_upcall(dp);
623540e4 953 dp->upcall_aux = NULL;
6b31e073 954 dp->upcall_cb = NULL;
e44768b7 955
65f13b50
AW
956 cmap_init(&dp->poll_threads);
957 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
958 ovsthread_key_create(&dp->per_pmd_key, NULL);
959
e9985d6a 960 ovs_mutex_lock(&dp->port_mutex);
f2eee189 961 dp_netdev_set_nonpmd(dp);
65f13b50 962
4e022ec0 963 error = do_add_port(dp, name, "internal", ODPP_LOCAL);
59e6d833 964 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
965 if (error) {
966 dp_netdev_free(dp);
462278db 967 return error;
72865317
BP
968 }
969
a36de779 970 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 971 *dpp = dp;
72865317
BP
972 return 0;
973}
974
a6a426d6
IM
975static void
976dp_netdev_request_reconfigure(struct dp_netdev *dp)
977{
978 seq_change(dp->reconfigure_seq);
979}
980
981static bool
982dp_netdev_is_reconf_required(struct dp_netdev *dp)
983{
984 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
985}
986
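
/* Illustrative sketch, not part of the upstream file: how the two helpers
 * above pair up.  A caller that changed datapath configuration requests a
 * reconfiguration by bumping 'reconfigure_seq'; the reconfiguration path
 * checks whether anything changed since it last ran and then records the
 * sequence number it acted on.  'dp_netdev_ack_reconf_example' is a
 * hypothetical name. */
static inline void
dp_netdev_ack_reconf_example(struct dp_netdev *dp)
{
    if (dp_netdev_is_reconf_required(dp)) {
        /* ... reconfigure ports and pmd threads here ... */
        dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
    }
}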
72865317 987static int
614c4892 988dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 989 bool create, struct dpif **dpifp)
72865317 990{
462278db 991 struct dp_netdev *dp;
5279f8fd 992 int error;
462278db 993
97be1538 994 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
995 dp = shash_find_data(&dp_netdevs, name);
996 if (!dp) {
5279f8fd 997 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 998 } else {
5279f8fd
BP
999 error = (dp->class != class ? EINVAL
1000 : create ? EEXIST
1001 : 0);
1002 }
1003 if (!error) {
1004 *dpifp = create_dpif_netdev(dp);
6b31e073 1005 dp->dpif = *dpifp;
72865317 1006 }
97be1538 1007 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 1008
5279f8fd 1009 return error;
72865317
BP
1010}
1011
88ace79b
DDP
1012static void
1013dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1014 OVS_NO_THREAD_SAFETY_ANALYSIS
1015{
1016 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1017 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1018
1019 /* Before freeing a lock we should release it */
1020 fat_rwlock_unlock(&dp->upcall_rwlock);
1021 fat_rwlock_destroy(&dp->upcall_rwlock);
1022}
1023
8a4e3a85
BP
1024/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1025 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
1026static void
1027dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 1028 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 1029{
e9985d6a 1030 struct dp_netdev_port *port, *next;
4ad28026 1031
8a4e3a85
BP
1032 shash_find_and_delete(&dp_netdevs, dp->name);
1033
65f13b50
AW
1034 dp_netdev_destroy_all_pmds(dp);
1035 ovs_mutex_destroy(&dp->non_pmd_mutex);
1036 ovsthread_key_delete(dp->per_pmd_key);
6c3eee82 1037
59e6d833 1038 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 1039 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
c40b890f 1040 do_del_port(dp, port);
1ba530f4 1041 }
59e6d833 1042 ovs_mutex_unlock(&dp->port_mutex);
d916785c 1043 cmap_destroy(&dp->poll_threads);
51852a57 1044
a6a426d6
IM
1045 seq_destroy(dp->reconfigure_seq);
1046
d33ed218 1047 seq_destroy(dp->port_seq);
e9985d6a 1048 hmap_destroy(&dp->ports);
3186ea46 1049 ovs_mutex_destroy(&dp->port_mutex);
88ace79b
DDP
1050
1051 /* Upcalls must be disabled at this point */
1052 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 1053
f2eee189 1054 free(dp->pmd_cmask);
8a4e3a85 1055 free(CONST_CAST(char *, dp->name));
72865317
BP
1056 free(dp);
1057}
1058
8a4e3a85
BP
1059static void
1060dp_netdev_unref(struct dp_netdev *dp)
1061{
1062 if (dp) {
1063 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1064 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1065 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 1066 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
1067 dp_netdev_free(dp);
1068 }
1069 ovs_mutex_unlock(&dp_netdev_mutex);
1070 }
1071}
1072
72865317
BP
1073static void
1074dpif_netdev_close(struct dpif *dpif)
1075{
1076 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1077
8a4e3a85 1078 dp_netdev_unref(dp);
72865317
BP
1079 free(dpif);
1080}
1081
1082static int
7dab847a 1083dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
1084{
1085 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1086
6a8267c5 1087 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 1088 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
1089 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1090 OVS_NOT_REACHED();
1091 }
1092 }
5279f8fd 1093
72865317
BP
1094 return 0;
1095}
1096
eb94da30
DDP
1097/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1098 * load/store semantics. While the increment is not atomic, the load and
1099 * store operations are, making it impossible to read inconsistent values.
1100 *
1101 * This is used to update thread local stats counters. */
1102static void
1103non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1104{
1105 unsigned long long tmp;
1106
1107 atomic_read_relaxed(var, &tmp);
1108 tmp += n;
1109 atomic_store_relaxed(var, tmp);
1110}
1111
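
/* Illustrative sketch, not part of the upstream file: a pmd thread is the
 * only writer of its own counters, so it can use the non-atomic add above,
 * while other threads (e.g. dpif_netdev_get_stats() below) read the same
 * counters with atomic_read_relaxed().  'pmd_count_hits_example' is a
 * hypothetical name. */
static inline void
pmd_count_hits_example(struct dp_netdev_pmd_thread *pmd, unsigned long long cnt)
{
    non_atomic_ullong_add(&pmd->stats.n[DP_STAT_MASKED_HIT], cnt);
}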
72865317 1112static int
a8d9304d 1113dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
1114{
1115 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 1116 struct dp_netdev_pmd_thread *pmd;
8a4e3a85 1117
1c1e46ed
AW
1118 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1119 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
eb94da30 1120 unsigned long long n;
1c1e46ed 1121 stats->n_flows += cmap_count(&pmd->flow_table);
eb94da30 1122
abcf3ef4
DDP
1123 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1124 stats->n_hit += n;
1125 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
eb94da30
DDP
1126 stats->n_hit += n;
1127 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1128 stats->n_missed += n;
1129 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1130 stats->n_lost += n;
51852a57 1131 }
1ce3fa06 1132 stats->n_masks = UINT32_MAX;
847108dc 1133 stats->n_mask_hit = UINT64_MAX;
5279f8fd 1134
72865317
BP
1135 return 0;
1136}
1137
e4cfed38 1138static void
65f13b50 1139dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1140{
65f13b50
AW
1141 int old_seq;
1142
accf8626 1143 if (pmd->core_id == NON_PMD_CORE_ID) {
d0cca6c3
DDP
1144 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1145 ovs_mutex_lock(&pmd->port_mutex);
1146 pmd_load_cached_ports(pmd);
1147 ovs_mutex_unlock(&pmd->port_mutex);
1148 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
accf8626
AW
1149 return;
1150 }
1151
1152 ovs_mutex_lock(&pmd->cond_mutex);
65f13b50 1153 atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
accf8626
AW
1154 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1155 ovs_mutex_unlock(&pmd->cond_mutex);
65f13b50 1156}
e4cfed38 1157
59e6d833
BP
1158static uint32_t
1159hash_port_no(odp_port_t port_no)
1160{
1161 return hash_int(odp_to_u32(port_no), 0);
1162}
1163
72865317 1164static int
b8d29252
DDP
1165port_create(const char *devname, const char *open_type, const char *type,
1166 odp_port_t port_no, struct dp_netdev_port **portp)
72865317 1167{
4b609110 1168 struct netdev_saved_flags *sf;
72865317 1169 struct dp_netdev_port *port;
2499a8ce 1170 enum netdev_flags flags;
b8d29252
DDP
1171 struct netdev *netdev;
1172 int n_open_rxqs = 0;
324c8374 1173 int n_cores = 0;
b8d29252 1174 int i, error;
324c8374 1175 bool dynamic_txqs = false;
72865317 1176
b8d29252 1177 *portp = NULL;
72865317
BP
1178
1179 /* Open and validate network device. */
0cbfe35d 1180 error = netdev_open(devname, open_type, &netdev);
72865317 1181 if (error) {
b8d29252 1182 return error;
72865317 1183 }
72865317
BP
1184 /* XXX reject non-Ethernet devices */
1185
2499a8ce
AC
1186 netdev_get_flags(netdev, &flags);
1187 if (flags & NETDEV_LOOPBACK) {
1188 VLOG_ERR("%s: cannot add a loopback device", devname);
d17f4f08 1189 error = EINVAL;
b8d29252 1190 goto out;
2499a8ce
AC
1191 }
1192
5a034064 1193 if (netdev_is_pmd(netdev)) {
324c8374 1194 n_cores = ovs_numa_get_n_cores();
5a034064
AW
1195
1196 if (n_cores == OVS_CORE_UNSPEC) {
1197 VLOG_ERR("%s, cannot get cpu core info", devname);
d17f4f08 1198 error = ENOENT;
b8d29252 1199 goto out;
5a034064
AW
1200 }
1201 /* There can only be ovs_numa_get_n_cores() pmd threads,
3bcc10c0
DDP
 1202 * so create a txq for each, and one extra for the non
1203 * pmd threads. */
050c60bf 1204 error = netdev_set_tx_multiq(netdev, n_cores + 1);
7251515e 1205 if (error && (error != EOPNOTSUPP)) {
5a034064 1206 VLOG_ERR("%s, cannot set multiq", devname);
b8d29252 1207 goto out;
5a034064
AW
1208 }
1209 }
050c60bf
DDP
1210
1211 if (netdev_is_reconf_required(netdev)) {
1212 error = netdev_reconfigure(netdev);
1213 if (error) {
1214 goto out;
1215 }
1216 }
1217
324c8374
IM
1218 if (netdev_is_pmd(netdev)) {
1219 if (netdev_n_txq(netdev) < n_cores + 1) {
1220 dynamic_txqs = true;
1221 }
1222 }
1223
e4cfed38 1224 port = xzalloc(sizeof *port);
35303d71 1225 port->port_no = port_no;
e4cfed38 1226 port->netdev = netdev;
490e82af 1227 port->n_rxq = netdev_n_rxq(netdev);
b8d29252 1228 port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
324c8374 1229 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
e4cfed38 1230 port->type = xstrdup(type);
324c8374
IM
1231 ovs_mutex_init(&port->txq_used_mutex);
1232 port->dynamic_txqs = dynamic_txqs;
d17f4f08 1233
490e82af 1234 for (i = 0; i < port->n_rxq; i++) {
55c955bd 1235 error = netdev_rxq_open(netdev, &port->rxq[i], i);
d17f4f08 1236 if (error) {
55c955bd
PS
1237 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
1238 devname, ovs_strerror(errno));
d17f4f08 1239 goto out_rxq_close;
55c955bd 1240 }
d17f4f08 1241 n_open_rxqs++;
7b6b0ef4
BP
1242 }
1243
4b609110 1244 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
72865317 1245 if (error) {
d17f4f08 1246 goto out_rxq_close;
72865317 1247 }
4b609110 1248 port->sf = sf;
e4cfed38 1249
b8d29252 1250 *portp = port;
72865317
BP
1251
1252 return 0;
d17f4f08
DDP
1253
1254out_rxq_close:
1255 for (i = 0; i < n_open_rxqs; i++) {
1256 netdev_rxq_close(port->rxq[i]);
1257 }
324c8374 1258 ovs_mutex_destroy(&port->txq_used_mutex);
d17f4f08 1259 free(port->type);
324c8374 1260 free(port->txq_used);
d17f4f08
DDP
1261 free(port->rxq);
1262 free(port);
b8d29252 1263
d17f4f08 1264out:
b8d29252 1265 netdev_close(netdev);
d17f4f08 1266 return error;
72865317
BP
1267}
1268
b8d29252
DDP
1269static int
1270do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1271 odp_port_t port_no)
1272 OVS_REQUIRES(dp->port_mutex)
1273{
1274 struct dp_netdev_port *port;
1275 int error;
1276
1277 /* Reject devices already in 'dp'. */
1278 if (!get_port_by_name(dp, devname, &port)) {
1279 return EEXIST;
1280 }
1281
1282 error = port_create(devname, dpif_netdev_port_open_type(dp->class, type),
1283 type, port_no, &port);
1284 if (error) {
1285 return error;
1286 }
1287
b8d29252 1288 if (netdev_is_pmd(port->netdev)) {
d0cca6c3
DDP
1289 int numa_id = netdev_get_numa_id(port->netdev);
1290
1291 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
1292 dp_netdev_set_pmds_on_numa(dp, numa_id);
b8d29252 1293 }
d0cca6c3
DDP
1294
1295 dp_netdev_add_port_to_pmds(dp, port);
1296
e9985d6a 1297 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
b8d29252
DDP
1298 seq_change(dp->port_seq);
1299
1300 return 0;
1301}
1302
247527db
BP
1303static int
1304dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1305 odp_port_t *port_nop)
247527db
BP
1306{
1307 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
1308 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1309 const char *dpif_port;
4e022ec0 1310 odp_port_t port_no;
5279f8fd 1311 int error;
247527db 1312
59e6d833 1313 ovs_mutex_lock(&dp->port_mutex);
3aa30359 1314 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 1315 if (*port_nop != ODPP_NONE) {
ff073a71
BP
1316 port_no = *port_nop;
1317 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 1318 } else {
3aa30359 1319 port_no = choose_port(dp, dpif_port);
5279f8fd 1320 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 1321 }
5279f8fd 1322 if (!error) {
247527db 1323 *port_nop = port_no;
5279f8fd 1324 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 1325 }
59e6d833 1326 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1327
1328 return error;
72865317
BP
1329}
1330
1331static int
4e022ec0 1332dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
1333{
1334 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
1335 int error;
1336
59e6d833 1337 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
1338 if (port_no == ODPP_LOCAL) {
1339 error = EINVAL;
1340 } else {
1341 struct dp_netdev_port *port;
1342
1343 error = get_port_by_number(dp, port_no, &port);
1344 if (!error) {
1345 do_del_port(dp, port);
1346 }
1347 }
59e6d833 1348 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1349
1350 return error;
72865317
BP
1351}
1352
1353static bool
4e022ec0 1354is_valid_port_number(odp_port_t port_no)
72865317 1355{
ff073a71
BP
1356 return port_no != ODPP_NONE;
1357}
1358
1359static struct dp_netdev_port *
1360dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
e9985d6a 1361 OVS_REQUIRES(dp->port_mutex)
ff073a71
BP
1362{
1363 struct dp_netdev_port *port;
1364
e9985d6a 1365 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
35303d71 1366 if (port->port_no == port_no) {
ff073a71
BP
1367 return port;
1368 }
1369 }
1370 return NULL;
72865317
BP
1371}
1372
1373static int
1374get_port_by_number(struct dp_netdev *dp,
4e022ec0 1375 odp_port_t port_no, struct dp_netdev_port **portp)
e9985d6a 1376 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1377{
1378 if (!is_valid_port_number(port_no)) {
1379 *portp = NULL;
1380 return EINVAL;
1381 } else {
ff073a71 1382 *portp = dp_netdev_lookup_port(dp, port_no);
72865317
BP
1383 return *portp ? 0 : ENOENT;
1384 }
1385}
1386
b284085e 1387static void
62453dad 1388port_destroy(struct dp_netdev_port *port)
b284085e 1389{
62453dad
DDP
1390 if (!port) {
1391 return;
b284085e 1392 }
b284085e 1393
62453dad
DDP
1394 netdev_close(port->netdev);
1395 netdev_restore_flags(port->sf);
accf8626 1396
62453dad
DDP
1397 for (unsigned i = 0; i < port->n_rxq; i++) {
1398 netdev_rxq_close(port->rxq[i]);
b284085e 1399 }
324c8374
IM
1400 ovs_mutex_destroy(&port->txq_used_mutex);
1401 free(port->txq_used);
62453dad
DDP
1402 free(port->rxq);
1403 free(port->type);
1404 free(port);
b284085e
PS
1405}
1406
72865317
BP
1407static int
1408get_port_by_name(struct dp_netdev *dp,
1409 const char *devname, struct dp_netdev_port **portp)
59e6d833 1410 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1411{
1412 struct dp_netdev_port *port;
1413
e9985d6a 1414 HMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 1415 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
1416 *portp = port;
1417 return 0;
1418 }
1419 }
1420 return ENOENT;
1421}
1422
347ba9bb
IM
1423static int
1424get_n_pmd_threads(struct dp_netdev *dp)
1425{
1426 /* There is one non pmd thread in dp->poll_threads */
1427 return cmap_count(&dp->poll_threads) - 1;
1428}
1429
65f13b50
AW
1430static int
1431get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
1432{
1433 struct dp_netdev_pmd_thread *pmd;
1434 int n_pmds = 0;
1435
1436 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1437 if (pmd->numa_id == numa_id) {
1438 n_pmds++;
1439 }
1440 }
1441
1442 return n_pmds;
1443}
1444
1445/* Returns 'true' if there is a port with pmd netdev and the netdev
1446 * is on numa node 'numa_id'. */
1447static bool
1448has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
e9985d6a 1449 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
1450{
1451 struct dp_netdev_port *port;
1452
e9985d6a 1453 HMAP_FOR_EACH (port, node, &dp->ports) {
65f13b50
AW
1454 if (netdev_is_pmd(port->netdev)
1455 && netdev_get_numa_id(port->netdev) == numa_id) {
1456 return true;
1457 }
1458 }
1459
1460 return false;
1461}
1462
1463
c40b890f
BP
1464static void
1465do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 1466 OVS_REQUIRES(dp->port_mutex)
72865317 1467{
e9985d6a 1468 hmap_remove(&dp->ports, &port->node);
d33ed218 1469 seq_change(dp->port_seq);
d0cca6c3
DDP
1470
1471 dp_netdev_del_port_from_all_pmds(dp, port);
1472
e4cfed38 1473 if (netdev_is_pmd(port->netdev)) {
65f13b50
AW
1474 int numa_id = netdev_get_numa_id(port->netdev);
1475
ae7ad0a1
IM
 1476 /* PMD threads cannot be on an invalid numa node. */
1477 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
65f13b50 1478 /* If there is no netdev on the numa node, delete the pmd threads
d0cca6c3 1479 * for that numa. */
65f13b50
AW
1480 if (!has_pmd_port_for_numa(dp, numa_id)) {
1481 dp_netdev_del_pmds_on_numa(dp, numa_id);
1482 }
e4cfed38 1483 }
72865317 1484
62453dad 1485 port_destroy(port);
72865317
BP
1486}
1487
1488static void
4c738a8d
BP
1489answer_port_query(const struct dp_netdev_port *port,
1490 struct dpif_port *dpif_port)
72865317 1491{
3efb6063 1492 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 1493 dpif_port->type = xstrdup(port->type);
35303d71 1494 dpif_port->port_no = port->port_no;
72865317
BP
1495}
1496
1497static int
4e022ec0 1498dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 1499 struct dpif_port *dpif_port)
72865317
BP
1500{
1501 struct dp_netdev *dp = get_dp_netdev(dpif);
1502 struct dp_netdev_port *port;
1503 int error;
1504
e9985d6a 1505 ovs_mutex_lock(&dp->port_mutex);
72865317 1506 error = get_port_by_number(dp, port_no, &port);
4afba28d 1507 if (!error && dpif_port) {
4c738a8d 1508 answer_port_query(port, dpif_port);
72865317 1509 }
e9985d6a 1510 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1511
72865317
BP
1512 return error;
1513}
1514
1515static int
1516dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 1517 struct dpif_port *dpif_port)
72865317
BP
1518{
1519 struct dp_netdev *dp = get_dp_netdev(dpif);
1520 struct dp_netdev_port *port;
1521 int error;
1522
59e6d833 1523 ovs_mutex_lock(&dp->port_mutex);
72865317 1524 error = get_port_by_name(dp, devname, &port);
4afba28d 1525 if (!error && dpif_port) {
4c738a8d 1526 answer_port_query(port, dpif_port);
72865317 1527 }
59e6d833 1528 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1529
72865317
BP
1530 return error;
1531}
1532
61e7deb1
BP
1533static void
1534dp_netdev_flow_free(struct dp_netdev_flow *flow)
1535{
61e7deb1 1536 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
1537 free(flow);
1538}
1539
ed79f89a
DDP
1540static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1541{
1542 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1543 ovsrcu_postpone(dp_netdev_flow_free, flow);
1544 }
1545}
1546
70e5ed6f
JS
1547static uint32_t
1548dp_netdev_flow_hash(const ovs_u128 *ufid)
1549{
1550 return ufid->u32[0];
1551}
1552
72865317 1553static void
1c1e46ed
AW
1554dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1555 struct dp_netdev_flow *flow)
1556 OVS_REQUIRES(pmd->flow_mutex)
72865317 1557{
9f361d6b 1558 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2c0ea78f 1559
1c1e46ed
AW
1560 dpcls_remove(&pmd->cls, &flow->cr);
1561 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
9bbf1c3d 1562 flow->dead = true;
ed79f89a
DDP
1563
1564 dp_netdev_flow_unref(flow);
72865317
BP
1565}
1566
1567static void
1c1e46ed 1568dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 1569{
78c8df12 1570 struct dp_netdev_flow *netdev_flow;
72865317 1571
1c1e46ed
AW
1572 ovs_mutex_lock(&pmd->flow_mutex);
1573 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1574 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 1575 }
1c1e46ed 1576 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
1577}
1578
1579static int
1580dpif_netdev_flow_flush(struct dpif *dpif)
1581{
1582 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
1583 struct dp_netdev_pmd_thread *pmd;
1584
1585 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1586 dp_netdev_pmd_flow_flush(pmd);
1587 }
5279f8fd 1588
72865317
BP
1589 return 0;
1590}
1591
b0ec0f27 1592struct dp_netdev_port_state {
e9985d6a 1593 struct hmap_position position;
4c738a8d 1594 char *name;
b0ec0f27
BP
1595};
1596
1597static int
1598dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1599{
1600 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1601 return 0;
1602}
1603
72865317 1604static int
b0ec0f27 1605dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 1606 struct dpif_port *dpif_port)
72865317 1607{
b0ec0f27 1608 struct dp_netdev_port_state *state = state_;
72865317 1609 struct dp_netdev *dp = get_dp_netdev(dpif);
e9985d6a 1610 struct hmap_node *node;
ff073a71 1611 int retval;
72865317 1612
e9985d6a
DDP
1613 ovs_mutex_lock(&dp->port_mutex);
1614 node = hmap_at_position(&dp->ports, &state->position);
ff073a71
BP
1615 if (node) {
1616 struct dp_netdev_port *port;
5279f8fd 1617
ff073a71
BP
1618 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1619
1620 free(state->name);
1621 state->name = xstrdup(netdev_get_name(port->netdev));
1622 dpif_port->name = state->name;
1623 dpif_port->type = port->type;
35303d71 1624 dpif_port->port_no = port->port_no;
ff073a71
BP
1625
1626 retval = 0;
1627 } else {
1628 retval = EOF;
72865317 1629 }
e9985d6a 1630 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1631
ff073a71 1632 return retval;
b0ec0f27
BP
1633}
1634
1635static int
4c738a8d 1636dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 1637{
4c738a8d
BP
1638 struct dp_netdev_port_state *state = state_;
1639 free(state->name);
b0ec0f27
BP
1640 free(state);
1641 return 0;
72865317
BP
1642}
1643
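/* A minimal caller sketch (hypothetical, not part of the original file): the
 * three dump callbacks above form a simple cursor over 'dp->ports'.
 * dpif_netdev_port_dump_next() returns 0 while ports remain and EOF once the
 * iteration is complete. */
static void
port_dump_example(const struct dpif *dpif)
{
    struct dpif_port dpif_port;
    void *state;

    dpif_netdev_port_dump_start(dpif, &state);
    while (!dpif_netdev_port_dump_next(dpif, state, &dpif_port)) {
        /* 'dpif_port.name' and 'dpif_port.port_no' are valid here; the name
         * storage belongs to 'state' and is released by dump_done(). */
    }
    dpif_netdev_port_dump_done(dpif, state);
}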
1644static int
67a4917b 1645dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
1646{
1647 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 1648 uint64_t new_port_seq;
5279f8fd
BP
1649 int error;
1650
d33ed218
BP
1651 new_port_seq = seq_read(dpif->dp->port_seq);
1652 if (dpif->last_port_seq != new_port_seq) {
1653 dpif->last_port_seq = new_port_seq;
5279f8fd 1654 error = ENOBUFS;
72865317 1655 } else {
5279f8fd 1656 error = EAGAIN;
72865317 1657 }
5279f8fd
BP
1658
1659 return error;
72865317
BP
1660}
1661
1662static void
1663dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1664{
1665 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 1666
d33ed218 1667 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
1668}
1669
1670static struct dp_netdev_flow *
0de8783a 1671dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
1672{
1673 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
1674}
1675
9bbf1c3d
DDP
1676static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1677{
1678 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1679}
1680
79df317f
DDP
1681/* netdev_flow_key utilities.
1682 *
1683 * netdev_flow_key is basically a miniflow. We use these functions
1684 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1685 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1686 *
1687 * - Since we are dealing exclusively with miniflows created by
1688 * miniflow_extract(), if the map is different the miniflow is different.
1689 * Therefore we can be faster by comparing the map and the miniflow in a
1690 * single memcmp().
5fcff47b 1691 * - These functions can be inlined by the compiler. */
79df317f 1692
361d808d 1693/* Given the number of bits set in a miniflow's maps, returns the size of the
caeb4906 1694 * 'netdev_flow_key.mf'. */
361d808d
JR
1695static inline size_t
1696netdev_flow_key_size(size_t flow_u64s)
79df317f 1697{
361d808d 1698 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
79df317f
DDP
1699}
1700
79df317f
DDP
1701static inline bool
1702netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
1703 const struct netdev_flow_key *b)
1704{
caeb4906
JR
1705 /* 'b->len' may not be set yet. */
1706 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
1707}
1708
1709/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
d79a39fe 1710 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
0de8783a
JR
1711 * generated by miniflow_extract. */
1712static inline bool
1713netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1714 const struct miniflow *mf)
79df317f 1715{
caeb4906 1716 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
1717}
1718
1719static inline void
1720netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
1721 const struct netdev_flow_key *src)
1722{
caeb4906
JR
1723 memcpy(dst, src,
1724 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
1725}
1726
1727/* Slow. */
1728static void
1729netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1730 const struct flow *src)
1731{
cf62fa4c 1732 struct dp_packet packet;
0de8783a 1733 uint64_t buf_stub[512 / 8];
0de8783a 1734
cf62fa4c
PS
1735 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1736 pkt_metadata_from_flow(&packet.md, src);
0de8783a 1737 flow_compose(&packet, src);
cf62fa4c
PS
1738 miniflow_extract(&packet, &dst->mf);
1739 dp_packet_uninit(&packet);
0de8783a 1740
361d808d 1741 dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
0de8783a
JR
1742 dst->hash = 0; /* Not computed yet. */
1743}
1744
1745/* Initialize a netdev_flow_key 'mask' from 'match'. */
1746static inline void
1747netdev_flow_mask_init(struct netdev_flow_key *mask,
1748 const struct match *match)
1749{
09b0fa9c 1750 uint64_t *dst = miniflow_values(&mask->mf);
5fcff47b 1751 struct flowmap fmap;
0de8783a 1752 uint32_t hash = 0;
5fcff47b 1753 size_t idx;
0de8783a
JR
1754
1755 /* Only check masks that make sense for the flow. */
5fcff47b
JR
1756 flow_wc_map(&match->flow, &fmap);
1757 flowmap_init(&mask->mf.map);
0de8783a 1758
5fcff47b
JR
1759 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1760 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
0de8783a 1761
5fcff47b
JR
1762 if (mask_u64) {
1763 flowmap_set(&mask->mf.map, idx, 1);
1764 *dst++ = mask_u64;
1765 hash = hash_add64(hash, mask_u64);
0de8783a 1766 }
0de8783a
JR
1767 }
1768
5fcff47b 1769 map_t map;
0de8783a 1770
5fcff47b
JR
1771 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1772 hash = hash_add64(hash, map);
1773 }
0de8783a 1774
5fcff47b 1775 size_t n = dst - miniflow_get_values(&mask->mf);
0de8783a 1776
d70e8c28 1777 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
1778 mask->len = netdev_flow_key_size(n);
1779}
1780
361d808d 1781/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
0de8783a
JR
1782static inline void
1783netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1784 const struct flow *flow,
1785 const struct netdev_flow_key *mask)
79df317f 1786{
09b0fa9c
JR
1787 uint64_t *dst_u64 = miniflow_values(&dst->mf);
1788 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
0de8783a 1789 uint32_t hash = 0;
d70e8c28 1790 uint64_t value;
0de8783a
JR
1791
1792 dst->len = mask->len;
361d808d 1793 dst->mf = mask->mf; /* Copy maps. */
0de8783a 1794
5fcff47b 1795 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
d70e8c28
JR
1796 *dst_u64 = value & *mask_u64++;
1797 hash = hash_add64(hash, *dst_u64++);
0de8783a 1798 }
09b0fa9c
JR
1799 dst->hash = hash_finish(hash,
1800 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
0de8783a
JR
1801}
1802
5fcff47b
JR
1803/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
1804#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
1805 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
0de8783a
JR
1806
1807/* Returns a hash value for the bits of 'key' where there are 1-bits in
1808 * 'mask'. */
1809static inline uint32_t
1810netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1811 const struct netdev_flow_key *mask)
1812{
09b0fa9c 1813 const uint64_t *p = miniflow_get_values(&mask->mf);
0de8783a 1814 uint32_t hash = 0;
5fcff47b 1815 uint64_t value;
0de8783a 1816
5fcff47b
JR
1817 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
1818 hash = hash_add64(hash, value & *p++);
0de8783a
JR
1819 }
1820
09b0fa9c 1821 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
79df317f
DDP
1822}
1823
9bbf1c3d
DDP
1824static inline bool
1825emc_entry_alive(struct emc_entry *ce)
1826{
1827 return ce->flow && !ce->flow->dead;
1828}
1829
1830static void
1831emc_clear_entry(struct emc_entry *ce)
1832{
1833 if (ce->flow) {
1834 dp_netdev_flow_unref(ce->flow);
1835 ce->flow = NULL;
1836 }
1837}
1838
1839static inline void
1840emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 1841 const struct netdev_flow_key *key)
9bbf1c3d
DDP
1842{
1843 if (ce->flow != flow) {
1844 if (ce->flow) {
1845 dp_netdev_flow_unref(ce->flow);
1846 }
1847
1848 if (dp_netdev_flow_ref(flow)) {
1849 ce->flow = flow;
1850 } else {
1851 ce->flow = NULL;
1852 }
1853 }
0de8783a
JR
1854 if (key) {
1855 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
1856 }
1857}
1858
1859static inline void
0de8783a 1860emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
1861 struct dp_netdev_flow *flow)
1862{
1863 struct emc_entry *to_be_replaced = NULL;
1864 struct emc_entry *current_entry;
1865
0de8783a
JR
1866 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1867 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 1868 /* We found the entry with the 'mf' miniflow */
0de8783a 1869 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
1870 return;
1871 }
1872
1873 /* Replacement policy: put the flow in an empty (not alive) entry, or
1874 * in the first entry where it can be */
1875 if (!to_be_replaced
1876 || (emc_entry_alive(to_be_replaced)
1877 && !emc_entry_alive(current_entry))
0de8783a 1878 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
1879 to_be_replaced = current_entry;
1880 }
1881 }
1882 /* We didn't find the miniflow in the cache.
1883 * The 'to_be_replaced' entry is where the new flow will be stored */
1884
0de8783a 1885 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
1886}
1887
1888static inline struct dp_netdev_flow *
0de8783a 1889emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
1890{
1891 struct emc_entry *current_entry;
1892
0de8783a
JR
1893 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1894 if (current_entry->key.hash == key->hash
1895 && emc_entry_alive(current_entry)
1896 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 1897
0de8783a 1898 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
1899 return current_entry->flow;
1900 }
1901 }
1902
1903 return NULL;
1904}
1905
72865317 1906static struct dp_netdev_flow *
1c1e46ed
AW
1907dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
1908 const struct netdev_flow_key *key)
2c0ea78f 1909{
8a4e3a85 1910 struct dp_netdev_flow *netdev_flow;
0de8783a 1911 struct dpcls_rule *rule;
2c0ea78f 1912
1c1e46ed 1913 dpcls_lookup(&pmd->cls, key, &rule, 1);
4f150744 1914 netdev_flow = dp_netdev_flow_cast(rule);
2c0ea78f 1915
8a4e3a85 1916 return netdev_flow;
2c0ea78f
GS
1917}
1918
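/* A minimal fast-path sketch (hypothetical helper, not part of the original
 * file): how the exact match cache and the per-pmd classifier lookups above
 * are typically combined, assuming 'key' was produced by miniflow_extract()
 * and already carries a valid hash. On an EMC miss the megaflow classifier
 * is consulted and the result is cached for subsequent packets. */
static inline struct dp_netdev_flow *
flow_lookup_example(struct dp_netdev_pmd_thread *pmd,
                    const struct netdev_flow_key *key)
{
    struct dp_netdev_flow *flow = emc_lookup(&pmd->flow_cache, key);

    if (!flow) {
        flow = dp_netdev_pmd_lookup_flow(pmd, key);
        if (flow) {
            emc_insert(&pmd->flow_cache, key, flow);
        }
    }
    return flow;
}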
1919static struct dp_netdev_flow *
1c1e46ed
AW
1920dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
1921 const ovs_u128 *ufidp, const struct nlattr *key,
1922 size_t key_len)
72865317 1923{
1763b4b8 1924 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
1925 struct flow flow;
1926 ovs_u128 ufid;
1927
1928 /* If a UFID is not provided, determine one based on the key. */
1929 if (!ufidp && key && key_len
1930 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
1c1e46ed 1931 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
70e5ed6f
JS
1932 ufidp = &ufid;
1933 }
72865317 1934
70e5ed6f
JS
1935 if (ufidp) {
1936 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 1937 &pmd->flow_table) {
2ff8484b 1938 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
70e5ed6f
JS
1939 return netdev_flow;
1940 }
72865317
BP
1941 }
1942 }
8a4e3a85 1943
72865317
BP
1944 return NULL;
1945}
1946
1947static void
eb94da30 1948get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1763b4b8 1949 struct dpif_flow_stats *stats)
feebdea2 1950{
eb94da30
DDP
1951 struct dp_netdev_flow *netdev_flow;
1952 unsigned long long n;
1953 long long used;
1954 uint16_t flags;
1955
1956 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
1957
1958 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
1959 stats->n_packets = n;
1960 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
1961 stats->n_bytes = n;
1962 atomic_read_relaxed(&netdev_flow->stats.used, &used);
1963 stats->used = used;
1964 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
1965 stats->tcp_flags = flags;
72865317
BP
1966}
1967
7af12bd7
JS
1968/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
1969 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
1970 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
1971 * protect them. */
6fe09f8c 1972static void
70e5ed6f 1973dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
7af12bd7 1974 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 1975 struct dpif_flow *flow, bool terse)
6fe09f8c 1976{
64bb477f
JS
1977 if (terse) {
1978 memset(flow, 0, sizeof *flow);
1979 } else {
1980 struct flow_wildcards wc;
1981 struct dp_netdev_actions *actions;
1982 size_t offset;
5262eea1
JG
1983 struct odp_flow_key_parms odp_parms = {
1984 .flow = &netdev_flow->flow,
1985 .mask = &wc.masks,
2494ccd7 1986 .support = dp_netdev_support,
5262eea1 1987 };
64bb477f
JS
1988
1989 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
1990
1991 /* Key */
6fd6ed71 1992 offset = key_buf->size;
64bb477f 1993 flow->key = ofpbuf_tail(key_buf);
5262eea1 1994 odp_flow_key_from_flow(&odp_parms, key_buf);
6fd6ed71 1995 flow->key_len = key_buf->size - offset;
64bb477f
JS
1996
1997 /* Mask */
6fd6ed71 1998 offset = mask_buf->size;
64bb477f 1999 flow->mask = ofpbuf_tail(mask_buf);
ec1f6f32 2000 odp_parms.key_buf = key_buf;
5262eea1 2001 odp_flow_key_from_mask(&odp_parms, mask_buf);
6fd6ed71 2002 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
2003
2004 /* Actions */
2005 actions = dp_netdev_flow_get_actions(netdev_flow);
2006 flow->actions = actions->actions;
2007 flow->actions_len = actions->size;
2008 }
6fe09f8c 2009
70e5ed6f
JS
2010 flow->ufid = netdev_flow->ufid;
2011 flow->ufid_present = true;
1c1e46ed 2012 flow->pmd_id = netdev_flow->pmd_id;
6fe09f8c
JS
2013 get_dpif_flow_stats(netdev_flow, &flow->stats);
2014}
2015
36956a7d 2016static int
8c301900
JR
2017dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2018 const struct nlattr *mask_key,
2019 uint32_t mask_key_len, const struct flow *flow,
9f861c91 2020 struct flow_wildcards *wc)
8c301900 2021{
ca8d3442
DDP
2022 enum odp_key_fitness fitness;
2023
2024 fitness = odp_flow_key_to_mask_udpif(mask_key, mask_key_len, key,
2025 key_len, wc, flow);
2026 if (fitness) {
2027 /* This should not happen: it indicates that
2028 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2029 * disagree on the acceptable form of a mask. Log the problem
2030 * as an error, with enough details to enable debugging. */
2031 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2032
2033 if (!VLOG_DROP_ERR(&rl)) {
2034 struct ds s;
8c301900 2035
ca8d3442
DDP
2036 ds_init(&s);
2037 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2038 true);
2039 VLOG_ERR("internal error parsing flow mask %s (%s)",
2040 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2041 ds_destroy(&s);
8c301900 2042 }
ca8d3442
DDP
2043
2044 return EINVAL;
8c301900
JR
2045 }
2046
2047 return 0;
2048}
2049
2050static int
2051dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2052 struct flow *flow)
36956a7d 2053{
586ddea5
BP
2054 odp_port_t in_port;
2055
6728d578 2056 if (odp_flow_key_to_flow_udpif(key, key_len, flow)) {
36956a7d 2057 /* This should not happen: it indicates that odp_flow_key_from_flow()
8c301900
JR
2058 * and odp_flow_key_to_flow() disagree on the acceptable form of a
2059 * flow. Log the problem as an error, with enough details to enable
2060 * debugging. */
36956a7d
BP
2061 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2062
2063 if (!VLOG_DROP_ERR(&rl)) {
2064 struct ds s;
2065
2066 ds_init(&s);
8c301900 2067 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
36956a7d
BP
2068 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2069 ds_destroy(&s);
2070 }
2071
2072 return EINVAL;
2073 }
2074
586ddea5
BP
2075 in_port = flow->in_port.odp_port;
2076 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
18886b60
BP
2077 return EINVAL;
2078 }
2079
07659514 2080 /* Userspace datapath doesn't support conntrack. */
9daf2348 2081 if (flow->ct_state || flow->ct_zone || flow->ct_mark
2ff8484b 2082 || !ovs_u128_is_zero(flow->ct_label)) {
07659514
JS
2083 return EINVAL;
2084 }
2085
36956a7d
BP
2086 return 0;
2087}
2088
72865317 2089static int
6fe09f8c 2090dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
2091{
2092 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2093 struct dp_netdev_flow *netdev_flow;
1c1e46ed 2094 struct dp_netdev_pmd_thread *pmd;
c673049c
IM
2095 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2096 struct hmapx_node *node;
2097 int error = EINVAL;
2098
2099 if (get->pmd_id == PMD_ID_NULL) {
2100 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2101 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2102 dp_netdev_pmd_unref(pmd);
2103 }
2104 }
2105 } else {
2106 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2107 if (!pmd) {
2108 goto out;
2109 }
2110 hmapx_add(&to_find, pmd);
1c1e46ed
AW
2111 }
2112
c673049c
IM
2113 if (!hmapx_count(&to_find)) {
2114 goto out;
72865317 2115 }
1c1e46ed 2116
c673049c
IM
2117 HMAPX_FOR_EACH (node, &to_find) {
2118 pmd = (struct dp_netdev_pmd_thread *) node->data;
2119 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2120 get->key_len);
2121 if (netdev_flow) {
2122 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2123 get->flow, false);
2124 error = 0;
2125 break;
2126 } else {
2127 error = ENOENT;
2128 }
2129 }
bc4a05c6 2130
c673049c
IM
2131 HMAPX_FOR_EACH (node, &to_find) {
2132 pmd = (struct dp_netdev_pmd_thread *) node->data;
2133 dp_netdev_pmd_unref(pmd);
2134 }
2135out:
2136 hmapx_destroy(&to_find);
5279f8fd 2137 return error;
72865317
BP
2138}
2139
0de8783a 2140static struct dp_netdev_flow *
1c1e46ed
AW
2141dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2142 struct match *match, const ovs_u128 *ufid,
ae2ceebd 2143 const struct nlattr *actions, size_t actions_len)
1c1e46ed 2144 OVS_REQUIRES(pmd->flow_mutex)
72865317 2145{
0de8783a
JR
2146 struct dp_netdev_flow *flow;
2147 struct netdev_flow_key mask;
ed79f89a 2148
0de8783a
JR
2149 netdev_flow_mask_init(&mask, match);
2150 /* Make sure wc does not have metadata. */
5fcff47b
JR
2151 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2152 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
679ba04c 2153
0de8783a 2154 /* Do not allocate extra space. */
caeb4906 2155 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 2156 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 2157 flow->dead = false;
11e5cf1f 2158 flow->batch = NULL;
bd5131ba 2159 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 2160 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 2161 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 2162 ovs_refcount_init(&flow->ref_cnt);
0de8783a 2163 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 2164
0de8783a 2165 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
1c1e46ed 2166 dpcls_insert(&pmd->cls, &flow->cr, &mask);
72865317 2167
4c75aaab
EJ
2168 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2169 dp_netdev_flow_hash(&flow->ufid));
2170
623540e4
EJ
2171 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
2172 struct ds ds = DS_EMPTY_INITIALIZER;
9044f2c1
JG
2173 struct ofpbuf key_buf, mask_buf;
2174 struct odp_flow_key_parms odp_parms = {
2175 .flow = &match->flow,
2176 .mask = &match->wc.masks,
2177 .support = dp_netdev_support,
2178 };
2179
2180 ofpbuf_init(&key_buf, 0);
2181 ofpbuf_init(&mask_buf, 0);
623540e4 2182
9044f2c1
JG
2183 odp_flow_key_from_flow(&odp_parms, &key_buf);
2184 odp_parms.key_buf = &key_buf;
2185 odp_flow_key_from_mask(&odp_parms, &mask_buf);
0de8783a 2186
623540e4 2187 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
2188 odp_format_ufid(ufid, &ds);
2189 ds_put_cstr(&ds, " ");
9044f2c1
JG
2190 odp_flow_format(key_buf.data, key_buf.size,
2191 mask_buf.data, mask_buf.size,
2192 NULL, &ds, false);
623540e4
EJ
2193 ds_put_cstr(&ds, ", actions:");
2194 format_odp_actions(&ds, actions, actions_len);
2195
2196 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2197
9044f2c1
JG
2198 ofpbuf_uninit(&key_buf);
2199 ofpbuf_uninit(&mask_buf);
623540e4
EJ
2200 ds_destroy(&ds);
2201 }
2202
0de8783a 2203 return flow;
72865317
BP
2204}
2205
72865317 2206static int
89625d1e 2207dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
2208{
2209 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2210 struct dp_netdev_flow *netdev_flow;
0de8783a 2211 struct netdev_flow_key key;
1c1e46ed 2212 struct dp_netdev_pmd_thread *pmd;
ae2ceebd 2213 struct match match;
70e5ed6f 2214 ovs_u128 ufid;
bd5131ba
DDP
2215 unsigned pmd_id = put->pmd_id == PMD_ID_NULL
2216 ? NON_PMD_CORE_ID : put->pmd_id;
36956a7d
BP
2217 int error;
2218
ae2ceebd 2219 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
8c301900
JR
2220 if (error) {
2221 return error;
2222 }
2223 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2224 put->mask, put->mask_len,
9f861c91 2225 &match.flow, &match.wc);
36956a7d
BP
2226 if (error) {
2227 return error;
2228 }
0de8783a 2229
1c1e46ed
AW
2230 pmd = dp_netdev_get_pmd(dp, pmd_id);
2231 if (!pmd) {
2232 return EINVAL;
2233 }
2234
0de8783a
JR
2235 /* Must produce a netdev_flow_key for lookup.
2236 * This interface is no longer performance critical, since it is not used
2237 * for upcall processing any more. */
2238 netdev_flow_key_from_flow(&key, &match.flow);
72865317 2239
70e5ed6f
JS
2240 if (put->ufid) {
2241 ufid = *put->ufid;
2242 } else {
2243 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2244 }
2245
1c1e46ed
AW
2246 ovs_mutex_lock(&pmd->flow_mutex);
2247 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key);
1763b4b8 2248 if (!netdev_flow) {
89625d1e 2249 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 2250 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
89625d1e
BP
2251 if (put->stats) {
2252 memset(put->stats, 0, sizeof *put->stats);
feebdea2 2253 }
1c1e46ed 2254 dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
70e5ed6f 2255 put->actions_len);
0de8783a 2256 error = 0;
72865317 2257 } else {
5279f8fd 2258 error = EFBIG;
72865317
BP
2259 }
2260 } else {
5279f8fd 2261 error = ENOENT;
72865317
BP
2262 }
2263 } else {
2c0ea78f 2264 if (put->flags & DPIF_FP_MODIFY
ae2ceebd 2265 && flow_equal(&match.flow, &netdev_flow->flow)) {
8a4e3a85
BP
2266 struct dp_netdev_actions *new_actions;
2267 struct dp_netdev_actions *old_actions;
2268
2269 new_actions = dp_netdev_actions_create(put->actions,
2270 put->actions_len);
2271
61e7deb1
BP
2272 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2273 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 2274
a84cb64a
BP
2275 if (put->stats) {
2276 get_dpif_flow_stats(netdev_flow, put->stats);
2277 }
2278 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
2279 /* XXX: The userspace datapath uses thread local statistics
2280 * (for flows), which should be updated only by the owning
2281 * thread. Since we cannot write to the stats memory here,
2282 * we choose not to support this flag. Please note:
2283 * - This feature is currently used only by dpctl commands with
2284 * option --clear.
2285 * - Should the need arise, this operation can be implemented
2286 * by keeping a base value (to be updated here) for each
2287 * counter, and subtracting it before outputting the stats */
2288 error = EOPNOTSUPP;
72865317 2289 }
8a4e3a85 2290
61e7deb1 2291 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 2292 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 2293 error = EEXIST;
2c0ea78f
GS
2294 } else {
2295 /* Overlapping flow. */
2296 error = EINVAL;
72865317
BP
2297 }
2298 }
1c1e46ed
AW
2299 ovs_mutex_unlock(&pmd->flow_mutex);
2300 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2301
2302 return error;
72865317
BP
2303}
2304
72865317 2305static int
b99d3cee 2306dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
72865317
BP
2307{
2308 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2309 struct dp_netdev_flow *netdev_flow;
1c1e46ed 2310 struct dp_netdev_pmd_thread *pmd;
bd5131ba
DDP
2311 unsigned pmd_id = del->pmd_id == PMD_ID_NULL
2312 ? NON_PMD_CORE_ID : del->pmd_id;
70e5ed6f 2313 int error = 0;
72865317 2314
1c1e46ed
AW
2315 pmd = dp_netdev_get_pmd(dp, pmd_id);
2316 if (!pmd) {
2317 return EINVAL;
2318 }
2319
2320 ovs_mutex_lock(&pmd->flow_mutex);
2321 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2322 del->key_len);
1763b4b8 2323 if (netdev_flow) {
b99d3cee 2324 if (del->stats) {
1763b4b8 2325 get_dpif_flow_stats(netdev_flow, del->stats);
feebdea2 2326 }
1c1e46ed 2327 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2328 } else {
5279f8fd 2329 error = ENOENT;
72865317 2330 }
1c1e46ed
AW
2331 ovs_mutex_unlock(&pmd->flow_mutex);
2332 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2333
2334 return error;
72865317
BP
2335}
2336
ac64794a
BP
2337struct dpif_netdev_flow_dump {
2338 struct dpif_flow_dump up;
1c1e46ed
AW
2339 struct cmap_position poll_thread_pos;
2340 struct cmap_position flow_pos;
2341 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
2342 int status;
2343 struct ovs_mutex mutex;
e723fd32
JS
2344};
2345
ac64794a
BP
2346static struct dpif_netdev_flow_dump *
2347dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 2348{
ac64794a 2349 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
2350}
2351
ac64794a 2352static struct dpif_flow_dump *
64bb477f 2353dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
e723fd32 2354{
ac64794a 2355 struct dpif_netdev_flow_dump *dump;
e723fd32 2356
1c1e46ed 2357 dump = xzalloc(sizeof *dump);
ac64794a 2358 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 2359 dump->up.terse = terse;
ac64794a
BP
2360 ovs_mutex_init(&dump->mutex);
2361
2362 return &dump->up;
e723fd32
JS
2363}
2364
2365static int
ac64794a 2366dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 2367{
ac64794a 2368 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 2369
ac64794a
BP
2370 ovs_mutex_destroy(&dump->mutex);
2371 free(dump);
704a1e09
BP
2372 return 0;
2373}
2374
ac64794a
BP
2375struct dpif_netdev_flow_dump_thread {
2376 struct dpif_flow_dump_thread up;
2377 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
2378 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2379 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
2380};
2381
2382static struct dpif_netdev_flow_dump_thread *
2383dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2384{
2385 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2386}
2387
2388static struct dpif_flow_dump_thread *
2389dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2390{
2391 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2392 struct dpif_netdev_flow_dump_thread *thread;
2393
2394 thread = xmalloc(sizeof *thread);
2395 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2396 thread->dump = dump;
2397 return &thread->up;
2398}
2399
2400static void
2401dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2402{
2403 struct dpif_netdev_flow_dump_thread *thread
2404 = dpif_netdev_flow_dump_thread_cast(thread_);
2405
2406 free(thread);
2407}
2408
704a1e09 2409static int
ac64794a 2410dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 2411 struct dpif_flow *flows, int max_flows)
ac64794a
BP
2412{
2413 struct dpif_netdev_flow_dump_thread *thread
2414 = dpif_netdev_flow_dump_thread_cast(thread_);
2415 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 2416 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
8bb113da
RW
2417 int n_flows = 0;
2418 int i;
14608a15 2419
ac64794a 2420 ovs_mutex_lock(&dump->mutex);
8bb113da 2421 if (!dump->status) {
1c1e46ed
AW
2422 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2423 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2424 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2425 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2426
2427 /* The first call to dump_next() extracts the first pmd thread.
2428 * If there is no pmd thread, returns immediately. */
2429 if (!pmd) {
2430 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2431 if (!pmd) {
2432 ovs_mutex_unlock(&dump->mutex);
2433 return n_flows;
8bb113da 2434
8bb113da 2435 }
d2ad7ef1 2436 }
1c1e46ed
AW
2437
2438 do {
2439 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2440 struct cmap_node *node;
2441
2442 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2443 if (!node) {
2444 break;
2445 }
2446 netdev_flows[n_flows] = CONTAINER_OF(node,
2447 struct dp_netdev_flow,
2448 node);
2449 }
2450 /* When finished dumping the current pmd thread, moves on to
2451 * the next one. */
2452 if (n_flows < flow_limit) {
2453 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2454 dp_netdev_pmd_unref(pmd);
2455 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2456 if (!pmd) {
2457 dump->status = EOF;
2458 break;
2459 }
2460 }
2461 /* Keeps the reference for the next caller. */
2462 dump->cur_pmd = pmd;
2463
2464 /* If the current dump is empty, do not exit the loop, since the
2465 * remaining pmds could have flows to be dumped. Just dump again
2466 * on the new 'pmd'. */
2467 } while (!n_flows);
8a4e3a85 2468 }
ac64794a 2469 ovs_mutex_unlock(&dump->mutex);
ac64794a 2470
8bb113da
RW
2471 for (i = 0; i < n_flows; i++) {
2472 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2473 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2474 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2475 struct dpif_flow *f = &flows[i];
7af12bd7 2476 struct ofpbuf key, mask;
8bb113da 2477
7af12bd7
JS
2478 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2479 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
64bb477f
JS
2480 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2481 dump->up.terse);
8bb113da 2482 }
feebdea2 2483
8bb113da 2484 return n_flows;
72865317
BP
2485}
2486
2487static int
758c456d 2488dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 2489 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
2490{
2491 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 2492 struct dp_netdev_pmd_thread *pmd;
1895cc8d 2493 struct dp_packet_batch pp;
72865317 2494
cf62fa4c
PS
2495 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2496 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
2497 return EINVAL;
2498 }
2499
65f13b50
AW
2500 /* Tries finding the 'pmd'. If NULL is returned, that means
2501 * the current thread is a non-pmd thread and should use
b19befae 2502 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
2503 pmd = ovsthread_getspecific(dp->per_pmd_key);
2504 if (!pmd) {
b19befae 2505 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
65f13b50
AW
2506 }
2507
2508 /* If the current thread is non-pmd thread, acquires
2509 * the 'non_pmd_mutex'. */
2510 if (pmd->core_id == NON_PMD_CORE_ID) {
2511 ovs_mutex_lock(&dp->non_pmd_mutex);
2512 }
1c1e46ed 2513
36d8de17
DDP
2514 /* The action processing expects the RSS hash to be valid, because
2515 * it's always initialized at the beginning of datapath processing.
2516 * In this case, though, 'execute->packet' may not have gone through
2517 * the datapath at all; it may have been generated by the upper layer
2518 * (OpenFlow packet-out, BFD frame, ...). */
2519 if (!dp_packet_rss_valid(execute->packet)) {
2520 dp_packet_set_rss_hash(execute->packet,
2521 flow_hash_5tuple(execute->flow, 0));
2522 }
2523
1895cc8d
PS
2524 packet_batch_init_packet(&pp, execute->packet);
2525 dp_netdev_execute_actions(pmd, &pp, false, execute->actions,
324c8374 2526 execute->actions_len, time_msec());
36d8de17 2527
65f13b50
AW
2528 if (pmd->core_id == NON_PMD_CORE_ID) {
2529 ovs_mutex_unlock(&dp->non_pmd_mutex);
e9985d6a 2530 dp_netdev_pmd_unref(pmd);
65f13b50 2531 }
8a4e3a85 2532
758c456d 2533 return 0;
72865317
BP
2534}
2535
1a0c894a
BP
2536static void
2537dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2538{
2539 size_t i;
2540
2541 for (i = 0; i < n_ops; i++) {
2542 struct dpif_op *op = ops[i];
2543
2544 switch (op->type) {
2545 case DPIF_OP_FLOW_PUT:
2546 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2547 break;
2548
2549 case DPIF_OP_FLOW_DEL:
2550 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2551 break;
2552
2553 case DPIF_OP_EXECUTE:
2554 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2555 break;
6fe09f8c
JS
2556
2557 case DPIF_OP_FLOW_GET:
2558 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2559 break;
1a0c894a
BP
2560 }
2561 }
2562}
2563
6e3c6fa4
DDP
2564/* Changes the number or the affinity of pmd threads. The changes are actually
2565 * applied in dpif_netdev_run(). */
f2eee189 2566static int
a14b8947 2567dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
f2eee189
AW
2568{
2569 struct dp_netdev *dp = get_dp_netdev(dpif);
2570
a6a426d6
IM
2571 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
2572 free(dp->pmd_cmask);
2573 dp->pmd_cmask = nullable_xstrdup(cmask);
2574 dp_netdev_request_reconfigure(dp);
f2eee189
AW
2575 }
2576
2577 return 0;
2578}
2579
5bf93d67
EJ
2580static int
2581dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2582 uint32_t queue_id, uint32_t *priority)
2583{
2584 *priority = queue_id;
2585 return 0;
2586}
2587
72865317 2588\f
9ff55ae2
DDP
2589/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2590 * a copy of the 'size' bytes of 'actions'. */
a84cb64a
BP
2591struct dp_netdev_actions *
2592dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2593{
2594 struct dp_netdev_actions *netdev_actions;
2595
9ff55ae2
DDP
2596 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2597 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
2598 netdev_actions->size = size;
2599
2600 return netdev_actions;
2601}
2602
a84cb64a 2603struct dp_netdev_actions *
61e7deb1 2604dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 2605{
61e7deb1 2606 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
2607}
2608
61e7deb1
BP
2609static void
2610dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 2611{
61e7deb1 2612 free(actions);
a84cb64a
BP
2613}
2614\f
55e3ca97
DDP
2615static inline unsigned long long
2616cycles_counter(void)
2617{
2618#ifdef DPDK_NETDEV
2619 return rte_get_tsc_cycles();
2620#else
2621 return 0;
2622#endif
2623}
2624
2625/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2626extern struct ovs_mutex cycles_counter_fake_mutex;
2627
2628/* Start counting cycles. Must be followed by 'cycles_count_end()' */
2629static inline void
2630cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2631 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2632 OVS_NO_THREAD_SAFETY_ANALYSIS
2633{
2634 pmd->last_cycles = cycles_counter();
2635}
2636
2637/* Stop counting cycles and add them to the counter 'type' */
2638static inline void
2639cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2640 enum pmd_cycles_counter_type type)
2641 OVS_RELEASES(&cycles_counter_fake_mutex)
2642 OVS_NO_THREAD_SAFETY_ANALYSIS
2643{
2644 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2645
2646 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2647}
e4cfed38 2648
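/* A minimal usage sketch (hypothetical, not part of the original file):
 * every cycles_count_start() must be paired with a cycles_count_end(); the
 * fake-mutex annotations above let the thread-safety analysis check that
 * balance in callers. */
static inline void
cycles_count_example(struct dp_netdev_pmd_thread *pmd)
{
    cycles_count_start(pmd);
    /* ... poll an rx queue or process a packet batch here ... */
    cycles_count_end(pmd, PMD_CYCLES_POLLING);
}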
5794e276 2649static void
65f13b50 2650dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2651 struct dp_netdev_port *port,
2652 struct netdev_rxq *rxq)
e4cfed38 2653{
1895cc8d
PS
2654 struct dp_packet_batch batch;
2655 int error;
e4cfed38 2656
1895cc8d 2657 dp_packet_batch_init(&batch);
55e3ca97 2658 cycles_count_start(pmd);
1895cc8d 2659 error = netdev_rxq_recv(rxq, &batch);
55e3ca97 2660 cycles_count_end(pmd, PMD_CYCLES_POLLING);
e4cfed38 2661 if (!error) {
3c33f0ff 2662 *recirc_depth_get() = 0;
41ccaa24 2663
55e3ca97 2664 cycles_count_start(pmd);
1895cc8d 2665 dp_netdev_input(pmd, &batch, port->port_no);
55e3ca97 2666 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
e4cfed38 2667 } else if (error != EAGAIN && error != EOPNOTSUPP) {
3c33f0ff 2668 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
e4cfed38
PS
2669
2670 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3c33f0ff 2671 netdev_get_name(port->netdev), ovs_strerror(error));
e4cfed38
PS
2672 }
2673}
2674
dc36593c
DDP
2675static int
2676port_reconfigure(struct dp_netdev_port *port)
2677{
2678 struct netdev *netdev = port->netdev;
dc36593c
DDP
2679 int i, err;
2680
050c60bf 2681 if (!netdev_is_reconf_required(netdev)) {
dc36593c
DDP
2682 return 0;
2683 }
2684
2685 /* Closes the existing 'rxq's. */
2686 for (i = 0; i < port->n_rxq; i++) {
2687 netdev_rxq_close(port->rxq[i]);
2688 port->rxq[i] = NULL;
2689 }
2690 port->n_rxq = 0;
2691
050c60bf
DDP
2692 /* Allows 'netdev' to apply the pending configuration changes. */
2693 err = netdev_reconfigure(netdev);
dc36593c 2694 if (err && (err != EOPNOTSUPP)) {
050c60bf
DDP
2695 VLOG_ERR("Failed to set interface %s new configuration",
2696 netdev_get_name(netdev));
dc36593c
DDP
2697 return err;
2698 }
050c60bf 2699 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
dc36593c 2700 port->rxq = xrealloc(port->rxq, sizeof *port->rxq * netdev_n_rxq(netdev));
324c8374
IM
2701 /* Realloc 'used' counters for tx queues. */
2702 free(port->txq_used);
2703 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
2704
dc36593c
DDP
2705 for (i = 0; i < netdev_n_rxq(netdev); i++) {
2706 err = netdev_rxq_open(netdev, &port->rxq[i], i);
2707 if (err) {
2708 return err;
2709 }
2710 port->n_rxq++;
2711 }
2712
2713 return 0;
2714}
2715
6e3c6fa4
DDP
2716static void
2717reconfigure_pmd_threads(struct dp_netdev *dp)
2718 OVS_REQUIRES(dp->port_mutex)
2719{
dc36593c 2720 struct dp_netdev_port *port, *next;
324c8374 2721 int n_cores;
6e3c6fa4 2722
a6a426d6
IM
2723 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
2724
6e3c6fa4
DDP
2725 dp_netdev_destroy_all_pmds(dp);
2726
324c8374 2727 /* Reconfigures the cpu mask. */
a6a426d6 2728 ovs_numa_set_cpu_mask(dp->pmd_cmask);
324c8374
IM
2729
2730 n_cores = ovs_numa_get_n_cores();
2731 if (n_cores == OVS_CORE_UNSPEC) {
2732 VLOG_ERR("Cannot get cpu core info");
2733 return;
2734 }
2735
dc36593c
DDP
2736 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
2737 int err;
6e3c6fa4 2738
dc36593c
DDP
2739 err = port_reconfigure(port);
2740 if (err) {
2741 hmap_remove(&dp->ports, &port->node);
2742 seq_change(dp->port_seq);
2743 port_destroy(port);
324c8374
IM
2744 } else {
2745 port->dynamic_txqs = netdev_n_txq(port->netdev) < n_cores + 1;
6e3c6fa4
DDP
2746 }
2747 }
6e3c6fa4
DDP
2748 /* Restores the non-pmd thread. */
2749 dp_netdev_set_nonpmd(dp);
2750 /* Restores all pmd threads. */
2751 dp_netdev_reset_pmd_threads(dp);
2752}
2753
050c60bf
DDP
2754/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
2755static bool
2756ports_require_restart(const struct dp_netdev *dp)
2757 OVS_REQUIRES(dp->port_mutex)
2758{
2759 struct dp_netdev_port *port;
2760
2761 HMAP_FOR_EACH (port, node, &dp->ports) {
2762 if (netdev_is_reconf_required(port->netdev)) {
2763 return true;
2764 }
2765 }
2766
2767 return false;
2768}
2769
a36de779
PS
2770/* Returns true if the datapath flows need to be revalidated. */
2771static bool
e4cfed38
PS
2772dpif_netdev_run(struct dpif *dpif)
2773{
2774 struct dp_netdev_port *port;
2775 struct dp_netdev *dp = get_dp_netdev(dpif);
b19befae
AW
2776 struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_pmd(dp,
2777 NON_PMD_CORE_ID);
a36de779 2778 uint64_t new_tnl_seq;
e4cfed38 2779
e9985d6a 2780 ovs_mutex_lock(&dp->port_mutex);
65f13b50 2781 ovs_mutex_lock(&dp->non_pmd_mutex);
e9985d6a 2782 HMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2783 if (!netdev_is_pmd(port->netdev)) {
2784 int i;
2785
490e82af 2786 for (i = 0; i < port->n_rxq; i++) {
65f13b50 2787 dp_netdev_process_rxq_port(non_pmd, port, port->rxq[i]);
55c955bd 2788 }
e4cfed38
PS
2789 }
2790 }
324c8374 2791 dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
65f13b50 2792 ovs_mutex_unlock(&dp->non_pmd_mutex);
6e3c6fa4 2793
1c1e46ed
AW
2794 dp_netdev_pmd_unref(non_pmd);
2795
a6a426d6 2796 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
6e3c6fa4
DDP
2797 reconfigure_pmd_threads(dp);
2798 }
2799 ovs_mutex_unlock(&dp->port_mutex);
2800
53902038 2801 tnl_neigh_cache_run();
7f9b8504 2802 tnl_port_map_run();
a36de779
PS
2803 new_tnl_seq = seq_read(tnl_conf_seq);
2804
2805 if (dp->last_tnl_conf_seq != new_tnl_seq) {
2806 dp->last_tnl_conf_seq = new_tnl_seq;
2807 return true;
2808 }
2809 return false;
e4cfed38
PS
2810}
2811
2812static void
2813dpif_netdev_wait(struct dpif *dpif)
2814{
2815 struct dp_netdev_port *port;
2816 struct dp_netdev *dp = get_dp_netdev(dpif);
2817
59e6d833 2818 ovs_mutex_lock(&dp_netdev_mutex);
e9985d6a
DDP
2819 ovs_mutex_lock(&dp->port_mutex);
2820 HMAP_FOR_EACH (port, node, &dp->ports) {
050c60bf 2821 netdev_wait_reconf_required(port->netdev);
55c955bd
PS
2822 if (!netdev_is_pmd(port->netdev)) {
2823 int i;
2824
490e82af 2825 for (i = 0; i < port->n_rxq; i++) {
55c955bd
PS
2826 netdev_rxq_wait(port->rxq[i]);
2827 }
e4cfed38
PS
2828 }
2829 }
e9985d6a 2830 ovs_mutex_unlock(&dp->port_mutex);
59e6d833 2831 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 2832 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
2833}
2834
d0cca6c3
DDP
2835static void
2836pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
2837{
2838 struct tx_port *tx_port_cached;
2839
324c8374
IM
2840 /* Free all used tx queue ids. */
2841 dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
2842
d0cca6c3
DDP
2843 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->port_cache) {
2844 free(tx_port_cached);
2845 }
2846}
2847
2848/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
2849 * 'pmd->port_cache' (thread local) */
2850static void
2851pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
2852 OVS_REQUIRES(pmd->port_mutex)
2853{
2854 struct tx_port *tx_port, *tx_port_cached;
2855
2856 pmd_free_cached_ports(pmd);
2857 hmap_shrink(&pmd->port_cache);
2858
2859 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
2860 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
2861 hmap_insert(&pmd->port_cache, &tx_port_cached->node,
324c8374 2862 hash_port_no(tx_port_cached->port->port_no));
d0cca6c3
DDP
2863 }
2864}
2865
e4cfed38 2866static int
d0cca6c3
DDP
2867pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
2868 struct rxq_poll **ppoll_list)
e4cfed38 2869{
f7791740 2870 struct rxq_poll *poll_list = *ppoll_list;
ae7ad0a1
IM
2871 struct rxq_poll *poll;
2872 int i;
e4cfed38 2873
d0cca6c3 2874 ovs_mutex_lock(&pmd->port_mutex);
ae7ad0a1 2875 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
a1fdee13 2876
ae7ad0a1
IM
2877 i = 0;
2878 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
ae7ad0a1 2879 poll_list[i++] = *poll;
e4cfed38 2880 }
d0cca6c3
DDP
2881
2882 pmd_load_cached_ports(pmd);
2883
2884 ovs_mutex_unlock(&pmd->port_mutex);
e4cfed38 2885
e4cfed38 2886 *ppoll_list = poll_list;
d42f9307 2887 return i;
e4cfed38
PS
2888}
2889
6c3eee82 2890static void *
e4cfed38 2891pmd_thread_main(void *f_)
6c3eee82 2892{
65f13b50 2893 struct dp_netdev_pmd_thread *pmd = f_;
e4cfed38 2894 unsigned int lc = 0;
f7791740 2895 struct rxq_poll *poll_list;
84067a4c 2896 unsigned int port_seq = PMD_INITIAL_SEQ;
d42f9307 2897 bool exiting;
e4cfed38
PS
2898 int poll_cnt;
2899 int i;
6c3eee82 2900
e4cfed38
PS
2901 poll_list = NULL;
2902
65f13b50
AW
2903 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
2904 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6930c7e0
DDP
2905 ovs_numa_thread_setaffinity_core(pmd->core_id);
2906 dpdk_set_lcore_id(pmd->core_id);
d0cca6c3 2907 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
e4cfed38 2908reload:
65f13b50 2909 emc_cache_init(&pmd->flow_cache);
ae7ad0a1 2910
7dd671f0
MK
2911 /* List port/core affinity */
2912 for (i = 0; i < poll_cnt; i++) {
ce179f11
IM
2913 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
2914 pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
2915 netdev_rxq_get_queue_id(poll_list[i].rx));
7dd671f0
MK
2916 }
2917
e4cfed38 2918 for (;;) {
e4cfed38 2919 for (i = 0; i < poll_cnt; i++) {
65f13b50 2920 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
e4cfed38
PS
2921 }
2922
2923 if (lc++ > 1024) {
84067a4c 2924 unsigned int seq;
6c3eee82 2925
e4cfed38 2926 lc = 0;
84067a4c 2927
fbe0962b 2928 coverage_try_clear();
9dede5cf
FL
2929 if (!ovsrcu_try_quiesce()) {
2930 emc_cache_slow_sweep(&pmd->flow_cache);
2931 }
84067a4c 2932
65f13b50 2933 atomic_read_relaxed(&pmd->change_seq, &seq);
84067a4c
JR
2934 if (seq != port_seq) {
2935 port_seq = seq;
6c3eee82
BP
2936 break;
2937 }
2938 }
e4cfed38 2939 }
6c3eee82 2940
d0cca6c3 2941 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
d42f9307
DDP
2942 exiting = latch_is_set(&pmd->exit_latch);
2943 /* Signal here to make sure the pmd finishes
2944 * reloading the updated configuration. */
2945 dp_netdev_pmd_reload_done(pmd);
2946
65f13b50 2947 emc_cache_uninit(&pmd->flow_cache);
9bbf1c3d 2948
d42f9307 2949 if (!exiting) {
e4cfed38
PS
2950 goto reload;
2951 }
6c3eee82 2952
e4cfed38 2953 free(poll_list);
d0cca6c3 2954 pmd_free_cached_ports(pmd);
6c3eee82
BP
2955 return NULL;
2956}
2957
6b31e073
RW
2958static void
2959dp_netdev_disable_upcall(struct dp_netdev *dp)
2960 OVS_ACQUIRES(dp->upcall_rwlock)
2961{
2962 fat_rwlock_wrlock(&dp->upcall_rwlock);
2963}
2964
2965static void
2966dpif_netdev_disable_upcall(struct dpif *dpif)
2967 OVS_NO_THREAD_SAFETY_ANALYSIS
2968{
2969 struct dp_netdev *dp = get_dp_netdev(dpif);
2970 dp_netdev_disable_upcall(dp);
2971}
2972
2973static void
2974dp_netdev_enable_upcall(struct dp_netdev *dp)
2975 OVS_RELEASES(dp->upcall_rwlock)
2976{
2977 fat_rwlock_unlock(&dp->upcall_rwlock);
2978}
2979
2980static void
2981dpif_netdev_enable_upcall(struct dpif *dpif)
2982 OVS_NO_THREAD_SAFETY_ANALYSIS
2983{
2984 struct dp_netdev *dp = get_dp_netdev(dpif);
2985 dp_netdev_enable_upcall(dp);
2986}
2987
ae7ad0a1 2988static void
accf8626
AW
2989dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
2990{
2991 ovs_mutex_lock(&pmd->cond_mutex);
2992 xpthread_cond_signal(&pmd->cond);
2993 ovs_mutex_unlock(&pmd->cond_mutex);
2994}
2995
1c1e46ed
AW
2996/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
2997 * the pointer if it succeeds, otherwise NULL.
2998 *
2999 * The caller must unref the returned reference. */
65f13b50 3000static struct dp_netdev_pmd_thread *
bd5131ba 3001dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
3002{
3003 struct dp_netdev_pmd_thread *pmd;
55847abe 3004 const struct cmap_node *pnode;
65f13b50 3005
b19befae 3006 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
3007 if (!pnode) {
3008 return NULL;
3009 }
65f13b50
AW
3010 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
3011
1c1e46ed 3012 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
3013}
3014
f2eee189
AW
3015/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
3016static void
3017dp_netdev_set_nonpmd(struct dp_netdev *dp)
e9985d6a 3018 OVS_REQUIRES(dp->port_mutex)
f2eee189
AW
3019{
3020 struct dp_netdev_pmd_thread *non_pmd;
d0cca6c3 3021 struct dp_netdev_port *port;
f2eee189
AW
3022
3023 non_pmd = xzalloc(sizeof *non_pmd);
00873463 3024 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
d0cca6c3 3025
e9985d6a 3026 HMAP_FOR_EACH (port, node, &dp->ports) {
d0cca6c3
DDP
3027 dp_netdev_add_port_tx_to_pmd(non_pmd, port);
3028 }
3029
3030 dp_netdev_reload_pmd__(non_pmd);
f2eee189
AW
3031}
3032
1c1e46ed
AW
3033/* Caller must have valid pointer to 'pmd'. */
3034static bool
3035dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
3036{
3037 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
3038}
3039
3040static void
3041dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
3042{
3043 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
3044 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
3045 }
3046}
3047
3048/* Given cmap position 'pos', tries to ref the next node. If try_ref()
3049 * fails, keeps checking for next node until reaching the end of cmap.
3050 *
3051 * The caller must unref the returned reference. */
3052static struct dp_netdev_pmd_thread *
3053dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
3054{
3055 struct dp_netdev_pmd_thread *next;
3056
3057 do {
3058 struct cmap_node *node;
3059
3060 node = cmap_next_position(&dp->poll_threads, pos);
3061 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
3062 : NULL;
3063 } while (next && !dp_netdev_pmd_try_ref(next));
3064
3065 return next;
3066}
3067
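/* A minimal iteration sketch (hypothetical helper, not part of the original
 * file): dp_netdev_pmd_get_next() hands back a referenced pmd thread, so a
 * caller walking the whole cmap must unref each one when done with it, as
 * the flow dump code above does. */
static void
pmd_visit_all_example(struct dp_netdev *dp)
{
    struct cmap_position pos;
    struct dp_netdev_pmd_thread *pmd;

    memset(&pos, 0, sizeof pos);
    while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
        /* ... examine 'pmd' here ... */
        dp_netdev_pmd_unref(pmd);
    }
}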
65f13b50 3068/* Configures the 'pmd' based on the input argument. */
6c3eee82 3069static void
65f13b50 3070dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
00873463 3071 unsigned core_id, int numa_id)
65f13b50
AW
3072{
3073 pmd->dp = dp;
65f13b50
AW
3074 pmd->core_id = core_id;
3075 pmd->numa_id = numa_id;
ae7ad0a1 3076 pmd->poll_cnt = 0;
1c1e46ed 3077
324c8374 3078 atomic_init(&pmd->static_tx_qid,
347ba9bb
IM
3079 (core_id == NON_PMD_CORE_ID)
3080 ? ovs_numa_get_n_cores()
3081 : get_n_pmd_threads(dp));
3082
1c1e46ed 3083 ovs_refcount_init(&pmd->ref_cnt);
65f13b50
AW
3084 latch_init(&pmd->exit_latch);
3085 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
accf8626
AW
3086 xpthread_cond_init(&pmd->cond, NULL);
3087 ovs_mutex_init(&pmd->cond_mutex);
1c1e46ed 3088 ovs_mutex_init(&pmd->flow_mutex);
d0cca6c3 3089 ovs_mutex_init(&pmd->port_mutex);
1c1e46ed
AW
3090 dpcls_init(&pmd->cls);
3091 cmap_init(&pmd->flow_table);
417e7e66 3092 ovs_list_init(&pmd->poll_list);
d0cca6c3
DDP
3093 hmap_init(&pmd->tx_ports);
3094 hmap_init(&pmd->port_cache);
65f13b50
AW
3095 /* init the 'flow_cache' since there is no
3096 * actual thread created for NON_PMD_CORE_ID. */
3097 if (core_id == NON_PMD_CORE_ID) {
3098 emc_cache_init(&pmd->flow_cache);
3099 }
3100 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
3101 hash_int(core_id, 0));
3102}
3103
1c1e46ed
AW
3104static void
3105dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
3106{
3107 dp_netdev_pmd_flow_flush(pmd);
3108 dpcls_destroy(&pmd->cls);
d0cca6c3
DDP
3109 hmap_destroy(&pmd->port_cache);
3110 hmap_destroy(&pmd->tx_ports);
1c1e46ed
AW
3111 cmap_destroy(&pmd->flow_table);
3112 ovs_mutex_destroy(&pmd->flow_mutex);
3113 latch_destroy(&pmd->exit_latch);
3114 xpthread_cond_destroy(&pmd->cond);
3115 ovs_mutex_destroy(&pmd->cond_mutex);
d0cca6c3 3116 ovs_mutex_destroy(&pmd->port_mutex);
1c1e46ed
AW
3117 free(pmd);
3118}
3119
3120/* Stops the pmd thread, removes it from the 'dp->poll_threads',
3121 * and unrefs the struct. */
65f13b50 3122static void
e4e74c3a 3123dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6c3eee82 3124{
d0cca6c3
DDP
3125 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
3126 * but extra cleanup is necessary */
65f13b50
AW
3127 if (pmd->core_id == NON_PMD_CORE_ID) {
3128 emc_cache_uninit(&pmd->flow_cache);
d0cca6c3 3129 pmd_free_cached_ports(pmd);
65f13b50
AW
3130 } else {
3131 latch_set(&pmd->exit_latch);
3132 dp_netdev_reload_pmd__(pmd);
3133 ovs_numa_unpin_core(pmd->core_id);
3134 xpthread_join(pmd->thread, NULL);
3135 }
ae7ad0a1 3136
d0cca6c3 3137 dp_netdev_pmd_clear_ports(pmd);
ae7ad0a1 3138
e4e74c3a
AW
3139 /* Purges the 'pmd''s flows after stopping the thread, but before
3140 * destroying the flows, so that the flow stats can be collected. */
3141 if (dp->dp_purge_cb) {
3142 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
3143 }
65f13b50 3144 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 3145 dp_netdev_pmd_unref(pmd);
65f13b50 3146}
6c3eee82 3147
65f13b50
AW
3148/* Destroys all pmd threads. */
3149static void
3150dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
3151{
3152 struct dp_netdev_pmd_thread *pmd;
d916785c
DDP
3153 struct dp_netdev_pmd_thread **pmd_list;
3154 size_t k = 0, n_pmds;
3155
3156 n_pmds = cmap_count(&dp->poll_threads);
3157 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
65f13b50
AW
3158
3159 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d916785c
DDP
3160 /* We cannot call dp_netdev_del_pmd(), since it alters
3161 * 'dp->poll_threads' (while we're iterating it) and it
3162 * might quiesce. */
3163 ovs_assert(k < n_pmds);
3164 pmd_list[k++] = pmd;
6c3eee82 3165 }
d916785c
DDP
3166
3167 for (size_t i = 0; i < k; i++) {
3168 dp_netdev_del_pmd(dp, pmd_list[i]);
3169 }
3170 free(pmd_list);
65f13b50 3171}
6c3eee82 3172
347ba9bb 3173/* Deletes all pmd threads on numa node 'numa_id' and
324c8374 3174 * fixes static_tx_qids of other threads to keep them sequential. */
65f13b50
AW
3175static void
3176dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3177{
3178 struct dp_netdev_pmd_thread *pmd;
347ba9bb
IM
3179 int n_pmds_on_numa, n_pmds;
3180 int *free_idx, k = 0;
d916785c 3181 struct dp_netdev_pmd_thread **pmd_list;
347ba9bb
IM
3182
3183 n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
d916785c
DDP
3184 free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
3185 pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);
6c3eee82 3186
65f13b50 3187 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d916785c
DDP
3188 /* We cannot call dp_netdev_del_pmd(), since it alters
3189 * 'dp->poll_threads' (while we're iterating it) and it
3190 * might quiesce. */
65f13b50 3191 if (pmd->numa_id == numa_id) {
324c8374 3192 atomic_read_relaxed(&pmd->static_tx_qid, &free_idx[k]);
d916785c
DDP
3193 pmd_list[k] = pmd;
3194 ovs_assert(k < n_pmds_on_numa);
347ba9bb 3195 k++;
65f13b50 3196 }
6c3eee82 3197 }
347ba9bb 3198
d916785c
DDP
3199 for (int i = 0; i < k; i++) {
3200 dp_netdev_del_pmd(dp, pmd_list[i]);
3201 }
3202
347ba9bb
IM
3203 n_pmds = get_n_pmd_threads(dp);
3204 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3205 int old_tx_qid;
3206
324c8374 3207 atomic_read_relaxed(&pmd->static_tx_qid, &old_tx_qid);
347ba9bb
IM
3208
3209 if (old_tx_qid >= n_pmds) {
3210 int new_tx_qid = free_idx[--k];
3211
324c8374 3212 atomic_store_relaxed(&pmd->static_tx_qid, new_tx_qid);
347ba9bb
IM
3213 }
3214 }
3215
d916785c 3216 free(pmd_list);
347ba9bb 3217 free(free_idx);
65f13b50 3218}
6c3eee82 3219
d0cca6c3
DDP
3220/* Deletes all rx queues from pmd->poll_list and all the ports from
3221 * pmd->tx_ports. */
cc245ce8 3222static void
d0cca6c3 3223dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
3224{
3225 struct rxq_poll *poll;
d0cca6c3 3226 struct tx_port *port;
cc245ce8 3227
d0cca6c3 3228 ovs_mutex_lock(&pmd->port_mutex);
cc245ce8 3229 LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
cc245ce8
IM
3230 free(poll);
3231 }
3232 pmd->poll_cnt = 0;
d0cca6c3
DDP
3233 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
3234 free(port);
3235 }
3236 ovs_mutex_unlock(&pmd->port_mutex);
cc245ce8
IM
3237}
3238
d0cca6c3
DDP
3239static struct tx_port *
3240tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3241{
3242 struct tx_port *tx;
3243
3244 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
324c8374 3245 if (tx->port->port_no == port_no) {
d0cca6c3
DDP
3246 return tx;
3247 }
3248 }
3249
3250 return NULL;
3251}
3252
3253/* Deletes all rx queues of 'port' from 'poll_list', and the 'port' from
3254 * 'tx_ports' of 'pmd' thread. Returns true if 'port' was found in 'pmd'
3255 * (therefore a restart is required). */
b68872d8
DDP
3256static bool
3257dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
3258 struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
3259{
3260 struct rxq_poll *poll, *next;
d0cca6c3 3261 struct tx_port *tx;
cc245ce8
IM
3262 bool found = false;
3263
d0cca6c3 3264 ovs_mutex_lock(&pmd->port_mutex);
cc245ce8
IM
3265 LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
3266 if (poll->port == port) {
3267 found = true;
417e7e66 3268 ovs_list_remove(&poll->node);
cc245ce8
IM
3269 pmd->poll_cnt--;
3270 free(poll);
3271 }
3272 }
d0cca6c3
DDP
3273
3274 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
3275 if (tx) {
3276 hmap_remove(&pmd->tx_ports, &tx->node);
3277 free(tx);
3278 found = true;
3279 }
3280 ovs_mutex_unlock(&pmd->port_mutex);
b68872d8
DDP
3281
3282 return found;
3283}
3284
d0cca6c3
DDP
3285/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3286 * threads. The pmd threads that need to be restarted are inserted in
3287 * 'to_reload'. */
b68872d8
DDP
3288static void
3289dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
3290 struct dp_netdev_port *port,
3291 struct hmapx *to_reload)
3292{
b68872d8
DDP
3293 struct dp_netdev_pmd_thread *pmd;
3294
3295 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d0cca6c3 3296 bool found;
b68872d8 3297
d0cca6c3 3298 found = dp_netdev_del_port_from_pmd__(port, pmd);
b68872d8 3299
d0cca6c3
DDP
3300 if (found) {
3301 hmapx_add(to_reload, pmd);
3302 }
cc245ce8
IM
3303 }
3304}
3305
d0cca6c3
DDP
3306/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3307 * threads. Reloads the threads if needed. */
cc245ce8
IM
3308static void
3309dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
3310 struct dp_netdev_port *port)
3311{
cc245ce8 3312 struct dp_netdev_pmd_thread *pmd;
b68872d8
DDP
3313 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3314 struct hmapx_node *node;
cc245ce8 3315
b68872d8
DDP
3316 dp_netdev_del_port_from_all_pmds__(dp, port, &to_reload);
3317
3318 HMAPX_FOR_EACH (node, &to_reload) {
3319 pmd = (struct dp_netdev_pmd_thread *) node->data;
3320 dp_netdev_reload_pmd__(pmd);
cc245ce8 3321 }
b68872d8
DDP
3322
3323 hmapx_destroy(&to_reload);
cc245ce8
IM
3324}
3325
b68872d8 3326
ae7ad0a1
IM
3327/* Returns the PMD thread on this numa node with the fewest rx queues to
3328 * poll. Returns NULL if there are no PMD threads on this numa node.
3329 * Can be called safely only by the main thread. */
3330static struct dp_netdev_pmd_thread *
3331dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
3332{
3333 int min_cnt = -1;
3334 struct dp_netdev_pmd_thread *pmd, *res = NULL;
3335
3336 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3337 if (pmd->numa_id == numa_id
3338 && (min_cnt > pmd->poll_cnt || res == NULL)) {
3339 min_cnt = pmd->poll_cnt;
3340 res = pmd;
3341 }
3342 }
3343
3344 return res;
3345}
3346
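/* Illustrative sketch (hypothetical, for exposition): how
 * dp_netdev_less_loaded_pmd_on_numa() above is typically used when spreading
 * rx queues across a numa node.  With three pmd threads on numa node 0
 * polling 2, 1 and 3 queues respectively, it returns the thread with
 * poll_cnt == 1, so the next queue lands on it:
 *
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     pmd = dp_netdev_less_loaded_pmd_on_numa(dp, 0);
 *     if (pmd) {
 *         ovs_mutex_lock(&pmd->port_mutex);
 *         dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
 *         ovs_mutex_unlock(&pmd->port_mutex);
 *     }
 *
 * This mirrors the loop in dp_netdev_add_port_rx_to_pmds() further below. */
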
3347/* Adds rx queue to poll_list of PMD thread. */
3348static void
3349dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
3350 struct dp_netdev_port *port, struct netdev_rxq *rx)
d0cca6c3 3351 OVS_REQUIRES(pmd->port_mutex)
ae7ad0a1
IM
3352{
3353 struct rxq_poll *poll = xmalloc(sizeof *poll);
3354
ae7ad0a1
IM
3355 poll->port = port;
3356 poll->rx = rx;
3357
417e7e66 3358 ovs_list_push_back(&pmd->poll_list, &poll->node);
ae7ad0a1
IM
3359 pmd->poll_cnt++;
3360}
3361
d0cca6c3
DDP
3362/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
3363 * changes to take effect. */
cc245ce8 3364static void
d0cca6c3
DDP
3365dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
3366 struct dp_netdev_port *port)
3367{
3368 struct tx_port *tx = xzalloc(sizeof *tx);
3369
324c8374
IM
3370 tx->port = port;
3371 tx->qid = -1;
d0cca6c3
DDP
3372
3373 ovs_mutex_lock(&pmd->port_mutex);
324c8374 3374 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
d0cca6c3
DDP
3375 ovs_mutex_unlock(&pmd->port_mutex);
3376}
3377
3378/* Distribute all rx queues of 'port' between PMD threads in 'dp'. The pmd
3379 * threads that need to be restarted are inserted in 'to_reload'. */
3380static void
3381dp_netdev_add_port_rx_to_pmds(struct dp_netdev *dp,
3382 struct dp_netdev_port *port,
3383 struct hmapx *to_reload)
cc245ce8
IM
3384{
3385 int numa_id = netdev_get_numa_id(port->netdev);
cc245ce8
IM
3386 int i;
3387
d0cca6c3
DDP
3388 if (!netdev_is_pmd(port->netdev)) {
3389 return;
3390 }
cc245ce8 3391
490e82af 3392 for (i = 0; i < port->n_rxq; i++) {
d0cca6c3
DDP
3393 struct dp_netdev_pmd_thread *pmd;
3394
cc245ce8
IM
3395 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
3396 if (!pmd) {
d0cca6c3 3397 VLOG_WARN("There's no pmd thread on numa node %d", numa_id);
cc245ce8
IM
3398 break;
3399 }
3400
d0cca6c3 3401 ovs_mutex_lock(&pmd->port_mutex);
cc245ce8 3402 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
d0cca6c3 3403 ovs_mutex_unlock(&pmd->port_mutex);
cc245ce8 3404
b68872d8 3405 hmapx_add(to_reload, pmd);
cc245ce8 3406 }
b68872d8
DDP
3407}
3408
3409/* Distributes all rx queues of 'port' between all PMD threads in 'dp' and
d0cca6c3
DDP
3410 * inserts 'port' in the PMD threads' 'tx_ports'. The pmd threads that need to
3411 * be restarted are inserted in 'to_reload'. */
3412static void
3413dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
3414 struct hmapx *to_reload)
3415{
3416 struct dp_netdev_pmd_thread *pmd;
3417
3418 dp_netdev_add_port_rx_to_pmds(dp, port, to_reload);
3419
3420 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3421 dp_netdev_add_port_tx_to_pmd(pmd, port);
3422 hmapx_add(to_reload, pmd);
3423 }
3424}
3425
3426/* Distributes all rx queues of 'port' between all PMD threads in 'dp', inserts
3427 * 'port' in the PMD threads' 'tx_ports' and reloads them, if needed. */
b68872d8
DDP
3428static void
3429dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
3430{
3431 struct dp_netdev_pmd_thread *pmd;
3432 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3433 struct hmapx_node *node;
3434
3435 dp_netdev_add_port_to_pmds__(dp, port, &to_reload);
cc245ce8
IM
3436
3437 HMAPX_FOR_EACH (node, &to_reload) {
3438 pmd = (struct dp_netdev_pmd_thread *) node->data;
3439 dp_netdev_reload_pmd__(pmd);
3440 }
3441
3442 hmapx_destroy(&to_reload);
3443}
3444
d0cca6c3
DDP
3445/* Starts pmd threads for the numa node 'numa_id', if not already started.
3446 * The function takes care of filling the threads' tx port cache. */
65f13b50
AW
3447static void
3448dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
e9985d6a 3449 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
3450{
3451 int n_pmds;
e4cfed38 3452
65f13b50 3453 if (!ovs_numa_numa_id_is_valid(numa_id)) {
d0cca6c3
DDP
3454        VLOG_WARN("Cannot create pmd threads due to invalid numa id (%d)",
3455 numa_id);
3456 return;
65f13b50
AW
3457 }
3458
3459 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
3460
3461    /* If pmd threads have already been created for this numa node,
3462     * do nothing.  Otherwise, create the pmd threads for the
3463     * numa node. */
3464 if (!n_pmds) {
d0cca6c3 3465 int can_have, n_unpinned, i;
65f13b50
AW
3466
3467 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
3468 if (!n_unpinned) {
d0cca6c3
DDP
3469            VLOG_WARN("Cannot create pmd threads due to lack of unpinned "
3470 "cores on numa node %d", numa_id);
65f13b50
AW
3471 return;
3472 }
6c3eee82 3473
f2eee189
AW
3474 /* If cpu mask is specified, uses all unpinned cores, otherwise
3475 * tries creating NR_PMD_THREADS pmd threads. */
3476 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
65f13b50 3477 for (i = 0; i < can_have; i++) {
bd5131ba 3478 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
d0cca6c3
DDP
3479 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
3480 struct dp_netdev_port *port;
ae7ad0a1 3481
d0cca6c3
DDP
3482 dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
3483
e9985d6a 3484 HMAP_FOR_EACH (port, node, &dp->ports) {
d0cca6c3 3485 dp_netdev_add_port_tx_to_pmd(pmd, port);
ae7ad0a1 3486 }
ae7ad0a1 3487
d0cca6c3 3488 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
65f13b50
AW
3489 }
3490 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
6c3eee82
BP
3491 }
3492}
e4cfed38 3493
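/* Illustrative numbers (hypothetical, for exposition): with a pmd-cpu-mask
 * configured and, say, four unpinned cores on numa node 1,
 * dp_netdev_set_pmds_on_numa() above creates four pmd threads, one per
 * unpinned core; without a cpu mask it creates MIN(4, NR_PMD_THREADS)
 * threads.  Each new thread starts with every existing port already in its tx
 * port cache, so only its rx queue assignment remains to be done by the
 * caller (see dp_netdev_reset_pmd_threads() below). */
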
6c3eee82 3494\f
f2eee189
AW
3495/* Called after the pmd threads' configuration has changed. Restarts the pmd
3496 * threads with the new configuration. */
3497static void
3498dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
e9985d6a 3499 OVS_REQUIRES(dp->port_mutex)
f2eee189 3500{
d0cca6c3
DDP
3501 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3502 struct dp_netdev_pmd_thread *pmd;
f2eee189 3503 struct dp_netdev_port *port;
d0cca6c3 3504 struct hmapx_node *node;
f2eee189 3505
e9985d6a 3506 HMAP_FOR_EACH (port, node, &dp->ports) {
f2eee189
AW
3507 if (netdev_is_pmd(port->netdev)) {
3508 int numa_id = netdev_get_numa_id(port->netdev);
3509
3510 dp_netdev_set_pmds_on_numa(dp, numa_id);
3511 }
d0cca6c3
DDP
3512 dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload);
3513 }
3514
3515 HMAPX_FOR_EACH (node, &to_reload) {
3516 pmd = (struct dp_netdev_pmd_thread *) node->data;
3517 dp_netdev_reload_pmd__(pmd);
f2eee189 3518 }
d0cca6c3
DDP
3519
3520 hmapx_destroy(&to_reload);
f2eee189
AW
3521}
3522
b5cbbcf6
AZ
3523static char *
3524dpif_netdev_get_datapath_version(void)
3525{
3526 return xstrdup("<built-in>");
3527}
3528
72865317 3529static void
1c1e46ed 3530dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 3531 uint16_t tcp_flags, long long now)
72865317 3532{
eb94da30 3533 uint16_t flags;
72865317 3534
eb94da30
DDP
3535 atomic_store_relaxed(&netdev_flow->stats.used, now);
3536 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
3537 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
3538 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3539 flags |= tcp_flags;
3540 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
3541}
3542
3543static void
1c1e46ed
AW
3544dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
3545 enum dp_stat_type type, int cnt)
51852a57 3546{
eb94da30 3547 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
51852a57
BP
3548}
3549
623540e4 3550static int
e14deea0 3551dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 3552 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
3553 enum dpif_upcall_type type, const struct nlattr *userdata,
3554 struct ofpbuf *actions, struct ofpbuf *put_actions)
3555{
1c1e46ed 3556 struct dp_netdev *dp = pmd->dp;
6728d578
JG
3557 struct flow_tnl orig_tunnel;
3558 int err;
623540e4 3559
623540e4
EJ
3560 if (OVS_UNLIKELY(!dp->upcall_cb)) {
3561 return ENODEV;
3562 }
3563
6728d578
JG
3564 /* Upcall processing expects the Geneve options to be in the translated
3565 * format but we need to retain the raw format for datapath use. */
3566 orig_tunnel.flags = flow->tunnel.flags;
3567 if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
3568 orig_tunnel.metadata.present.len = flow->tunnel.metadata.present.len;
3569 memcpy(orig_tunnel.metadata.opts.gnv, flow->tunnel.metadata.opts.gnv,
3570 flow->tunnel.metadata.present.len);
3571 err = tun_metadata_from_geneve_udpif(&orig_tunnel, &orig_tunnel,
3572 &flow->tunnel);
3573 if (err) {
3574 return err;
3575 }
3576 }
3577
623540e4
EJ
3578 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
3579 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 3580 char *packet_str;
cf62fa4c 3581 struct ofpbuf key;
5262eea1
JG
3582 struct odp_flow_key_parms odp_parms = {
3583 .flow = flow,
3584 .mask = &wc->masks,
2494ccd7 3585 .support = dp_netdev_support,
5262eea1 3586 };
623540e4
EJ
3587
3588 ofpbuf_init(&key, 0);
5262eea1 3589 odp_flow_key_from_flow(&odp_parms, &key);
cf62fa4c
PS
3590 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
3591 dp_packet_size(packet_));
623540e4 3592
6fd6ed71 3593 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
3594
3595 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
3596 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
3597
3598 ofpbuf_uninit(&key);
3599 free(packet_str);
6fd6ed71 3600
623540e4
EJ
3601 ds_destroy(&ds);
3602 }
3603
6728d578
JG
3604 err = dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
3605 actions, wc, put_actions, dp->upcall_aux);
3606 if (err && err != ENOSPC) {
3607 return err;
3608 }
3609
3610 /* Translate tunnel metadata masks to datapath format. */
3611 if (wc) {
3612 if (wc->masks.tunnel.metadata.present.map) {
4e548ad9 3613 struct geneve_opt opts[TLV_TOT_OPT_SIZE /
6728d578
JG
3614 sizeof(struct geneve_opt)];
3615
3f32cfeb
JG
3616 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3617 tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
3618 &wc->masks.tunnel,
3619 orig_tunnel.metadata.opts.gnv,
3620 orig_tunnel.metadata.present.len,
3621 opts);
3622 } else {
3623 orig_tunnel.metadata.present.len = 0;
3624 }
6728d578
JG
3625
3626 memset(&wc->masks.tunnel.metadata, 0,
3627 sizeof wc->masks.tunnel.metadata);
3628 memcpy(&wc->masks.tunnel.metadata.opts.gnv, opts,
3629 orig_tunnel.metadata.present.len);
3630 }
3631 wc->masks.tunnel.metadata.present.len = 0xff;
3632 }
3633
3634 /* Restore tunnel metadata. We need to use the saved options to ensure
3635 * that any unknown options are not lost. The generated mask will have
3636 * the same structure, matching on types and lengths but wildcarding
3637 * option data we don't care about. */
3638 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3639 memcpy(&flow->tunnel.metadata.opts.gnv, orig_tunnel.metadata.opts.gnv,
3640 orig_tunnel.metadata.present.len);
3641 flow->tunnel.metadata.present.len = orig_tunnel.metadata.present.len;
3642 flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
3643 }
3644
3645 return err;
623540e4
EJ
3646}
3647
9bbf1c3d 3648static inline uint32_t
048963aa
DDP
3649dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
3650 const struct miniflow *mf)
9bbf1c3d 3651{
048963aa 3652 uint32_t hash, recirc_depth;
9bbf1c3d 3653
f2f44f5d
DDP
3654 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
3655 hash = dp_packet_get_rss_hash(packet);
3656 } else {
9bbf1c3d 3657 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 3658 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 3659 }
048963aa
DDP
3660
3661 /* The RSS hash must account for the recirculation depth to avoid
3662     * collisions in the exact match cache. */
3663 recirc_depth = *recirc_depth_get_unsafe();
3664 if (OVS_UNLIKELY(recirc_depth)) {
3665 hash = hash_finish(hash, recirc_depth);
3666 dp_packet_set_rss_hash(packet, hash);
3667 }
9bbf1c3d
DDP
3668 return hash;
3669}
3670
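/* Illustrative sketch (hypothetical, for exposition) of why the recirculation
 * depth is folded into the hash in dpif_netdev_packet_get_rss_hash() above.
 * A recirculated packet keeps the same 5-tuple, so without this step both
 * passes would select the same exact-match-cache entry even though they may
 * match different flows:
 *
 *     uint32_t h0 = dp_packet_get_rss_hash(packet);   /* first pass, depth 0 */
 *     uint32_t h1 = hash_finish(h0, 1);               /* after one recirculation */
 *
 * h0 and h1 differ (with overwhelming probability), so the two passes use
 * different EMC buckets. */
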
f7ce4811 3671struct packet_batch_per_flow {
8cbf4f47
DDP
3672 unsigned int byte_count;
3673 uint16_t tcp_flags;
8cbf4f47
DDP
3674 struct dp_netdev_flow *flow;
3675
1895cc8d 3676 struct dp_packet_batch array;
8cbf4f47
DDP
3677};
3678
3679static inline void
f7ce4811
PS
3680packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
3681 struct dp_packet *packet,
3682 const struct miniflow *mf)
8cbf4f47 3683{
cf62fa4c 3684 batch->byte_count += dp_packet_size(packet);
1895cc8d
PS
3685 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
3686 batch->array.packets[batch->array.count++] = packet;
8cbf4f47
DDP
3687}
3688
3689static inline void
f7ce4811
PS
3690packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
3691 struct dp_netdev_flow *flow)
8cbf4f47 3692{
11e5cf1f 3693 flow->batch = batch;
8cbf4f47 3694
11e5cf1f 3695 batch->flow = flow;
1895cc8d 3696 dp_packet_batch_init(&batch->array);
8cbf4f47
DDP
3697 batch->byte_count = 0;
3698 batch->tcp_flags = 0;
8cbf4f47
DDP
3699}
3700
3701static inline void
f7ce4811
PS
3702packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
3703 struct dp_netdev_pmd_thread *pmd,
3704 long long now)
8cbf4f47
DDP
3705{
3706 struct dp_netdev_actions *actions;
3707 struct dp_netdev_flow *flow = batch->flow;
3708
1895cc8d 3709 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
11bfdadd 3710 batch->tcp_flags, now);
8cbf4f47
DDP
3711
3712 actions = dp_netdev_flow_get_actions(flow);
3713
1895cc8d 3714 dp_netdev_execute_actions(pmd, &batch->array, true,
324c8374 3715 actions->actions, actions->size, now);
8cbf4f47
DDP
3716}
3717
8aaa125d 3718static inline void
e14deea0 3719dp_netdev_queue_batches(struct dp_packet *pkt,
9bbf1c3d 3720 struct dp_netdev_flow *flow, const struct miniflow *mf,
f7ce4811 3721 struct packet_batch_per_flow *batches, size_t *n_batches)
9bbf1c3d 3722{
f7ce4811 3723 struct packet_batch_per_flow *batch = flow->batch;
11e5cf1f 3724
f9fe365b
AZ
3725 if (OVS_UNLIKELY(!batch)) {
3726 batch = &batches[(*n_batches)++];
f7ce4811 3727 packet_batch_per_flow_init(batch, flow);
9bbf1c3d
DDP
3728 }
3729
f7ce4811 3730 packet_batch_per_flow_update(batch, pkt, mf);
9bbf1c3d
DDP
3731}
3732
9bbf1c3d 3733/* Try to process all of the 'cnt' packets in 'packets' using only the exact match cache
a90ed026 3734 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8aaa125d
DDP
3735 * miniflow is copied into 'keys' and the packet pointer is moved to the
3736 * beginning of the 'packets' array.
9bbf1c3d
DDP
3737 *
3738 * The function returns the number of packets that need to be processed in the
3739 * 'packets' array (they have been moved to the beginning of the vector).
a90ed026
DDP
3740 *
3741 * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be
3742 * initialized by this function using 'port_no'.
9bbf1c3d
DDP
3743 */
3744static inline size_t
1895cc8d
PS
3745emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
3746 struct netdev_flow_key *keys,
f7ce4811 3747 struct packet_batch_per_flow batches[], size_t *n_batches,
a90ed026 3748 bool md_is_valid, odp_port_t port_no)
72865317 3749{
65f13b50 3750 struct emc_cache *flow_cache = &pmd->flow_cache;
b89c678b 3751 struct netdev_flow_key *key = &keys[0];
3d88a620 3752 size_t i, n_missed = 0, n_dropped = 0;
1895cc8d
PS
3753 struct dp_packet **packets = packets_->packets;
3754 int cnt = packets_->count;
8cbf4f47 3755
84d6d5eb 3756 for (i = 0; i < cnt; i++) {
9bbf1c3d 3757 struct dp_netdev_flow *flow;
5a2fed48 3758 struct dp_packet *packet = packets[i];
9bbf1c3d 3759
5a2fed48
AZ
3760 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
3761 dp_packet_delete(packet);
3d88a620 3762 n_dropped++;
84d6d5eb
EJ
3763 continue;
3764 }
8cbf4f47 3765
72a5e2b8 3766 if (i != cnt - 1) {
a90ed026 3767 /* Prefetch next packet data and metadata. */
72a5e2b8 3768 OVS_PREFETCH(dp_packet_data(packets[i+1]));
a90ed026 3769 pkt_metadata_prefetch_init(&packets[i+1]->md);
72a5e2b8
DDP
3770 }
3771
a90ed026
DDP
3772 if (!md_is_valid) {
3773 pkt_metadata_init(&packet->md, port_no);
3774 }
5a2fed48 3775 miniflow_extract(packet, &key->mf);
d262ac2c 3776 key->len = 0; /* Not computed yet. */
5a2fed48 3777 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
9bbf1c3d 3778
d262ac2c 3779 flow = emc_lookup(flow_cache, key);
8aaa125d 3780 if (OVS_LIKELY(flow)) {
5a2fed48 3781 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
8aaa125d
DDP
3782 n_batches);
3783 } else {
d1aa0b94
AZ
3784 /* Exact match cache missed. Group missed packets together at
3785 * the beginning of the 'packets' array. */
b89c678b 3786 packets[n_missed] = packet;
400486f7
DDP
3787 /* 'key[n_missed]' contains the key of the current packet and it
3788 * must be returned to the caller. The next key should be extracted
3789 * to 'keys[n_missed + 1]'. */
3790 key = &keys[++n_missed];
9bbf1c3d
DDP
3791 }
3792 }
3793
3d88a620 3794 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);
4f150744 3795
3d88a620 3796 return n_missed;
9bbf1c3d
DDP
3797}
3798
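/* Worked example (hypothetical, for exposition) of the shuffling done by
 * emc_processing() above.  Take a burst [p0, p1, p2, p3] where p0 and p2 hit
 * the exact match cache and p1 and p3 miss.  On return:
 *
 *     packets_->packets == [p1, p3, p2, p3]  (only the first two slots matter)
 *     keys[0], keys[1]  == miniflows extracted from p1 and p3
 *     return value      == 2
 *
 * p0 and p2 have already been appended to their per-flow batches, so the
 * caller only needs to run fast_path_processing() on the first two slots. */
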
a260d966
PS
3799static inline void
3800handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
3801 const struct netdev_flow_key *key,
3802 struct ofpbuf *actions, struct ofpbuf *put_actions,
324c8374 3803 int *lost_cnt, long long now)
a260d966
PS
3804{
3805 struct ofpbuf *add_actions;
3806 struct dp_packet_batch b;
3807 struct match match;
3808 ovs_u128 ufid;
3809 int error;
3810
3811 match.tun_md.valid = false;
3812 miniflow_expand(&key->mf, &match.flow);
3813
3814 ofpbuf_clear(actions);
3815 ofpbuf_clear(put_actions);
3816
3817 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
3818 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
3819 &ufid, DPIF_UC_MISS, NULL, actions,
3820 put_actions);
3821 if (OVS_UNLIKELY(error && error != ENOSPC)) {
3822 dp_packet_delete(packet);
3823 (*lost_cnt)++;
3824 return;
3825 }
3826
3827 /* The Netlink encoding of datapath flow keys cannot express
3828 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3829 * tag is interpreted as exact match on the fact that there is no
3830 * VLAN. Unless we refactor a lot of code that translates between
3831 * Netlink and struct flow representations, we have to do the same
3832 * here. */
3833 if (!match.wc.masks.vlan_tci) {
3834 match.wc.masks.vlan_tci = htons(0xffff);
3835 }
3836
3837 /* We can't allow the packet batching in the next loop to execute
3838 * the actions. Otherwise, if there are any slow path actions,
3839 * we'll send the packet up twice. */
3840 packet_batch_init_packet(&b, packet);
3841 dp_netdev_execute_actions(pmd, &b, true,
324c8374 3842 actions->data, actions->size, now);
a260d966
PS
3843
3844 add_actions = put_actions->size ? put_actions : actions;
3845 if (OVS_LIKELY(error != ENOSPC)) {
3846 struct dp_netdev_flow *netdev_flow;
3847
3848 /* XXX: There's a race window where a flow covering this packet
3849 * could have already been installed since we last did the flow
3850 * lookup before upcall. This could be solved by moving the
3851 * mutex lock outside the loop, but that's an awful long time
3852 * to be locking everyone out of making flow installs. If we
3853 * move to a per-core classifier, it would be reasonable. */
3854 ovs_mutex_lock(&pmd->flow_mutex);
3855 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key);
3856 if (OVS_LIKELY(!netdev_flow)) {
3857 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
3858 add_actions->data,
3859 add_actions->size);
3860 }
3861 ovs_mutex_unlock(&pmd->flow_mutex);
3862
3863 emc_insert(&pmd->flow_cache, key, netdev_flow);
3864 }
3865}
3866
9bbf1c3d 3867static inline void
65f13b50 3868fast_path_processing(struct dp_netdev_pmd_thread *pmd,
1895cc8d 3869 struct dp_packet_batch *packets_,
8aaa125d 3870 struct netdev_flow_key *keys,
324c8374
IM
3871 struct packet_batch_per_flow batches[], size_t *n_batches,
3872 long long now)
9bbf1c3d 3873{
1895cc8d 3874 int cnt = packets_->count;
1a0d5831 3875#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
3876 const size_t PKT_ARRAY_SIZE = cnt;
3877#else
1a0d5831 3878 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 3879 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 3880#endif
1895cc8d 3881 struct dp_packet **packets = packets_->packets;
0de8783a 3882 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50
AW
3883 struct dp_netdev *dp = pmd->dp;
3884 struct emc_cache *flow_cache = &pmd->flow_cache;
8aaa125d 3885 int miss_cnt = 0, lost_cnt = 0;
9bbf1c3d 3886 bool any_miss;
8aaa125d 3887 size_t i;
9bbf1c3d
DDP
3888
3889 for (i = 0; i < cnt; i++) {
0de8783a 3890 /* Key length is needed in all the cases, hash computed on demand. */
361d808d 3891 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
9bbf1c3d 3892 }
1c1e46ed 3893 any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
623540e4
EJ
3894 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
3895 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
3896 struct ofpbuf actions, put_actions;
623540e4
EJ
3897
3898 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
3899 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
3900
3901 for (i = 0; i < cnt; i++) {
0de8783a 3902 struct dp_netdev_flow *netdev_flow;
623540e4 3903
0de8783a 3904 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
3905 continue;
3906 }
3907
3908 /* It's possible that an earlier slow path execution installed
0de8783a 3909 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 3910 * to catch it here than execute a miss. */
1c1e46ed 3911 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
623540e4 3912 if (netdev_flow) {
0de8783a 3913 rules[i] = &netdev_flow->cr;
623540e4
EJ
3914 continue;
3915 }
3916
60fc3b7b 3917 miss_cnt++;
324c8374
IM
3918 handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
3919 &put_actions, &lost_cnt, now);
623540e4
EJ
3920 }
3921
3922 ofpbuf_uninit(&actions);
3923 ofpbuf_uninit(&put_actions);
3924 fat_rwlock_unlock(&dp->upcall_rwlock);
60fc3b7b 3925 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
ac8c2081 3926 } else if (OVS_UNLIKELY(any_miss)) {
ac8c2081 3927 for (i = 0; i < cnt; i++) {
0de8783a 3928 if (OVS_UNLIKELY(!rules[i])) {
e14deea0 3929 dp_packet_delete(packets[i]);
8aaa125d
DDP
3930 lost_cnt++;
3931 miss_cnt++;
ac8c2081
DDP
3932 }
3933 }
623540e4 3934 }
84d6d5eb 3935
8cbf4f47 3936 for (i = 0; i < cnt; i++) {
e14deea0 3937 struct dp_packet *packet = packets[i];
84d6d5eb 3938 struct dp_netdev_flow *flow;
8cbf4f47 3939
0de8783a 3940 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
3941 continue;
3942 }
3943
84d6d5eb 3944 flow = dp_netdev_flow_cast(rules[i]);
0de8783a 3945
0de8783a 3946 emc_insert(flow_cache, &keys[i], flow);
8aaa125d 3947 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
8cbf4f47
DDP
3948 }
3949
8aaa125d
DDP
3950 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
3951 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
3952 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
72865317
BP
3953}
3954
a90ed026
DDP
3955/* Packets enter the datapath from a port (or from recirculation) here.
3956 *
3957 * For performance reasons a caller may choose not to initialize the metadata
3958 * in 'packets': in this case 'md_is_valid' is false and this function needs to
3959 * initialize it using 'port_no'. If the metadata in 'packets' is already
3960 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
adcf00ba 3961static void
a90ed026 3962dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
1895cc8d 3963 struct dp_packet_batch *packets,
a90ed026 3964 bool md_is_valid, odp_port_t port_no)
9bbf1c3d 3965{
1895cc8d 3966 int cnt = packets->count;
1a0d5831 3967#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
3968 const size_t PKT_ARRAY_SIZE = cnt;
3969#else
1a0d5831 3970 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 3971 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d
DDP
3972#endif
3973 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
f7ce4811 3974 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
11bfdadd 3975 long long now = time_msec();
8aaa125d 3976 size_t newcnt, n_batches, i;
9bbf1c3d 3977
8aaa125d 3978 n_batches = 0;
1895cc8d 3979 newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
a90ed026 3980 md_is_valid, port_no);
9bbf1c3d 3981 if (OVS_UNLIKELY(newcnt)) {
1895cc8d 3982 packets->count = newcnt;
324c8374 3983 fast_path_processing(pmd, packets, keys, batches, &n_batches, now);
8aaa125d
DDP
3984 }
3985
603f2ce0
EJ
3986 for (i = 0; i < n_batches; i++) {
3987 batches[i].flow->batch = NULL;
3988 }
3989
8aaa125d 3990 for (i = 0; i < n_batches; i++) {
f7ce4811 3991 packet_batch_per_flow_execute(&batches[i], pmd, now);
9bbf1c3d
DDP
3992 }
3993}
3994
a90ed026
DDP
3995static void
3996dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
1895cc8d 3997 struct dp_packet_batch *packets,
a90ed026
DDP
3998 odp_port_t port_no)
3999{
1895cc8d 4000 dp_netdev_input__(pmd, packets, false, port_no);
a90ed026
DDP
4001}
4002
4003static void
4004dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4005 struct dp_packet_batch *packets)
a90ed026 4006{
1895cc8d 4007 dp_netdev_input__(pmd, packets, true, 0);
a90ed026
DDP
4008}
4009
9080a111 4010struct dp_netdev_execute_aux {
65f13b50 4011 struct dp_netdev_pmd_thread *pmd;
324c8374 4012 long long now;
9080a111
JR
4013};
4014
e4e74c3a
AW
4015static void
4016dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
4017 void *aux)
4018{
4019 struct dp_netdev *dp = get_dp_netdev(dpif);
4020 dp->dp_purge_aux = aux;
4021 dp->dp_purge_cb = cb;
4022}
4023
6b31e073 4024static void
623540e4
EJ
4025dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
4026 void *aux)
6b31e073
RW
4027{
4028 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 4029 dp->upcall_aux = aux;
6b31e073
RW
4030 dp->upcall_cb = cb;
4031}
4032
324c8374
IM
4033static void
4034dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
4035 long long now, bool purge)
4036{
4037 struct tx_port *tx;
4038 struct dp_netdev_port *port;
4039 long long interval;
4040
4041 HMAP_FOR_EACH (tx, node, &pmd->port_cache) {
4042 if (tx->port->dynamic_txqs) {
4043 continue;
4044 }
4045 interval = now - tx->last_used;
4046 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
4047 port = tx->port;
4048 ovs_mutex_lock(&port->txq_used_mutex);
4049 port->txq_used[tx->qid]--;
4050 ovs_mutex_unlock(&port->txq_used_mutex);
4051 tx->qid = -1;
4052 }
4053 }
4054}
4055
4056static int
4057dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
4058 struct tx_port *tx, long long now)
4059{
4060 struct dp_netdev_port *port;
4061 long long interval;
4062 int i, min_cnt, min_qid;
4063
4064 if (OVS_UNLIKELY(!now)) {
4065 now = time_msec();
4066 }
4067
4068 interval = now - tx->last_used;
4069 tx->last_used = now;
4070
4071 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
4072 return tx->qid;
4073 }
4074
4075 port = tx->port;
4076
4077 ovs_mutex_lock(&port->txq_used_mutex);
4078 if (tx->qid >= 0) {
4079 port->txq_used[tx->qid]--;
4080 tx->qid = -1;
4081 }
4082
4083 min_cnt = -1;
4084 min_qid = 0;
4085 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
4086 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
4087 min_cnt = port->txq_used[i];
4088 min_qid = i;
4089 }
4090 }
4091
4092 port->txq_used[min_qid]++;
4093 tx->qid = min_qid;
4094
4095 ovs_mutex_unlock(&port->txq_used_mutex);
4096
4097 dpif_netdev_xps_revalidate_pmd(pmd, now, false);
4098
4099 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
4100 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
4101 return min_qid;
4102}
4103
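/* Illustrative numbers (hypothetical, for exposition) for the dynamic tx
 * queue (XPS) selection above.  Suppose a port has two tx queues with
 * txq_used == {3, 1}.  A pmd thread whose cached qid has expired (or was
 * never assigned) picks qid 1, bumps txq_used to {3, 2} and caches the
 * choice; for the next XPS_TIMEOUT_MS it keeps returning qid 1 without
 * taking txq_used_mutex again. */
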
d0cca6c3
DDP
4104static struct tx_port *
4105pmd_tx_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
4106 odp_port_t port_no)
4107{
4108 return tx_port_lookup(&pmd->port_cache, port_no);
4109}
4110
a36de779 4111static int
d0cca6c3 4112push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
1895cc8d
PS
4113 const struct nlattr *attr,
4114 struct dp_packet_batch *batch)
a36de779 4115{
d0cca6c3 4116 struct tx_port *tun_port;
a36de779 4117 const struct ovs_action_push_tnl *data;
4c742796 4118 int err;
a36de779
PS
4119
4120 data = nl_attr_get(attr);
4121
d0cca6c3 4122 tun_port = pmd_tx_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
a36de779 4123 if (!tun_port) {
4c742796
PS
4124 err = -EINVAL;
4125 goto error;
a36de779 4126 }
324c8374 4127 err = netdev_push_header(tun_port->port->netdev, batch, data);
4c742796
PS
4128 if (!err) {
4129 return 0;
4130 }
4131error:
4132 dp_packet_delete_batch(batch, true);
4133 return err;
a36de779
PS
4134}
4135
66525ef3
PS
4136static void
4137dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
4138 struct dp_packet *packet, bool may_steal,
4139 struct flow *flow, ovs_u128 *ufid,
4140 struct ofpbuf *actions,
324c8374 4141 const struct nlattr *userdata, long long now)
66525ef3
PS
4142{
4143 struct dp_packet_batch b;
4144 int error;
4145
4146 ofpbuf_clear(actions);
4147
4148 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
4149 DPIF_UC_ACTION, userdata, actions,
4150 NULL);
4151 if (!error || error == ENOSPC) {
4152 packet_batch_init_packet(&b, packet);
4153 dp_netdev_execute_actions(pmd, &b, may_steal,
324c8374 4154 actions->data, actions->size, now);
66525ef3
PS
4155 } else if (may_steal) {
4156 dp_packet_delete(packet);
4157 }
4158}
4159
a36de779 4160static void
1895cc8d 4161dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
09f9da0b 4162 const struct nlattr *a, bool may_steal)
9080a111
JR
4163{
4164 struct dp_netdev_execute_aux *aux = aux_;
623540e4 4165 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
4166 struct dp_netdev_pmd_thread *pmd = aux->pmd;
4167 struct dp_netdev *dp = pmd->dp;
09f9da0b 4168 int type = nl_attr_type(a);
324c8374 4169 long long now = aux->now;
d0cca6c3 4170 struct tx_port *p;
9080a111 4171
09f9da0b
JR
4172 switch ((enum ovs_action_attr)type) {
4173 case OVS_ACTION_ATTR_OUTPUT:
d0cca6c3 4174 p = pmd_tx_port_cache_lookup(pmd, u32_to_odp(nl_attr_get_u32(a)));
26a5075b 4175 if (OVS_LIKELY(p)) {
347ba9bb 4176 int tx_qid;
324c8374 4177 bool dynamic_txqs;
347ba9bb 4178
324c8374
IM
4179 dynamic_txqs = p->port->dynamic_txqs;
4180 if (dynamic_txqs) {
4181 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
4182 } else {
4183 atomic_read_relaxed(&pmd->static_tx_qid, &tx_qid);
4184 }
347ba9bb 4185
324c8374
IM
4186 netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
4187 dynamic_txqs);
ac8c2081 4188 return;
8a4e3a85 4189 }
09f9da0b
JR
4190 break;
4191
a36de779
PS
4192 case OVS_ACTION_ATTR_TUNNEL_PUSH:
4193 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 4194 struct dp_packet_batch tnl_pkt;
aaca4fe0 4195 struct dp_packet_batch *orig_packets_ = packets_;
a36de779
PS
4196 int err;
4197
4198 if (!may_steal) {
1895cc8d
PS
4199 dp_packet_batch_clone(&tnl_pkt, packets_);
4200 packets_ = &tnl_pkt;
aaca4fe0 4201 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
4202 }
4203
aaca4fe0
WT
4204 dp_packet_batch_apply_cutlen(packets_);
4205
d0cca6c3 4206 err = push_tnl_action(pmd, a, packets_);
a36de779
PS
4207 if (!err) {
4208 (*depth)++;
1895cc8d 4209 dp_netdev_recirculate(pmd, packets_);
a36de779 4210 (*depth)--;
a36de779
PS
4211 }
4212 return;
4213 }
4214 break;
4215
4216 case OVS_ACTION_ATTR_TUNNEL_POP:
4217 if (*depth < MAX_RECIRC_DEPTH) {
aaca4fe0 4218 struct dp_packet_batch *orig_packets_ = packets_;
a36de779
PS
4219 odp_port_t portno = u32_to_odp(nl_attr_get_u32(a));
4220
d0cca6c3 4221 p = pmd_tx_port_cache_lookup(pmd, portno);
a36de779 4222 if (p) {
1895cc8d 4223 struct dp_packet_batch tnl_pkt;
9235b479 4224 int i;
a36de779
PS
4225
4226 if (!may_steal) {
aaca4fe0
WT
4227 dp_packet_batch_clone(&tnl_pkt, packets_);
4228 packets_ = &tnl_pkt;
4229 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
4230 }
4231
aaca4fe0
WT
4232 dp_packet_batch_apply_cutlen(packets_);
4233
324c8374 4234 netdev_pop_header(p->port->netdev, packets_);
1895cc8d 4235 if (!packets_->count) {
1c8f98d9
PS
4236 return;
4237 }
9235b479
PS
4238
4239 for (i = 0; i < packets_->count; i++) {
4240 packets_->packets[i]->md.in_port.odp_port = portno;
a36de779 4241 }
9235b479
PS
4242
4243 (*depth)++;
4244 dp_netdev_recirculate(pmd, packets_);
4245 (*depth)--;
a36de779
PS
4246 return;
4247 }
4248 }
4249 break;
4250
623540e4
EJ
4251 case OVS_ACTION_ATTR_USERSPACE:
4252 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
aaca4fe0 4253 struct dp_packet_batch *orig_packets_ = packets_;
1895cc8d 4254 struct dp_packet **packets = packets_->packets;
623540e4 4255 const struct nlattr *userdata;
aaca4fe0 4256 struct dp_packet_batch usr_pkt;
623540e4
EJ
4257 struct ofpbuf actions;
4258 struct flow flow;
7af12bd7 4259 ovs_u128 ufid;
aaca4fe0 4260 bool clone = false;
1c8f98d9 4261 int i;
4fc65926 4262
623540e4
EJ
4263 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
4264 ofpbuf_init(&actions, 0);
8cbf4f47 4265
aaca4fe0
WT
4266 if (packets_->trunc) {
4267 if (!may_steal) {
4268 dp_packet_batch_clone(&usr_pkt, packets_);
4269 packets_ = &usr_pkt;
4270 packets = packets_->packets;
4271 clone = true;
4272 dp_packet_batch_reset_cutlen(orig_packets_);
4273 }
4274
4275 dp_packet_batch_apply_cutlen(packets_);
4276 }
4277
1895cc8d 4278 for (i = 0; i < packets_->count; i++) {
cf62fa4c 4279 flow_extract(packets[i], &flow);
7af12bd7 4280 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
66525ef3 4281 dp_execute_userspace_action(pmd, packets[i], may_steal, &flow,
324c8374 4282 &ufid, &actions, userdata, now);
db73f716 4283 }
aaca4fe0
WT
4284
4285 if (clone) {
4286 dp_packet_delete_batch(packets_, true);
4287 }
4288
623540e4
EJ
4289 ofpbuf_uninit(&actions);
4290 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 4291
ac8c2081
DDP
4292 return;
4293 }
09f9da0b 4294 break;
572f732a 4295
adcf00ba
AZ
4296 case OVS_ACTION_ATTR_RECIRC:
4297 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 4298 struct dp_packet_batch recirc_pkts;
1c8f98d9 4299 int i;
572f732a 4300
28e2fa02 4301 if (!may_steal) {
1895cc8d
PS
4302 dp_packet_batch_clone(&recirc_pkts, packets_);
4303 packets_ = &recirc_pkts;
28e2fa02 4304 }
8cbf4f47 4305
1895cc8d
PS
4306 for (i = 0; i < packets_->count; i++) {
4307 packets_->packets[i]->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 4308 }
28e2fa02
DDP
4309
4310 (*depth)++;
1895cc8d 4311 dp_netdev_recirculate(pmd, packets_);
adcf00ba
AZ
4312 (*depth)--;
4313
ac8c2081 4314 return;
adcf00ba 4315 }
ac8c2081
DDP
4316
4317 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 4318 break;
572f732a 4319
07659514
JS
4320 case OVS_ACTION_ATTR_CT:
4321 /* If a flow with this action is slow-pathed, datapath assistance is
4322 * required to implement it. However, we don't support this action
4323 * in the userspace datapath. */
4324 VLOG_WARN("Cannot execute conntrack action in userspace.");
4325 break;
4326
09f9da0b
JR
4327 case OVS_ACTION_ATTR_PUSH_VLAN:
4328 case OVS_ACTION_ATTR_POP_VLAN:
4329 case OVS_ACTION_ATTR_PUSH_MPLS:
4330 case OVS_ACTION_ATTR_POP_MPLS:
4331 case OVS_ACTION_ATTR_SET:
6d670e7f 4332 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 4333 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 4334 case OVS_ACTION_ATTR_HASH:
09f9da0b 4335 case OVS_ACTION_ATTR_UNSPEC:
aaca4fe0 4336 case OVS_ACTION_ATTR_TRUNC:
09f9da0b
JR
4337 case __OVS_ACTION_ATTR_MAX:
4338 OVS_NOT_REACHED();
da546e07 4339 }
ac8c2081 4340
1895cc8d 4341 dp_packet_delete_batch(packets_, may_steal);
98403001
BP
4342}
4343
4edb9ae9 4344static void
65f13b50 4345dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4346 struct dp_packet_batch *packets,
41ccaa24 4347 bool may_steal,
324c8374
IM
4348 const struct nlattr *actions, size_t actions_len,
4349 long long now)
72865317 4350{
324c8374 4351 struct dp_netdev_execute_aux aux = { pmd, now };
9080a111 4352
1895cc8d 4353 odp_execute_actions(&aux, packets, may_steal, actions,
8cbf4f47 4354 actions_len, dp_execute_cb);
72865317
BP
4355}
4356
4357const struct dpif_class dpif_netdev_class = {
72865317 4358 "netdev",
6553d06b 4359 dpif_netdev_init,
2197d7ab 4360 dpif_netdev_enumerate,
0aeaabc8 4361 dpif_netdev_port_open_type,
72865317
BP
4362 dpif_netdev_open,
4363 dpif_netdev_close,
7dab847a 4364 dpif_netdev_destroy,
e4cfed38
PS
4365 dpif_netdev_run,
4366 dpif_netdev_wait,
72865317 4367 dpif_netdev_get_stats,
72865317
BP
4368 dpif_netdev_port_add,
4369 dpif_netdev_port_del,
91364d18 4370 NULL, /* port_set_config */
72865317
BP
4371 dpif_netdev_port_query_by_number,
4372 dpif_netdev_port_query_by_name,
98403001 4373 NULL, /* port_get_pid */
b0ec0f27
BP
4374 dpif_netdev_port_dump_start,
4375 dpif_netdev_port_dump_next,
4376 dpif_netdev_port_dump_done,
72865317
BP
4377 dpif_netdev_port_poll,
4378 dpif_netdev_port_poll_wait,
72865317 4379 dpif_netdev_flow_flush,
ac64794a
BP
4380 dpif_netdev_flow_dump_create,
4381 dpif_netdev_flow_dump_destroy,
4382 dpif_netdev_flow_dump_thread_create,
4383 dpif_netdev_flow_dump_thread_destroy,
704a1e09 4384 dpif_netdev_flow_dump_next,
1a0c894a 4385 dpif_netdev_operate,
6b31e073
RW
4386 NULL, /* recv_set */
4387 NULL, /* handlers_set */
f2eee189 4388 dpif_netdev_pmd_set,
5bf93d67 4389 dpif_netdev_queue_to_priority,
6b31e073
RW
4390 NULL, /* recv */
4391 NULL, /* recv_wait */
4392 NULL, /* recv_purge */
e4e74c3a 4393 dpif_netdev_register_dp_purge_cb,
6b31e073
RW
4394 dpif_netdev_register_upcall_cb,
4395 dpif_netdev_enable_upcall,
4396 dpif_netdev_disable_upcall,
b5cbbcf6 4397 dpif_netdev_get_datapath_version,
b77d9629
DDP
4398 NULL, /* ct_dump_start */
4399 NULL, /* ct_dump_next */
4400 NULL, /* ct_dump_done */
a0f7b6d5 4401 NULL, /* ct_flush */
72865317 4402};
614c4892 4403
74cc3969
BP
4404static void
4405dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
4406 const char *argv[], void *aux OVS_UNUSED)
4407{
e9985d6a 4408 struct dp_netdev_port *port;
74cc3969 4409 struct dp_netdev *dp;
ff073a71 4410 odp_port_t port_no;
74cc3969 4411
8a4e3a85 4412 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
4413 dp = shash_find_data(&dp_netdevs, argv[1]);
4414 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 4415 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
4416 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
4417 return;
4418 }
8a4e3a85
BP
4419 ovs_refcount_ref(&dp->ref_cnt);
4420 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 4421
59e6d833 4422 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 4423 if (get_port_by_name(dp, argv[2], &port)) {
74cc3969 4424 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 4425 goto exit;
74cc3969
BP
4426 }
4427
ff073a71
BP
4428 port_no = u32_to_odp(atoi(argv[3]));
4429 if (!port_no || port_no == ODPP_NONE) {
74cc3969 4430 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 4431 goto exit;
74cc3969 4432 }
ff073a71 4433 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 4434 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 4435 goto exit;
74cc3969 4436 }
59e6d833 4437
e9985d6a
DDP
4438 /* Remove port. */
4439 hmap_remove(&dp->ports, &port->node);
4440 dp_netdev_del_port_from_all_pmds(dp, port);
59e6d833 4441
e9985d6a
DDP
4442 /* Reinsert with new port number. */
4443 port->port_no = port_no;
4444 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
4445 dp_netdev_add_port_to_pmds(dp, port);
59e6d833 4446
d33ed218 4447 seq_change(dp->port_seq);
74cc3969 4448 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
4449
4450exit:
59e6d833 4451 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 4452 dp_netdev_unref(dp);
74cc3969
BP
4453}
4454
0cbfe35d
BP
4455static void
4456dpif_dummy_register__(const char *type)
4457{
4458 struct dpif_class *class;
4459
4460 class = xmalloc(sizeof *class);
4461 *class = dpif_netdev_class;
4462 class->type = xstrdup(type);
4463 dp_register_provider(class);
4464}
4465
8420c7ad
BP
4466static void
4467dpif_dummy_override(const char *type)
4468{
65d43fdc
YT
4469 int error;
4470
4471 /*
4472 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
4473 * a userland-only build. It's useful for testsuite.
4474 */
4475 error = dp_unregister_provider(type);
4476 if (error == 0 || error == EAFNOSUPPORT) {
8420c7ad
BP
4477 dpif_dummy_register__(type);
4478 }
4479}
4480
614c4892 4481void
8420c7ad 4482dpif_dummy_register(enum dummy_level level)
614c4892 4483{
8420c7ad 4484 if (level == DUMMY_OVERRIDE_ALL) {
0cbfe35d
BP
4485 struct sset types;
4486 const char *type;
4487
4488 sset_init(&types);
4489 dp_enumerate_types(&types);
4490 SSET_FOR_EACH (type, &types) {
8420c7ad 4491 dpif_dummy_override(type);
0cbfe35d
BP
4492 }
4493 sset_destroy(&types);
8420c7ad
BP
4494 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
4495 dpif_dummy_override("system");
614c4892 4496 }
0cbfe35d
BP
4497
4498 dpif_dummy_register__("dummy");
74cc3969
BP
4499
4500 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 4501 "dp port new-number",
74cc3969 4502 3, 3, dpif_dummy_change_port_number, NULL);
614c4892 4503}
0de8783a
JR
4504\f
4505/* Datapath Classifier. */
4506
4507/* A set of rules that all have the same fields wildcarded. */
4508struct dpcls_subtable {
4509 /* The fields are only used by writers. */
4510 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
4511
4512 /* These fields are accessed by readers. */
4513 struct cmap rules; /* Contains "struct dpcls_rule"s. */
4514 struct netdev_flow_key mask; /* Wildcards for fields (const). */
4515 /* 'mask' must be the last field, additional space is allocated here. */
4516};
4517
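/* Illustrative sketch (hypothetical rules and mask names, for exposition) of
 * how megaflows map onto subtables.  Rules that wildcard exactly the same
 * fields share one subtable; a rule with a different wildcard pattern gets
 * its own:
 *
 *     dpcls_insert(cls, &rule_a, &mask_src8);          /* matches nw_src/8 only */
 *     dpcls_insert(cls, &rule_b, &mask_src8);          /* same mask, same subtable */
 *     dpcls_insert(cls, &rule_c, &mask_src8_udp_dst);  /* new mask, new subtable */
 *
 * A lookup then hashes each packet once per subtable, under that subtable's
 * mask, until every key in the batch has found a match (see dpcls_lookup()
 * below). */
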
4518/* Initializes 'cls' as a classifier that initially contains no classification
4519 * rules. */
4520static void
4521dpcls_init(struct dpcls *cls)
4522{
4523 cmap_init(&cls->subtables_map);
4524 pvector_init(&cls->subtables);
4525}
4526
4527static void
4528dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
4529{
4530 pvector_remove(&cls->subtables, subtable);
4531 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
4532 subtable->mask.hash);
4533 cmap_destroy(&subtable->rules);
4534 ovsrcu_postpone(free, subtable);
4535}
4536
4537/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
4538 * caller's responsibility.
4539 * May only be called after all the readers have been terminated. */
4540static void
4541dpcls_destroy(struct dpcls *cls)
4542{
4543 if (cls) {
4544 struct dpcls_subtable *subtable;
4545
4546 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
361d808d 4547 ovs_assert(cmap_count(&subtable->rules) == 0);
0de8783a
JR
4548 dpcls_destroy_subtable(cls, subtable);
4549 }
4550 cmap_destroy(&cls->subtables_map);
4551 pvector_destroy(&cls->subtables);
4552 }
4553}
4554
4555static struct dpcls_subtable *
4556dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4557{
4558 struct dpcls_subtable *subtable;
4559
4560 /* Need to add one. */
caeb4906
JR
4561 subtable = xmalloc(sizeof *subtable
4562 - sizeof subtable->mask.mf + mask->len);
0de8783a
JR
4563 cmap_init(&subtable->rules);
4564 netdev_flow_key_clone(&subtable->mask, mask);
4565 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
4566 pvector_insert(&cls->subtables, subtable, 0);
802f84ff 4567 pvector_publish(&cls->subtables);
0de8783a
JR
4568
4569 return subtable;
4570}
4571
4572static inline struct dpcls_subtable *
4573dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4574{
4575 struct dpcls_subtable *subtable;
4576
4577 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
4578 &cls->subtables_map) {
4579 if (netdev_flow_key_equal(&subtable->mask, mask)) {
4580 return subtable;
4581 }
4582 }
4583 return dpcls_create_subtable(cls, mask);
4584}
4585
4586/* Insert 'rule' into 'cls'. */
4587static void
4588dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
4589 const struct netdev_flow_key *mask)
4590{
4591 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
4592
4593 rule->mask = &subtable->mask;
4594 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
4595}
4596
4597/* Removes 'rule' from 'cls', also destructing the 'rule'. */
4598static void
4599dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
4600{
4601 struct dpcls_subtable *subtable;
4602
4603 ovs_assert(rule->mask);
4604
4605 INIT_CONTAINER(subtable, rule->mask, mask);
4606
4607 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
4608 == 0) {
4609 dpcls_destroy_subtable(cls, subtable);
802f84ff 4610 pvector_publish(&cls->subtables);
0de8783a
JR
4611 }
4612}
4613
361d808d
JR
4614/* Returns true if 'target' satisfies 'key' in 'mask', that is, if for each
4615 * 1-bit in 'mask' the corresponding bits in 'key' and 'target' are the same. */
0de8783a
JR
4616static inline bool
4617dpcls_rule_matches_key(const struct dpcls_rule *rule,
4618 const struct netdev_flow_key *target)
4619{
09b0fa9c
JR
4620 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
4621 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5fcff47b 4622 uint64_t value;
0de8783a 4623
5fcff47b
JR
4624 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
4625 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
0de8783a
JR
4626 return false;
4627 }
4628 }
4629 return true;
4630}
4631
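/* Worked example (hypothetical values, for exposition) for
 * dpcls_rule_matches_key() above, shown on a single 64-bit word of the
 * miniflows:
 *
 *     mask   = 0x0000ffff00000000   (only the middle 16 bits are significant)
 *     key    = 0x0000abcd00000000   (rule value, stored already masked)
 *     target = 0x1234abcd5678ffff   (packet value)
 *
 *     (target & mask) == 0x0000abcd00000000 == key  =>  this word matches.
 *
 * The loop applies this test to every word selected by the rule's flowmap and
 * fails fast on the first mismatch. */
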
4632/* For each miniflow in 'keys' performs a classifier lookup writing the result
4633 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
4634 * NULL it is skipped.
4635 *
4636 * This function is optimized for use in the userspace datapath and therefore
4637 * does not implement a lot of features available in the standard
4638 * classifier_lookup() function. Specifically, it does not implement
4639 * priorities, instead returning any rule which matches the flow.
4640 *
4641 * Returns true if all flows found a corresponding rule. */
4642static bool
4643dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
4644 struct dpcls_rule **rules, const size_t cnt)
4645{
4646 /* The batch size 16 was experimentally found faster than 8 or 32. */
4647 typedef uint16_t map_type;
4648#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
4649
4650#if !defined(__CHECKER__) && !defined(_WIN32)
4651 const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
4652#else
cd159f1a 4653 enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
0de8783a
JR
4654#endif
4655 map_type maps[N_MAPS];
4656 struct dpcls_subtable *subtable;
4657
4658 memset(maps, 0xff, sizeof maps);
4659 if (cnt % MAP_BITS) {
4660 maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
4661 }
4662 memset(rules, 0, cnt * sizeof *rules);
4663
4664 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
4665 const struct netdev_flow_key *mkeys = keys;
4666 struct dpcls_rule **mrules = rules;
4667 map_type remains = 0;
4668 int m;
4669
4670 BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
4671
4672 for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
4673 uint32_t hashes[MAP_BITS];
4674 const struct cmap_node *nodes[MAP_BITS];
4675 unsigned long map = maps[m];
4676 int i;
4677
4678 if (!map) {
4679 continue; /* Skip empty maps. */
4680 }
4681
4682 /* Compute hashes for the remaining keys. */
3ee6026a 4683 ULLONG_FOR_EACH_1(i, map) {
0de8783a
JR
4684 hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
4685 &subtable->mask);
4686 }
4687 /* Lookup. */
4688 map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
4689 /* Check results. */
3ee6026a 4690 ULLONG_FOR_EACH_1(i, map) {
0de8783a
JR
4691 struct dpcls_rule *rule;
4692
4693 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
4694 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
4695 mrules[i] = rule;
4696 goto next;
4697 }
4698 }
3ee6026a 4699 ULLONG_SET0(map, i); /* Did not match. */
0de8783a
JR
4700 next:
4701 ; /* Keep Sparse happy. */
4702 }
4703 maps[m] &= ~map; /* Clear the found rules. */
4704 remains |= maps[m];
4705 }
4706 if (!remains) {
4707 return true; /* All found. */
4708 }
4709 }
4710 return false; /* Some misses. */
4711}
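
/* Illustrative walk-through (hypothetical numbers, for exposition) of the
 * bitmap bookkeeping in dpcls_lookup() above, for a burst of cnt == 20 keys
 * and 16-bit map words:
 *
 *     N_MAPS  = DIV_ROUND_UP(20, 16) = 2
 *     maps[0] = 0xffff     (keys 0..15 still unmatched)
 *     maps[1] = 0x000f     (keys 16..19; the extra bits were cleared)
 *
 * For each subtable, cmap_find_batch() returns a bit set for every key whose
 * hash found a candidate node; keys that then pass dpcls_rule_matches_key()
 * are cleared from maps[], so subtables visited later only look at the keys
 * that are still unmatched.  The function returns true as soon as both map
 * words reach zero. */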