git.proxmox.com Git - ovs.git/blame - lib/dpif-netdev.c
netdev-dpdk: Change vlog module name to 'netdev_dpdk'.
72865317 1/*
d262ac2c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
72865317
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
db73f716 18#include "dpif-netdev.h"
72865317 19
72865317
BP
20#include <ctype.h>
21#include <errno.h>
22#include <fcntl.h>
23#include <inttypes.h>
7f3adc00 24#include <net/if.h>
7daedce4 25#include <netinet/in.h>
cdee00fd 26#include <stdint.h>
72865317
BP
27#include <stdlib.h>
28#include <string.h>
29#include <sys/ioctl.h>
7daedce4 30#include <sys/socket.h>
72865317 31#include <sys/stat.h>
72865317
BP
32#include <unistd.h>
33
9f861c91 34#include "bitmap.h"
59e6d833 35#include "cmap.h"
5cf3edb3 36#include "conntrack.h"
7daedce4 37#include "coverage.h"
4d4e68ed 38#include "ct-dpif.h"
72865317 39#include "csum.h"
e14deea0 40#include "dp-packet.h"
614c4892 41#include "dpif.h"
72865317 42#include "dpif-provider.h"
614c4892 43#include "dummy.h"
afae68b1 44#include "fat-rwlock.h"
72865317 45#include "flow.h"
762d146a 46#include "hmapx.h"
6c3eee82 47#include "latch.h"
72865317 48#include "netdev.h"
8617afff 49#include "netdev-dpdk.h"
de281153 50#include "netdev-vport.h"
cdee00fd 51#include "netlink.h"
f094af7b 52#include "odp-execute.h"
72865317 53#include "odp-util.h"
25d436fb
BW
54#include "openvswitch/dynamic-string.h"
55#include "openvswitch/list.h"
56#include "openvswitch/match.h"
57#include "openvswitch/ofp-print.h"
3eb67853 58#include "openvswitch/ofp-util.h"
64c96779 59#include "openvswitch/ofpbuf.h"
3eb67853 60#include "openvswitch/shash.h"
25d436fb 61#include "openvswitch/vlog.h"
5a034064 62#include "ovs-numa.h"
61e7deb1 63#include "ovs-rcu.h"
72865317
BP
64#include "packets.h"
65#include "poll-loop.h"
0de8783a 66#include "pvector.h"
26c6b6cd 67#include "random.h"
d33ed218 68#include "seq.h"
3eb67853 69#include "smap.h"
0cbfe35d 70#include "sset.h"
72865317 71#include "timeval.h"
53902038 72#include "tnl-neigh-cache.h"
7f9b8504 73#include "tnl-ports.h"
74cc3969 74#include "unixctl.h"
72865317 75#include "util.h"
7daedce4 76
d98e6007 77VLOG_DEFINE_THIS_MODULE(dpif_netdev);
72865317 78
8bb113da 79#define FLOW_DUMP_MAX_BATCH 50
adcf00ba
AZ
80/* Use per thread recirc_depth to prevent recirculation loop. */
81#define MAX_RECIRC_DEPTH 5
82DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
e4cfed38 83
72865317 84/* Configuration parameters. */
72865317
BP
85enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
86
8a4e3a85
BP
87/* Protects against changes to 'dp_netdevs'. */
88static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
89
90/* Contains all 'struct dp_netdev's. */
91static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
92 = SHASH_INITIALIZER(&dp_netdevs);
93
623540e4 94static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
6b31e073 95
5cf3edb3
DDP
96#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
97 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED)
98#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
99
2494ccd7
JS
100static struct odp_support dp_netdev_support = {
101 .max_mpls_depth = SIZE_MAX,
102 .recirc = true,
5cf3edb3
DDP
103 .ct_state = true,
104 .ct_zone = true,
105 .ct_mark = true,
106 .ct_label = true,
2494ccd7
JS
107};
108
79df317f 109/* Stores a miniflow with inline values */
9bbf1c3d 110
9bbf1c3d 111struct netdev_flow_key {
caeb4906
JR
112 uint32_t hash; /* Hash function differs for different users. */
113 uint32_t len; /* Length of the following miniflow (incl. map). */
0de8783a 114 struct miniflow mf;
8fd47924 115 uint64_t buf[FLOW_MAX_PACKET_U64S];
9bbf1c3d
DDP
116};
117
118/* Exact match cache for frequently used flows
119 *
120 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
121 * search its entries for a miniflow that matches exactly the miniflow of the
0de8783a 122 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
9bbf1c3d
DDP
123 *
124 * A cache entry holds a reference to its 'dp_netdev_flow'.
125 *
126 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
127 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
128 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
129 * value is the index of a cache entry where the miniflow could be.
130 *
131 *
132 * Thread-safety
133 * =============
134 *
135 * Each pmd_thread has its own private exact match cache.
136 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
137 */
138
fc82e877 139#define EM_FLOW_HASH_SHIFT 13
9bbf1c3d
DDP
140#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
141#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
142#define EM_FLOW_HASH_SEGS 2
143
144struct emc_entry {
9bbf1c3d 145 struct dp_netdev_flow *flow;
0de8783a 146 struct netdev_flow_key key; /* key.hash used for emc hash value. */
9bbf1c3d
DDP
147};
148
149struct emc_cache {
150 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
67ad54cb 151 int sweep_idx; /* For emc_cache_slow_sweep(). */
9bbf1c3d
DDP
152};
153
154/* Iterate in the exact match cache through every entry that might contain a
155 * miniflow with hash 'HASH'. */
156#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
157 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
158 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
159 i__ < EM_FLOW_HASH_SEGS; \
160 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
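/* Illustrative example (editor's sketch, not part of the original source):
 * with EM_FLOW_HASH_SHIFT == 13 and EM_FLOW_HASH_SEGS == 2, a packet whose
 * 32-bit hash is 0x00c0ffee can only live in two slots of 'entries':
 *
 *     segment 0:  0x00c0ffee        & EM_FLOW_HASH_MASK == 0x1fee
 *     segment 1: (0x00c0ffee >> 13) & EM_FLOW_HASH_MASK == 0x0607
 *
 * so a lookup probes at most those two entries, e.g.:
 *
 *     struct emc_entry *entry;
 *     EMC_FOR_EACH_POS_WITH_HASH (&pmd->flow_cache, entry, hash) {
 *         if (entry->key.hash == hash && emc_entry_alive(entry)
 *             && <stored miniflow matches the packet>) {
 *             return entry->flow;
 *         }
 *     }
 */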
0de8783a
JR
161\f
162/* Simple non-wildcarding single-priority classifier. */
163
3453b4d6
JS
164/* Time in ms between successive optimizations of the dpcls subtable vector */
165#define DPCLS_OPTIMIZATION_INTERVAL 1000
166
0de8783a 167struct dpcls {
3453b4d6
JS
168 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
169 odp_port_t in_port;
0de8783a 170 struct cmap subtables_map;
da9cfca6 171 struct pvector subtables;
0de8783a 172};
9bbf1c3d 173
0de8783a
JR
174/* A rule to be inserted to the classifier. */
175struct dpcls_rule {
176 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
177 struct netdev_flow_key *mask; /* Subtable's mask. */
178 struct netdev_flow_key flow; /* Matching key. */
179 /* 'flow' must be the last field, additional space is allocated here. */
180};
181
182static void dpcls_init(struct dpcls *);
183static void dpcls_destroy(struct dpcls *);
3453b4d6 184static void dpcls_sort_subtable_vector(struct dpcls *);
0de8783a
JR
185static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
186 const struct netdev_flow_key *mask);
187static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
3453b4d6 188static bool dpcls_lookup(struct dpcls *cls,
0de8783a 189 const struct netdev_flow_key keys[],
3453b4d6
JS
190 struct dpcls_rule **rules, size_t cnt,
191 int *num_lookups_p);
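/* Illustrative example (editor's sketch, not part of the original source):
 * typical use of the dpcls API declared above.  The keys and 'mask' are
 * 'struct netdev_flow_key's built elsewhere, and BATCH_SIZE is a
 * hypothetical batch-size constant:
 *
 *     struct dpcls cls;
 *     dpcls_init(&cls);
 *
 *     dpcls_insert(&cls, &flow->cr, &mask);    // file the rule under its mask
 *
 *     struct dpcls_rule *rules[BATCH_SIZE];
 *     int lookups;
 *     if (dpcls_lookup(&cls, keys, rules, cnt, &lookups)) {
 *         // every key matched some subtable; rules[i] is the rule hit by keys[i]
 *     }
 *
 *     dpcls_remove(&cls, &flow->cr);
 *     dpcls_destroy(&cls);
 */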
0de8783a 192\f
8a4e3a85
BP
193/* Datapath based on the network device interface from netdev.h.
194 *
195 *
196 * Thread-safety
197 * =============
198 *
199 * Some members, marked 'const', are immutable. Accessing other members
200 * requires synchronization, as noted in more detail below.
201 *
202 * Acquisition order is, from outermost to innermost:
203 *
204 * dp_netdev_mutex (global)
59e6d833 205 * port_mutex
d0cca6c3 206 * non_pmd_mutex
8a4e3a85 207 */
72865317 208struct dp_netdev {
8a4e3a85
BP
209 const struct dpif_class *const class;
210 const char *const name;
6b31e073 211 struct dpif *dpif;
6a8267c5
BP
212 struct ovs_refcount ref_cnt;
213 atomic_flag destroyed;
72865317 214
8a4e3a85
BP
215 /* Ports.
216 *
e9985d6a
DDP
217 * Any lookup into 'ports' or any access to the dp_netdev_ports found
218 * through 'ports' requires taking 'port_mutex'. */
59e6d833 219 struct ovs_mutex port_mutex;
e9985d6a 220 struct hmap ports;
d33ed218 221 struct seq *port_seq; /* Incremented whenever a port changes. */
6c3eee82 222
6b31e073
RW
223 /* Protects access to ofproto-dpif-upcall interface during revalidator
224 * thread synchronization. */
225 struct fat_rwlock upcall_rwlock;
623540e4
EJ
226 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
227 void *upcall_aux;
6b31e073 228
e4e74c3a
AW
229 /* Callback function for notifying the purging of dp flows (during
230 * pmd reset or deletion). */
231 dp_purge_callback *dp_purge_cb;
232 void *dp_purge_aux;
233
65f13b50
AW
234 /* Stores all 'struct dp_netdev_pmd_thread's. */
235 struct cmap poll_threads;
236
237 /* Protects the access of the 'struct dp_netdev_pmd_thread'
238 * instance for non-pmd thread. */
239 struct ovs_mutex non_pmd_mutex;
240
241 /* Each pmd thread will store its pointer to
242 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
243 ovsthread_key_t per_pmd_key;
f2eee189 244
a6a426d6
IM
245 struct seq *reconfigure_seq;
246 uint64_t last_reconfigure_seq;
247
a14b8947 248 /* Cpu mask for pin of pmd threads. */
f2eee189 249 char *pmd_cmask;
6e3c6fa4 250
a36de779 251 uint64_t last_tnl_conf_seq;
5cf3edb3
DDP
252
253 struct conntrack conntrack;
72865317
BP
254};
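/* Illustrative example (editor's sketch, not part of the original source):
 * code that needs more than one of the locks above must follow the
 * documented acquisition order, outermost first:
 *
 *     ovs_mutex_lock(&dp_netdev_mutex);
 *     ovs_mutex_lock(&dp->port_mutex);
 *     ovs_mutex_lock(&dp->non_pmd_mutex);
 *     ...
 *     ovs_mutex_unlock(&dp->non_pmd_mutex);
 *     ovs_mutex_unlock(&dp->port_mutex);
 *     ovs_mutex_unlock(&dp_netdev_mutex);
 */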
255
8a4e3a85 256static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
e9985d6a
DDP
257 odp_port_t)
258 OVS_REQUIRES(dp->port_mutex);
ff073a71 259
51852a57 260enum dp_stat_type {
abcf3ef4
DDP
261 DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
262 DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
51852a57
BP
263 DP_STAT_MISS, /* Packets that did not match. */
264 DP_STAT_LOST, /* Packets not passed up to the client. */
3453b4d6
JS
265 DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
266 hits */
51852a57
BP
267 DP_N_STATS
268};
269
55e3ca97
DDP
270enum pmd_cycles_counter_type {
271 PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */
272 PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */
273 PMD_N_CYCLES
274};
275
324c8374
IM
276#define XPS_TIMEOUT_MS 500LL
277
3eb67853
IM
278/* Contained by struct dp_netdev_port's 'rxqs' member. */
279struct dp_netdev_rxq {
280 struct netdev_rxq *rxq;
281 unsigned core_id; /* Core to which this queue is pinned. */
282};
283
72865317
BP
284/* A port in a netdev-based datapath. */
285struct dp_netdev_port {
35303d71 286 odp_port_t port_no;
72865317 287 struct netdev *netdev;
e9985d6a 288 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
4b609110 289 struct netdev_saved_flags *sf;
490e82af 290 unsigned n_rxq; /* Number of elements in 'rxq' */
3eb67853 291 struct dp_netdev_rxq *rxqs;
324c8374
IM
292 bool dynamic_txqs; /* If true XPS will be used. */
293 unsigned *txq_used; /* Number of threads that uses each tx queue. */
294 struct ovs_mutex txq_used_mutex;
0cbfe35d 295 char *type; /* Port type as requested by user. */
3eb67853 296 char *rxq_affinity_list; /* Requested affinity of rx queues. */
72865317
BP
297};
298
1c1e46ed
AW
299/* Contained by struct dp_netdev_flow's 'stats' member. */
300struct dp_netdev_flow_stats {
eb94da30
DDP
301 atomic_llong used; /* Last used time, in monotonic msecs. */
302 atomic_ullong packet_count; /* Number of packets matched. */
303 atomic_ullong byte_count; /* Number of bytes matched. */
304 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
1c1e46ed
AW
305};
306
307/* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
8a4e3a85
BP
308 *
309 *
310 * Thread-safety
311 * =============
312 *
313 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
1c1e46ed 314 * its pmd thread's classifier. The text below calls this classifier 'cls'.
8a4e3a85
BP
315 *
316 * Motivation
317 * ----------
318 *
319 * The thread safety rules described here for "struct dp_netdev_flow" are
320 * motivated by two goals:
321 *
322 * - Prevent threads that read members of "struct dp_netdev_flow" from
323 * reading bad data due to changes by some thread concurrently modifying
324 * those members.
325 *
326 * - Prevent two threads making changes to members of a given "struct
327 * dp_netdev_flow" from interfering with each other.
328 *
329 *
330 * Rules
331 * -----
332 *
ed79f89a
DDP
333 * A flow 'flow' may be accessed without a risk of being freed during an RCU
334 * grace period. Code that needs to hold onto a flow for a while
335 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
8a4e3a85
BP
336 *
337 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
ed79f89a
DDP
338 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
339 * from modification.
8a4e3a85
BP
340 *
341 * Some members, marked 'const', are immutable. Accessing other members
342 * requires synchronization, as noted in more detail below.
343 */
72865317 344struct dp_netdev_flow {
11e5cf1f 345 const struct flow flow; /* Unmasked flow that created this entry. */
8a4e3a85 346 /* Hash table index by unmasked flow. */
1c1e46ed
AW
347 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
348 /* 'flow_table'. */
70e5ed6f 349 const ovs_u128 ufid; /* Unique flow identifier. */
bd5131ba 350 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
1c1e46ed 351 /* flow. */
72865317 352
ed79f89a
DDP
353 /* Number of references.
354 * The classifier owns one reference.
355 * Any thread trying to keep a rule from being freed should hold its own
356 * reference. */
357 struct ovs_refcount ref_cnt;
358
11e5cf1f
DDP
359 bool dead;
360
1c1e46ed
AW
361 /* Statistics. */
362 struct dp_netdev_flow_stats stats;
8a4e3a85 363
45c626a3 364 /* Actions. */
61e7deb1 365 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
0de8783a 366
11e5cf1f
DDP
367 /* While processing a group of input packets, the datapath uses the next
368 * member to store a pointer to the output batch for the flow. It is
369 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
f7ce4811
PS
370 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
371 struct packet_batch_per_flow *batch;
11e5cf1f 372
0de8783a
JR
373 /* Packet classification. */
374 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
375 /* 'cr' must be the last member. */
72865317
BP
376};
377
ed79f89a 378static void dp_netdev_flow_unref(struct dp_netdev_flow *);
9bbf1c3d 379static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
70e5ed6f
JS
380static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
381 struct flow *);
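/* Illustrative example (editor's sketch, not part of the original source):
 * holding on to a flow beyond the current RCU grace period follows the
 * rules documented above for "struct dp_netdev_flow":
 *
 *     struct dp_netdev_flow *flow = <looked up under RCU protection>;
 *     if (dp_netdev_flow_ref(flow)) {
 *         // 'flow' cannot be freed now, but it may still be removed from its
 *         // classifier and its non-const members may change concurrently.
 *         ...
 *         dp_netdev_flow_unref(flow);   // drop the reference when done
 *     }
 */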
8a4e3a85 382
a84cb64a
BP
383/* A set of datapath actions within a "struct dp_netdev_flow".
384 *
385 *
386 * Thread-safety
387 * =============
388 *
45c626a3 389 * A struct dp_netdev_actions 'actions' is protected with RCU. */
a84cb64a 390struct dp_netdev_actions {
a84cb64a
BP
391 /* These members are immutable: they do not change during the struct's
392 * lifetime. */
a84cb64a 393 unsigned int size; /* Size of 'actions', in bytes. */
9ff55ae2 394 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
a84cb64a
BP
395};
396
397struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
398 size_t);
61e7deb1
BP
399struct dp_netdev_actions *dp_netdev_flow_get_actions(
400 const struct dp_netdev_flow *);
401static void dp_netdev_actions_free(struct dp_netdev_actions *);
a84cb64a 402
1c1e46ed
AW
403/* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
404struct dp_netdev_pmd_stats {
405 /* Indexed by DP_STAT_*. */
eb94da30 406 atomic_ullong n[DP_N_STATS];
1c1e46ed
AW
407};
408
55e3ca97
DDP
409/* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
410struct dp_netdev_pmd_cycles {
411 /* Indexed by PMD_CYCLES_*. */
412 atomic_ullong n[PMD_N_CYCLES];
413};
414
ae7ad0a1
IM
415/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
416struct rxq_poll {
417 struct dp_netdev_port *port;
418 struct netdev_rxq *rx;
419 struct ovs_list node;
420};
421
d0cca6c3
DDP
422/* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
423struct tx_port {
324c8374
IM
424 struct dp_netdev_port *port;
425 int qid;
426 long long last_used;
d0cca6c3
DDP
427 struct hmap_node node;
428};
429
e4cfed38
PS
430/* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
431 * the performance overhead of interrupt processing. Therefore netdev
432 * cannot implement rx-wait for these devices. dpif-netdev needs to poll
433 * these devices to check the recv buffers. pmd-thread does the polling for
434 * devices assigned to itself.
e4cfed38
PS
435 *
436 * DPDK uses PMD for accessing NICs.
437 *
65f13b50
AW
438 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
439 * I/O of all non-pmd threads. There will be no actual thread created
440 * for the instance.
1c1e46ed
AW
441 *
442 * Each struct has its own flow table and classifier. Packets received
443 * from managed ports are looked up in the corresponding pmd thread's
444 * flow table, and are executed with the found actions.
445 * */
65f13b50 446struct dp_netdev_pmd_thread {
6c3eee82 447 struct dp_netdev *dp;
1c1e46ed 448 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
65f13b50 449 struct cmap_node node; /* In 'dp->poll_threads'. */
accf8626
AW
450
451 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
452 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
453
65f13b50
AW
454 /* Per thread exact-match cache. Note, the instance for cpu core
455 * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
d0cca6c3
DDP
456 * needs to be protected by 'non_pmd_mutex'. Every other instance
457 * will only be accessed by its own pmd thread. */
9bbf1c3d 458 struct emc_cache flow_cache;
1c1e46ed 459
3453b4d6 460 /* Flow-Table and classifiers
1c1e46ed
AW
461 *
462 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
3453b4d6
JS
463 * changes to 'classifiers' must be made while still holding the
464 * 'flow_mutex'.
1c1e46ed
AW
465 */
466 struct ovs_mutex flow_mutex;
1c1e46ed
AW
467 struct cmap flow_table OVS_GUARDED; /* Flow table. */
468
3453b4d6
JS
469 /* One classifier per in_port polled by the pmd */
470 struct cmap classifiers;
471 /* Periodically sort subtable vectors according to hit frequencies */
472 long long int next_optimization;
473
1c1e46ed
AW
474 /* Statistics. */
475 struct dp_netdev_pmd_stats stats;
476
55e3ca97
DDP
477 /* Cycles counters */
478 struct dp_netdev_pmd_cycles cycles;
479
480 /* Used to count cycles. See 'cycles_counter_end()'. */
481 unsigned long long last_cycles;
482
65f13b50
AW
483 struct latch exit_latch; /* For terminating the pmd thread. */
484 atomic_uint change_seq; /* For reloading pmd ports. */
6c3eee82 485 pthread_t thread;
bd5131ba 486 unsigned core_id; /* CPU core id of this pmd thread. */
65f13b50 487 int numa_id; /* numa node id of this pmd thread. */
3eb67853 488 bool isolated;
81acebda 489
324c8374
IM
490 /* Queue id used by this pmd thread to send packets on all netdevs if
491 * XPS is disabled for this netdev. All static_tx_qid's are unique and less
492 * than 'ovs_numa_get_n_cores() + 1'. */
493 atomic_int static_tx_qid;
6553d06b 494
d0cca6c3 495 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
ae7ad0a1
IM
496 /* List of rx queues to poll. */
497 struct ovs_list poll_list OVS_GUARDED;
d0cca6c3
DDP
498 /* Number of elements in 'poll_list' */
499 int poll_cnt;
500 /* Map of 'tx_port's used for transmission. Written by the main thread,
501 * read by the pmd thread. */
502 struct hmap tx_ports OVS_GUARDED;
503
504 /* Map of 'tx_port' used in the fast path. This is a thread-local copy of
505 * 'tx_ports'. The instance for cpu core NON_PMD_CORE_ID can be accessed
506 * by multiple threads, and thus needs to be protected by 'non_pmd_mutex'.
507 * Every other instance will only be accessed by its own pmd thread. */
508 struct hmap port_cache;
ae7ad0a1 509
6553d06b
DDP
510 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
511 * The main thread keeps 'stats_zero' and 'cycles_zero' as base
512 * values and subtracts them from 'stats' and 'cycles' before
513 * reporting to the user */
514 unsigned long long stats_zero[DP_N_STATS];
515 uint64_t cycles_zero[PMD_N_CYCLES];
6c3eee82
BP
516};
517
84067a4c
JR
518#define PMD_INITIAL_SEQ 1
519
72865317
BP
520/* Interface to netdev-based datapath. */
521struct dpif_netdev {
522 struct dpif dpif;
523 struct dp_netdev *dp;
d33ed218 524 uint64_t last_port_seq;
72865317
BP
525};
526
8a4e3a85 527static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
e9985d6a
DDP
528 struct dp_netdev_port **portp)
529 OVS_REQUIRES(dp->port_mutex);
8a4e3a85 530static int get_port_by_name(struct dp_netdev *dp, const char *devname,
e9985d6a
DDP
531 struct dp_netdev_port **portp)
532 OVS_REQUIRES(dp->port_mutex);
8a4e3a85
BP
533static void dp_netdev_free(struct dp_netdev *)
534 OVS_REQUIRES(dp_netdev_mutex);
8a4e3a85
BP
535static int do_add_port(struct dp_netdev *dp, const char *devname,
536 const char *type, odp_port_t port_no)
59e6d833 537 OVS_REQUIRES(dp->port_mutex);
c40b890f 538static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
59e6d833 539 OVS_REQUIRES(dp->port_mutex);
614c4892
BP
540static int dpif_netdev_open(const struct dpif_class *, const char *name,
541 bool create, struct dpif **);
65f13b50 542static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 543 struct dp_packet_batch *,
66e4ad8a 544 bool may_steal, const struct flow *flow,
4edb9ae9 545 const struct nlattr *actions,
324c8374
IM
546 size_t actions_len,
547 long long now);
65f13b50 548static void dp_netdev_input(struct dp_netdev_pmd_thread *,
1895cc8d 549 struct dp_packet_batch *, odp_port_t port_no);
a90ed026 550static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
1895cc8d 551 struct dp_packet_batch *);
41ccaa24 552
6b31e073 553static void dp_netdev_disable_upcall(struct dp_netdev *);
ae7ad0a1 554static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
65f13b50 555static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
00873463
DDP
556 struct dp_netdev *dp, unsigned core_id,
557 int numa_id);
1c1e46ed 558static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
e9985d6a
DDP
559static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
560 OVS_REQUIRES(dp->port_mutex);
561
b19befae 562static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
bd5131ba 563 unsigned core_id);
1c1e46ed
AW
564static struct dp_netdev_pmd_thread *
565dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
65f13b50
AW
566static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
567static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
e9985d6a
DDP
568static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
569 OVS_REQUIRES(dp->port_mutex);
d0cca6c3 570static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
cc245ce8
IM
571static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
572 struct dp_netdev_port *port);
d0cca6c3
DDP
573static void dp_netdev_add_port_to_pmds(struct dp_netdev *dp,
574 struct dp_netdev_port *port);
575static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
576 struct dp_netdev_port *port);
577static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
578 struct dp_netdev_port *port,
579 struct netdev_rxq *rx);
ae7ad0a1
IM
580static struct dp_netdev_pmd_thread *
581dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
e9985d6a
DDP
582static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
583 OVS_REQUIRES(dp->port_mutex);
3eb67853
IM
584static void reconfigure_pmd_threads(struct dp_netdev *dp)
585 OVS_REQUIRES(dp->port_mutex);
1c1e46ed
AW
586static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
587static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
588static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
d0cca6c3
DDP
589static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
590 OVS_REQUIRES(pmd->port_mutex);
3453b4d6
JS
591static inline void
592dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
72865317 593
324c8374
IM
594static void
595dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
596 long long now, bool purge);
597static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
598 struct tx_port *tx, long long now);
599
67ad54cb 600static inline bool emc_entry_alive(struct emc_entry *ce);
9bbf1c3d
DDP
601static void emc_clear_entry(struct emc_entry *ce);
602
603static void
604emc_cache_init(struct emc_cache *flow_cache)
605{
606 int i;
607
67ad54cb 608 flow_cache->sweep_idx = 0;
9bbf1c3d
DDP
609 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
610 flow_cache->entries[i].flow = NULL;
0de8783a 611 flow_cache->entries[i].key.hash = 0;
09b0fa9c 612 flow_cache->entries[i].key.len = sizeof(struct miniflow);
5fcff47b 613 flowmap_init(&flow_cache->entries[i].key.mf.map);
9bbf1c3d
DDP
614 }
615}
616
617static void
618emc_cache_uninit(struct emc_cache *flow_cache)
619{
620 int i;
621
622 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
623 emc_clear_entry(&flow_cache->entries[i]);
624 }
625}
626
67ad54cb
AW
627/* Check and clear dead flow references slowly (one entry at each
628 * invocation). */
629static void
630emc_cache_slow_sweep(struct emc_cache *flow_cache)
631{
632 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
633
634 if (!emc_entry_alive(entry)) {
635 emc_clear_entry(entry);
636 }
637 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
638}
639
c4ea7529
BP
640/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
641bool
642dpif_is_netdev(const struct dpif *dpif)
643{
644 return dpif->dpif_class->open == dpif_netdev_open;
645}
646
72865317
BP
647static struct dpif_netdev *
648dpif_netdev_cast(const struct dpif *dpif)
649{
c4ea7529 650 ovs_assert(dpif_is_netdev(dpif));
72865317
BP
651 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
652}
653
654static struct dp_netdev *
655get_dp_netdev(const struct dpif *dpif)
656{
657 return dpif_netdev_cast(dpif)->dp;
658}
6553d06b
DDP
659\f
660enum pmd_info_type {
ce179f11
IM
661 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
662 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
663 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
6553d06b
DDP
664};
665
666static void
667pmd_info_show_stats(struct ds *reply,
668 struct dp_netdev_pmd_thread *pmd,
669 unsigned long long stats[DP_N_STATS],
670 uint64_t cycles[PMD_N_CYCLES])
671{
672 unsigned long long total_packets = 0;
673 uint64_t total_cycles = 0;
674 int i;
675
676 /* These loops subtract reference values ('*_zero') from the counters.
677 * Since loads and stores are relaxed, it might be possible for a '*_zero'
678 * value to be more recent than the current value we're reading from the
679 * counter. This is not a big problem, since these numbers are not
680 * supposed to be too accurate, but we should at least make sure that
681 * the result is not negative. */
682 for (i = 0; i < DP_N_STATS; i++) {
683 if (stats[i] > pmd->stats_zero[i]) {
684 stats[i] -= pmd->stats_zero[i];
685 } else {
686 stats[i] = 0;
687 }
688
689 if (i != DP_STAT_LOST) {
690 /* Lost packets are already included in DP_STAT_MISS */
691 total_packets += stats[i];
692 }
693 }
694
695 for (i = 0; i < PMD_N_CYCLES; i++) {
696 if (cycles[i] > pmd->cycles_zero[i]) {
697 cycles[i] -= pmd->cycles_zero[i];
698 } else {
699 cycles[i] = 0;
700 }
701
702 total_cycles += cycles[i];
703 }
704
705 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
706 ? "main thread" : "pmd thread");
707
708 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
709 ds_put_format(reply, " numa_id %d", pmd->numa_id);
710 }
d5c199ea 711 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
bd5131ba 712 ds_put_format(reply, " core_id %u", pmd->core_id);
6553d06b
DDP
713 }
714 ds_put_cstr(reply, ":\n");
715
716 ds_put_format(reply,
717 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
3453b4d6 718 "\tavg. subtable lookups per hit:%.2f\n"
6553d06b
DDP
719 "\tmiss:%llu\n\tlost:%llu\n",
720 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
3453b4d6
JS
721 stats[DP_STAT_MASKED_HIT] > 0
722 ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
723 : 0,
6553d06b
DDP
724 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
725
726 if (total_cycles == 0) {
727 return;
728 }
729
730 ds_put_format(reply,
731 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
732 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
733 cycles[PMD_CYCLES_POLLING],
734 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
735 cycles[PMD_CYCLES_PROCESSING],
736 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
737
738 if (total_packets == 0) {
739 return;
740 }
741
742 ds_put_format(reply,
743 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
744 total_cycles / (double)total_packets,
745 total_cycles, total_packets);
746
747 ds_put_format(reply,
748 "\tavg processing cycles per packet: "
749 "%.02f (%"PRIu64"/%llu)\n",
750 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
751 cycles[PMD_CYCLES_PROCESSING], total_packets);
752}
753
754static void
755pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
756 struct dp_netdev_pmd_thread *pmd,
757 unsigned long long stats[DP_N_STATS],
758 uint64_t cycles[PMD_N_CYCLES])
759{
760 int i;
761
762 /* We cannot write 'stats' and 'cycles' (because they're written by other
763 * threads) and we shouldn't change 'stats' (because they're used to count
764 * datapath stats, which must not be cleared here). Instead, we save the
765 * current values and subtract them from the values to be displayed in the
766 * future */
767 for (i = 0; i < DP_N_STATS; i++) {
768 pmd->stats_zero[i] = stats[i];
769 }
770 for (i = 0; i < PMD_N_CYCLES; i++) {
771 pmd->cycles_zero[i] = cycles[i];
772 }
773}
774
ce179f11
IM
775static void
776pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
777{
778 if (pmd->core_id != NON_PMD_CORE_ID) {
779 struct rxq_poll *poll;
780 const char *prev_name = NULL;
781
3eb67853
IM
782 ds_put_format(reply,
783 "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
784 pmd->numa_id, pmd->core_id, (pmd->isolated)
785 ? "true" : "false");
ce179f11 786
d0cca6c3 787 ovs_mutex_lock(&pmd->port_mutex);
ce179f11
IM
788 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
789 const char *name = netdev_get_name(poll->port->netdev);
790
791 if (!prev_name || strcmp(name, prev_name)) {
792 if (prev_name) {
793 ds_put_cstr(reply, "\n");
794 }
795 ds_put_format(reply, "\tport: %s\tqueue-id:",
796 netdev_get_name(poll->port->netdev));
797 }
798 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
799 prev_name = name;
800 }
d0cca6c3 801 ovs_mutex_unlock(&pmd->port_mutex);
ce179f11
IM
802 ds_put_cstr(reply, "\n");
803 }
804}
805
6553d06b
DDP
806static void
807dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
808 void *aux)
809{
810 struct ds reply = DS_EMPTY_INITIALIZER;
811 struct dp_netdev_pmd_thread *pmd;
812 struct dp_netdev *dp = NULL;
813 enum pmd_info_type type = *(enum pmd_info_type *) aux;
814
815 ovs_mutex_lock(&dp_netdev_mutex);
816
817 if (argc == 2) {
818 dp = shash_find_data(&dp_netdevs, argv[1]);
819 } else if (shash_count(&dp_netdevs) == 1) {
820 /* There's only one datapath */
821 dp = shash_first(&dp_netdevs)->data;
822 }
823
824 if (!dp) {
825 ovs_mutex_unlock(&dp_netdev_mutex);
826 unixctl_command_reply_error(conn,
827 "please specify an existing datapath");
828 return;
829 }
830
831 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
ce179f11
IM
832 if (type == PMD_INFO_SHOW_RXQ) {
833 pmd_info_show_rxq(&reply, pmd);
834 } else {
835 unsigned long long stats[DP_N_STATS];
836 uint64_t cycles[PMD_N_CYCLES];
837 int i;
6553d06b 838
ce179f11
IM
839 /* Read current stats and cycle counters */
840 for (i = 0; i < ARRAY_SIZE(stats); i++) {
841 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
842 }
843 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
844 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
845 }
6553d06b 846
ce179f11
IM
847 if (type == PMD_INFO_CLEAR_STATS) {
848 pmd_info_clear_stats(&reply, pmd, stats, cycles);
849 } else if (type == PMD_INFO_SHOW_STATS) {
850 pmd_info_show_stats(&reply, pmd, stats, cycles);
851 }
6553d06b
DDP
852 }
853 }
854
855 ovs_mutex_unlock(&dp_netdev_mutex);
856
857 unixctl_command_reply(conn, ds_cstr(&reply));
858 ds_destroy(&reply);
859}
860\f
861static int
862dpif_netdev_init(void)
863{
864 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
ce179f11
IM
865 clear_aux = PMD_INFO_CLEAR_STATS,
866 poll_aux = PMD_INFO_SHOW_RXQ;
6553d06b
DDP
867
868 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
869 0, 1, dpif_netdev_pmd_info,
870 (void *)&show_aux);
871 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
872 0, 1, dpif_netdev_pmd_info,
873 (void *)&clear_aux);
ce179f11
IM
874 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
875 0, 1, dpif_netdev_pmd_info,
876 (void *)&poll_aux);
6553d06b
DDP
877 return 0;
878}
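/* Illustrative usage (editor's note, not part of the original source): the
 * commands registered above are reachable through ovs-appctl, e.g.:
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show
 *     ovs-appctl dpif-netdev/pmd-stats-clear [dp]
 *
 * The optional [dp] argument selects a datapath when more than one exists.
 */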
72865317 879
2197d7ab 880static int
2240af25
DDP
881dpif_netdev_enumerate(struct sset *all_dps,
882 const struct dpif_class *dpif_class)
2197d7ab
GL
883{
884 struct shash_node *node;
885
97be1538 886 ovs_mutex_lock(&dp_netdev_mutex);
2197d7ab 887 SHASH_FOR_EACH(node, &dp_netdevs) {
2240af25
DDP
888 struct dp_netdev *dp = node->data;
889 if (dpif_class != dp->class) {
890 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
891 * If the class doesn't match, skip this dpif. */
892 continue;
893 }
2197d7ab
GL
894 sset_add(all_dps, node->name);
895 }
97be1538 896 ovs_mutex_unlock(&dp_netdev_mutex);
5279f8fd 897
2197d7ab
GL
898 return 0;
899}
900
add90f6f
EJ
901static bool
902dpif_netdev_class_is_dummy(const struct dpif_class *class)
903{
904 return class != &dpif_netdev_class;
905}
906
0aeaabc8
JP
907static const char *
908dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
909{
910 return strcmp(type, "internal") ? type
e98d0cb3 911 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
0aeaabc8
JP
912 : "tap";
913}
914
72865317
BP
915static struct dpif *
916create_dpif_netdev(struct dp_netdev *dp)
917{
462278db 918 uint16_t netflow_id = hash_string(dp->name, 0);
72865317 919 struct dpif_netdev *dpif;
72865317 920
6a8267c5 921 ovs_refcount_ref(&dp->ref_cnt);
72865317 922
72865317 923 dpif = xmalloc(sizeof *dpif);
614c4892 924 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
72865317 925 dpif->dp = dp;
d33ed218 926 dpif->last_port_seq = seq_read(dp->port_seq);
72865317
BP
927
928 return &dpif->dpif;
929}
930
4e022ec0
AW
931/* Choose an unused, non-zero port number and return it on success.
932 * Return ODPP_NONE on failure. */
933static odp_port_t
e44768b7 934choose_port(struct dp_netdev *dp, const char *name)
59e6d833 935 OVS_REQUIRES(dp->port_mutex)
e44768b7 936{
4e022ec0 937 uint32_t port_no;
e44768b7
JP
938
939 if (dp->class != &dpif_netdev_class) {
940 const char *p;
941 int start_no = 0;
942
943 /* If the port name begins with "br", start the number search at
944 * 100 to make writing tests easier. */
945 if (!strncmp(name, "br", 2)) {
946 start_no = 100;
947 }
948
949 /* If the port name contains a number, try to assign that port number.
950 * This can make writing unit tests easier because port numbers are
951 * predictable. */
952 for (p = name; *p != '\0'; p++) {
953 if (isdigit((unsigned char) *p)) {
954 port_no = start_no + strtol(p, NULL, 10);
ff073a71
BP
955 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
956 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 957 return u32_to_odp(port_no);
e44768b7
JP
958 }
959 break;
960 }
961 }
962 }
963
ff073a71
BP
964 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
965 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
4e022ec0 966 return u32_to_odp(port_no);
e44768b7
JP
967 }
968 }
969
4e022ec0 970 return ODPP_NONE;
e44768b7
JP
971}
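/* Illustrative example (editor's sketch, not part of the original source):
 * for a dummy datapath, choose_port() first tries to derive the number from
 * the port name, which keeps port numbers predictable in unit tests:
 *
 *     "p3"   -> start_no == 0,   first digits "3"  -> port 3
 *     "br10" -> start_no == 100, first digits "10" -> port 110
 *
 * If the derived number is zero, already in use, or the name contains no
 * digits, the final loop falls back to the lowest free port number >= 1,
 * returning ODPP_NONE only if the whole 16-bit range is exhausted.
 */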
972
72865317 973static int
614c4892
BP
974create_dp_netdev(const char *name, const struct dpif_class *class,
975 struct dp_netdev **dpp)
8a4e3a85 976 OVS_REQUIRES(dp_netdev_mutex)
72865317
BP
977{
978 struct dp_netdev *dp;
979 int error;
72865317 980
462278db 981 dp = xzalloc(sizeof *dp);
8a4e3a85
BP
982 shash_add(&dp_netdevs, name, dp);
983
984 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
985 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
6a8267c5 986 ovs_refcount_init(&dp->ref_cnt);
1a65ba85 987 atomic_flag_clear(&dp->destroyed);
8a4e3a85 988
59e6d833 989 ovs_mutex_init(&dp->port_mutex);
e9985d6a 990 hmap_init(&dp->ports);
d33ed218 991 dp->port_seq = seq_create();
6b31e073
RW
992 fat_rwlock_init(&dp->upcall_rwlock);
993
a6a426d6
IM
994 dp->reconfigure_seq = seq_create();
995 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
996
6b31e073
RW
997 /* Disable upcalls by default. */
998 dp_netdev_disable_upcall(dp);
623540e4 999 dp->upcall_aux = NULL;
6b31e073 1000 dp->upcall_cb = NULL;
e44768b7 1001
5cf3edb3
DDP
1002 conntrack_init(&dp->conntrack);
1003
65f13b50
AW
1004 cmap_init(&dp->poll_threads);
1005 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1006 ovsthread_key_create(&dp->per_pmd_key, NULL);
1007
e9985d6a 1008 ovs_mutex_lock(&dp->port_mutex);
f2eee189 1009 dp_netdev_set_nonpmd(dp);
65f13b50 1010
a3e8437a
TLSC
1011 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1012 "internal"),
1013 ODPP_LOCAL);
59e6d833 1014 ovs_mutex_unlock(&dp->port_mutex);
72865317
BP
1015 if (error) {
1016 dp_netdev_free(dp);
462278db 1017 return error;
72865317
BP
1018 }
1019
a36de779 1020 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
462278db 1021 *dpp = dp;
72865317
BP
1022 return 0;
1023}
1024
a6a426d6
IM
1025static void
1026dp_netdev_request_reconfigure(struct dp_netdev *dp)
1027{
1028 seq_change(dp->reconfigure_seq);
1029}
1030
1031static bool
1032dp_netdev_is_reconf_required(struct dp_netdev *dp)
1033{
1034 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1035}
1036
72865317 1037static int
614c4892 1038dpif_netdev_open(const struct dpif_class *class, const char *name,
4a387741 1039 bool create, struct dpif **dpifp)
72865317 1040{
462278db 1041 struct dp_netdev *dp;
5279f8fd 1042 int error;
462278db 1043
97be1538 1044 ovs_mutex_lock(&dp_netdev_mutex);
462278db
BP
1045 dp = shash_find_data(&dp_netdevs, name);
1046 if (!dp) {
5279f8fd 1047 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
72865317 1048 } else {
5279f8fd
BP
1049 error = (dp->class != class ? EINVAL
1050 : create ? EEXIST
1051 : 0);
1052 }
1053 if (!error) {
1054 *dpifp = create_dpif_netdev(dp);
6b31e073 1055 dp->dpif = *dpifp;
72865317 1056 }
97be1538 1057 ovs_mutex_unlock(&dp_netdev_mutex);
462278db 1058
5279f8fd 1059 return error;
72865317
BP
1060}
1061
88ace79b
DDP
1062static void
1063dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1064 OVS_NO_THREAD_SAFETY_ANALYSIS
1065{
1066 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1067 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1068
1069 /* Before freeing a lock we should release it */
1070 fat_rwlock_unlock(&dp->upcall_rwlock);
1071 fat_rwlock_destroy(&dp->upcall_rwlock);
1072}
1073
8a4e3a85
BP
1074/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1075 * through the 'dp_netdevs' shash while freeing 'dp'. */
1ba530f4
BP
1076static void
1077dp_netdev_free(struct dp_netdev *dp)
8a4e3a85 1078 OVS_REQUIRES(dp_netdev_mutex)
1ba530f4 1079{
e9985d6a 1080 struct dp_netdev_port *port, *next;
4ad28026 1081
8a4e3a85
BP
1082 shash_find_and_delete(&dp_netdevs, dp->name);
1083
65f13b50
AW
1084 dp_netdev_destroy_all_pmds(dp);
1085 ovs_mutex_destroy(&dp->non_pmd_mutex);
1086 ovsthread_key_delete(dp->per_pmd_key);
6c3eee82 1087
5cf3edb3
DDP
1088 conntrack_destroy(&dp->conntrack);
1089
59e6d833 1090 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 1091 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
c40b890f 1092 do_del_port(dp, port);
1ba530f4 1093 }
59e6d833 1094 ovs_mutex_unlock(&dp->port_mutex);
d916785c 1095 cmap_destroy(&dp->poll_threads);
51852a57 1096
a6a426d6
IM
1097 seq_destroy(dp->reconfigure_seq);
1098
d33ed218 1099 seq_destroy(dp->port_seq);
e9985d6a 1100 hmap_destroy(&dp->ports);
3186ea46 1101 ovs_mutex_destroy(&dp->port_mutex);
88ace79b
DDP
1102
1103 /* Upcalls must be disabled at this point */
1104 dp_netdev_destroy_upcall_lock(dp);
9bbf1c3d 1105
f2eee189 1106 free(dp->pmd_cmask);
8a4e3a85 1107 free(CONST_CAST(char *, dp->name));
72865317
BP
1108 free(dp);
1109}
1110
8a4e3a85
BP
1111static void
1112dp_netdev_unref(struct dp_netdev *dp)
1113{
1114 if (dp) {
1115 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1116 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1117 ovs_mutex_lock(&dp_netdev_mutex);
24f83812 1118 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
8a4e3a85
BP
1119 dp_netdev_free(dp);
1120 }
1121 ovs_mutex_unlock(&dp_netdev_mutex);
1122 }
1123}
1124
72865317
BP
1125static void
1126dpif_netdev_close(struct dpif *dpif)
1127{
1128 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1129
8a4e3a85 1130 dp_netdev_unref(dp);
72865317
BP
1131 free(dpif);
1132}
1133
1134static int
7dab847a 1135dpif_netdev_destroy(struct dpif *dpif)
72865317
BP
1136{
1137 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd 1138
6a8267c5 1139 if (!atomic_flag_test_and_set(&dp->destroyed)) {
24f83812 1140 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
6a8267c5
BP
1141 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1142 OVS_NOT_REACHED();
1143 }
1144 }
5279f8fd 1145
72865317
BP
1146 return 0;
1147}
1148
eb94da30
DDP
1149/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1150 * load/store semantics. While the increment is not atomic, the load and
1151 * store operations are, making it impossible to read inconsistent values.
1152 *
1153 * This is used to update thread local stats counters. */
1154static void
1155non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1156{
1157 unsigned long long tmp;
1158
1159 atomic_read_relaxed(var, &tmp);
1160 tmp += n;
1161 atomic_store_relaxed(var, tmp);
1162}
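/* Illustrative example (editor's sketch, not part of the original source):
 * only the owning pmd thread writes its counters, so the relaxed
 * read-modify-write above is safe, while other threads read a possibly
 * stale but never torn value:
 *
 *     // pmd thread (single writer); 'batch_cnt' is hypothetical:
 *     non_atomic_ullong_add(&pmd->stats.n[DP_STAT_MASKED_HIT], batch_cnt);
 *
 *     // another thread (reader), as in dpif_netdev_get_stats() below:
 *     unsigned long long n;
 *     atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
 */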
1163
72865317 1164static int
a8d9304d 1165dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
72865317
BP
1166{
1167 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed 1168 struct dp_netdev_pmd_thread *pmd;
8a4e3a85 1169
1c1e46ed
AW
1170 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1171 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
eb94da30 1172 unsigned long long n;
1c1e46ed 1173 stats->n_flows += cmap_count(&pmd->flow_table);
eb94da30 1174
abcf3ef4
DDP
1175 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1176 stats->n_hit += n;
1177 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
eb94da30
DDP
1178 stats->n_hit += n;
1179 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1180 stats->n_missed += n;
1181 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1182 stats->n_lost += n;
51852a57 1183 }
1ce3fa06 1184 stats->n_masks = UINT32_MAX;
847108dc 1185 stats->n_mask_hit = UINT64_MAX;
5279f8fd 1186
72865317
BP
1187 return 0;
1188}
1189
e4cfed38 1190static void
65f13b50 1191dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
e4cfed38 1192{
65f13b50
AW
1193 int old_seq;
1194
accf8626 1195 if (pmd->core_id == NON_PMD_CORE_ID) {
d0cca6c3
DDP
1196 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1197 ovs_mutex_lock(&pmd->port_mutex);
1198 pmd_load_cached_ports(pmd);
1199 ovs_mutex_unlock(&pmd->port_mutex);
1200 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
accf8626
AW
1201 return;
1202 }
1203
1204 ovs_mutex_lock(&pmd->cond_mutex);
65f13b50 1205 atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
accf8626
AW
1206 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1207 ovs_mutex_unlock(&pmd->cond_mutex);
65f13b50 1208}
e4cfed38 1209
59e6d833
BP
1210static uint32_t
1211hash_port_no(odp_port_t port_no)
1212{
1213 return hash_int(odp_to_u32(port_no), 0);
1214}
1215
72865317 1216static int
a3e8437a 1217port_create(const char *devname, const char *type,
b8d29252 1218 odp_port_t port_no, struct dp_netdev_port **portp)
72865317 1219{
4b609110 1220 struct netdev_saved_flags *sf;
72865317 1221 struct dp_netdev_port *port;
2499a8ce 1222 enum netdev_flags flags;
b8d29252
DDP
1223 struct netdev *netdev;
1224 int n_open_rxqs = 0;
324c8374 1225 int n_cores = 0;
b8d29252 1226 int i, error;
324c8374 1227 bool dynamic_txqs = false;
72865317 1228
b8d29252 1229 *portp = NULL;
72865317
BP
1230
1231 /* Open and validate network device. */
a3e8437a 1232 error = netdev_open(devname, type, &netdev);
72865317 1233 if (error) {
b8d29252 1234 return error;
72865317 1235 }
72865317
BP
1236 /* XXX reject non-Ethernet devices */
1237
2499a8ce
AC
1238 netdev_get_flags(netdev, &flags);
1239 if (flags & NETDEV_LOOPBACK) {
1240 VLOG_ERR("%s: cannot add a loopback device", devname);
d17f4f08 1241 error = EINVAL;
b8d29252 1242 goto out;
2499a8ce
AC
1243 }
1244
5a034064 1245 if (netdev_is_pmd(netdev)) {
324c8374 1246 n_cores = ovs_numa_get_n_cores();
5a034064
AW
1247
1248 if (n_cores == OVS_CORE_UNSPEC) {
1249 VLOG_ERR("%s, cannot get cpu core info", devname);
d17f4f08 1250 error = ENOENT;
b8d29252 1251 goto out;
5a034064
AW
1252 }
1253 /* There can only be ovs_numa_get_n_cores() pmd threads,
3bcc10c0
DDP
1254 * so create a txq for each, and one extra for the non
1255 * pmd threads. */
050c60bf 1256 error = netdev_set_tx_multiq(netdev, n_cores + 1);
7251515e 1257 if (error && (error != EOPNOTSUPP)) {
5a034064 1258 VLOG_ERR("%s, cannot set multiq", devname);
b8d29252 1259 goto out;
5a034064
AW
1260 }
1261 }
050c60bf
DDP
1262
1263 if (netdev_is_reconf_required(netdev)) {
1264 error = netdev_reconfigure(netdev);
1265 if (error) {
1266 goto out;
1267 }
1268 }
1269
324c8374
IM
1270 if (netdev_is_pmd(netdev)) {
1271 if (netdev_n_txq(netdev) < n_cores + 1) {
1272 dynamic_txqs = true;
1273 }
1274 }
1275
e4cfed38 1276 port = xzalloc(sizeof *port);
35303d71 1277 port->port_no = port_no;
e4cfed38 1278 port->netdev = netdev;
490e82af 1279 port->n_rxq = netdev_n_rxq(netdev);
3eb67853 1280 port->rxqs = xcalloc(port->n_rxq, sizeof *port->rxqs);
324c8374 1281 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
e4cfed38 1282 port->type = xstrdup(type);
324c8374
IM
1283 ovs_mutex_init(&port->txq_used_mutex);
1284 port->dynamic_txqs = dynamic_txqs;
d17f4f08 1285
490e82af 1286 for (i = 0; i < port->n_rxq; i++) {
3eb67853 1287 error = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
d17f4f08 1288 if (error) {
55c955bd
PS
1289 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
1290 devname, ovs_strerror(errno));
d17f4f08 1291 goto out_rxq_close;
55c955bd 1292 }
3eb67853 1293 port->rxqs[i].core_id = -1;
d17f4f08 1294 n_open_rxqs++;
7b6b0ef4
BP
1295 }
1296
4b609110 1297 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
72865317 1298 if (error) {
d17f4f08 1299 goto out_rxq_close;
72865317 1300 }
4b609110 1301 port->sf = sf;
e4cfed38 1302
b8d29252 1303 *portp = port;
72865317
BP
1304
1305 return 0;
d17f4f08
DDP
1306
1307out_rxq_close:
1308 for (i = 0; i < n_open_rxqs; i++) {
3eb67853 1309 netdev_rxq_close(port->rxqs[i].rxq);
d17f4f08 1310 }
324c8374 1311 ovs_mutex_destroy(&port->txq_used_mutex);
d17f4f08 1312 free(port->type);
324c8374 1313 free(port->txq_used);
3eb67853 1314 free(port->rxqs);
d17f4f08 1315 free(port);
b8d29252 1316
d17f4f08 1317out:
b8d29252 1318 netdev_close(netdev);
d17f4f08 1319 return error;
72865317
BP
1320}
1321
b8d29252
DDP
1322static int
1323do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1324 odp_port_t port_no)
1325 OVS_REQUIRES(dp->port_mutex)
1326{
1327 struct dp_netdev_port *port;
1328 int error;
1329
1330 /* Reject devices already in 'dp'. */
1331 if (!get_port_by_name(dp, devname, &port)) {
1332 return EEXIST;
1333 }
1334
a3e8437a 1335 error = port_create(devname, type, port_no, &port);
b8d29252
DDP
1336 if (error) {
1337 return error;
1338 }
1339
b8d29252 1340 if (netdev_is_pmd(port->netdev)) {
d0cca6c3
DDP
1341 int numa_id = netdev_get_numa_id(port->netdev);
1342
1343 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
1344 dp_netdev_set_pmds_on_numa(dp, numa_id);
b8d29252 1345 }
d0cca6c3
DDP
1346
1347 dp_netdev_add_port_to_pmds(dp, port);
1348
e9985d6a 1349 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
b8d29252
DDP
1350 seq_change(dp->port_seq);
1351
1352 return 0;
1353}
1354
247527db
BP
1355static int
1356dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
4e022ec0 1357 odp_port_t *port_nop)
247527db
BP
1358{
1359 struct dp_netdev *dp = get_dp_netdev(dpif);
3aa30359
BP
1360 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1361 const char *dpif_port;
4e022ec0 1362 odp_port_t port_no;
5279f8fd 1363 int error;
247527db 1364
59e6d833 1365 ovs_mutex_lock(&dp->port_mutex);
3aa30359 1366 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
4e022ec0 1367 if (*port_nop != ODPP_NONE) {
ff073a71
BP
1368 port_no = *port_nop;
1369 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
232dfa4a 1370 } else {
3aa30359 1371 port_no = choose_port(dp, dpif_port);
5279f8fd 1372 error = port_no == ODPP_NONE ? EFBIG : 0;
232dfa4a 1373 }
5279f8fd 1374 if (!error) {
247527db 1375 *port_nop = port_no;
5279f8fd 1376 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
247527db 1377 }
59e6d833 1378 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1379
1380 return error;
72865317
BP
1381}
1382
1383static int
4e022ec0 1384dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
72865317
BP
1385{
1386 struct dp_netdev *dp = get_dp_netdev(dpif);
5279f8fd
BP
1387 int error;
1388
59e6d833 1389 ovs_mutex_lock(&dp->port_mutex);
c40b890f
BP
1390 if (port_no == ODPP_LOCAL) {
1391 error = EINVAL;
1392 } else {
1393 struct dp_netdev_port *port;
1394
1395 error = get_port_by_number(dp, port_no, &port);
1396 if (!error) {
1397 do_del_port(dp, port);
1398 }
1399 }
59e6d833 1400 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd
BP
1401
1402 return error;
72865317
BP
1403}
1404
1405static bool
4e022ec0 1406is_valid_port_number(odp_port_t port_no)
72865317 1407{
ff073a71
BP
1408 return port_no != ODPP_NONE;
1409}
1410
1411static struct dp_netdev_port *
1412dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
e9985d6a 1413 OVS_REQUIRES(dp->port_mutex)
ff073a71
BP
1414{
1415 struct dp_netdev_port *port;
1416
e9985d6a 1417 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
35303d71 1418 if (port->port_no == port_no) {
ff073a71
BP
1419 return port;
1420 }
1421 }
1422 return NULL;
72865317
BP
1423}
1424
1425static int
1426get_port_by_number(struct dp_netdev *dp,
4e022ec0 1427 odp_port_t port_no, struct dp_netdev_port **portp)
e9985d6a 1428 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1429{
1430 if (!is_valid_port_number(port_no)) {
1431 *portp = NULL;
1432 return EINVAL;
1433 } else {
ff073a71 1434 *portp = dp_netdev_lookup_port(dp, port_no);
72865317
BP
1435 return *portp ? 0 : ENOENT;
1436 }
1437}
1438
b284085e 1439static void
62453dad 1440port_destroy(struct dp_netdev_port *port)
b284085e 1441{
62453dad
DDP
1442 if (!port) {
1443 return;
b284085e 1444 }
b284085e 1445
62453dad
DDP
1446 netdev_close(port->netdev);
1447 netdev_restore_flags(port->sf);
accf8626 1448
62453dad 1449 for (unsigned i = 0; i < port->n_rxq; i++) {
3eb67853 1450 netdev_rxq_close(port->rxqs[i].rxq);
b284085e 1451 }
324c8374 1452 ovs_mutex_destroy(&port->txq_used_mutex);
3eb67853 1453 free(port->rxq_affinity_list);
324c8374 1454 free(port->txq_used);
3eb67853 1455 free(port->rxqs);
62453dad
DDP
1456 free(port->type);
1457 free(port);
b284085e
PS
1458}
1459
72865317
BP
1460static int
1461get_port_by_name(struct dp_netdev *dp,
1462 const char *devname, struct dp_netdev_port **portp)
59e6d833 1463 OVS_REQUIRES(dp->port_mutex)
72865317
BP
1464{
1465 struct dp_netdev_port *port;
1466
e9985d6a 1467 HMAP_FOR_EACH (port, node, &dp->ports) {
3efb6063 1468 if (!strcmp(netdev_get_name(port->netdev), devname)) {
72865317
BP
1469 *portp = port;
1470 return 0;
1471 }
1472 }
1473 return ENOENT;
1474}
1475
347ba9bb
IM
1476static int
1477get_n_pmd_threads(struct dp_netdev *dp)
1478{
1479 /* There is one non pmd thread in dp->poll_threads */
1480 return cmap_count(&dp->poll_threads) - 1;
1481}
1482
65f13b50
AW
1483static int
1484get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
1485{
1486 struct dp_netdev_pmd_thread *pmd;
1487 int n_pmds = 0;
1488
1489 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1490 if (pmd->numa_id == numa_id) {
1491 n_pmds++;
1492 }
1493 }
1494
1495 return n_pmds;
1496}
1497
1498/* Returns 'true' if there is a port with pmd netdev and the netdev
1499 * is on numa node 'numa_id'. */
1500static bool
1501has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
e9985d6a 1502 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
1503{
1504 struct dp_netdev_port *port;
1505
e9985d6a 1506 HMAP_FOR_EACH (port, node, &dp->ports) {
65f13b50
AW
1507 if (netdev_is_pmd(port->netdev)
1508 && netdev_get_numa_id(port->netdev) == numa_id) {
1509 return true;
1510 }
1511 }
1512
1513 return false;
1514}
1515
1516
c40b890f
BP
1517static void
1518do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
59e6d833 1519 OVS_REQUIRES(dp->port_mutex)
72865317 1520{
e9985d6a 1521 hmap_remove(&dp->ports, &port->node);
d33ed218 1522 seq_change(dp->port_seq);
d0cca6c3
DDP
1523
1524 dp_netdev_del_port_from_all_pmds(dp, port);
1525
e4cfed38 1526 if (netdev_is_pmd(port->netdev)) {
65f13b50
AW
1527 int numa_id = netdev_get_numa_id(port->netdev);
1528
ae7ad0a1
IM
1529 /* PMD threads cannot be on an invalid numa node. */
1530 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
65f13b50 1531 /* If there is no netdev on the numa node, delete the pmd threads
d0cca6c3 1532 * for that numa. */
65f13b50
AW
1533 if (!has_pmd_port_for_numa(dp, numa_id)) {
1534 dp_netdev_del_pmds_on_numa(dp, numa_id);
1535 }
e4cfed38 1536 }
72865317 1537
62453dad 1538 port_destroy(port);
72865317
BP
1539}
1540
1541static void
4c738a8d
BP
1542answer_port_query(const struct dp_netdev_port *port,
1543 struct dpif_port *dpif_port)
72865317 1544{
3efb6063 1545 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
0cbfe35d 1546 dpif_port->type = xstrdup(port->type);
35303d71 1547 dpif_port->port_no = port->port_no;
72865317
BP
1548}
1549
1550static int
4e022ec0 1551dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
4c738a8d 1552 struct dpif_port *dpif_port)
72865317
BP
1553{
1554 struct dp_netdev *dp = get_dp_netdev(dpif);
1555 struct dp_netdev_port *port;
1556 int error;
1557
e9985d6a 1558 ovs_mutex_lock(&dp->port_mutex);
72865317 1559 error = get_port_by_number(dp, port_no, &port);
4afba28d 1560 if (!error && dpif_port) {
4c738a8d 1561 answer_port_query(port, dpif_port);
72865317 1562 }
e9985d6a 1563 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1564
72865317
BP
1565 return error;
1566}
1567
1568static int
1569dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
4c738a8d 1570 struct dpif_port *dpif_port)
72865317
BP
1571{
1572 struct dp_netdev *dp = get_dp_netdev(dpif);
1573 struct dp_netdev_port *port;
1574 int error;
1575
59e6d833 1576 ovs_mutex_lock(&dp->port_mutex);
72865317 1577 error = get_port_by_name(dp, devname, &port);
4afba28d 1578 if (!error && dpif_port) {
4c738a8d 1579 answer_port_query(port, dpif_port);
72865317 1580 }
59e6d833 1581 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1582
72865317
BP
1583 return error;
1584}
1585
61e7deb1
BP
1586static void
1587dp_netdev_flow_free(struct dp_netdev_flow *flow)
1588{
61e7deb1 1589 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
61e7deb1
BP
1590 free(flow);
1591}
1592
ed79f89a
DDP
1593static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1594{
1595 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1596 ovsrcu_postpone(dp_netdev_flow_free, flow);
1597 }
1598}
1599
70e5ed6f
JS
1600static uint32_t
1601dp_netdev_flow_hash(const ovs_u128 *ufid)
1602{
1603 return ufid->u32[0];
1604}
1605
3453b4d6
JS
1606static inline struct dpcls *
1607dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1608 odp_port_t in_port)
1609{
1610 struct dpcls *cls;
1611 uint32_t hash = hash_port_no(in_port);
1612 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1613 if (cls->in_port == in_port) {
1614 /* Port classifier exists already */
1615 return cls;
1616 }
1617 }
1618 return NULL;
1619}
1620
1621static inline struct dpcls *
1622dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1623 odp_port_t in_port)
1624 OVS_REQUIRES(pmd->flow_mutex)
1625{
1626 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1627 uint32_t hash = hash_port_no(in_port);
1628
1629 if (!cls) {
1630 /* Create new classifier for in_port */
1631 cls = xmalloc(sizeof(*cls));
1632 dpcls_init(cls);
1633 cls->in_port = in_port;
1634 cmap_insert(&pmd->classifiers, &cls->node, hash);
1635 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1636 }
1637 return cls;
1638}
1639
72865317 1640static void
1c1e46ed
AW
1641dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1642 struct dp_netdev_flow *flow)
1643 OVS_REQUIRES(pmd->flow_mutex)
72865317 1644{
9f361d6b 1645 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
3453b4d6
JS
1646 struct dpcls *cls;
1647 odp_port_t in_port = flow->flow.in_port.odp_port;
2c0ea78f 1648
3453b4d6
JS
1649 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1650 ovs_assert(cls != NULL);
1651 dpcls_remove(cls, &flow->cr);
1c1e46ed 1652 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
9bbf1c3d 1653 flow->dead = true;
ed79f89a
DDP
1654
1655 dp_netdev_flow_unref(flow);
72865317
BP
1656}
1657
1658static void
1c1e46ed 1659dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
72865317 1660{
78c8df12 1661 struct dp_netdev_flow *netdev_flow;
72865317 1662
1c1e46ed
AW
1663 ovs_mutex_lock(&pmd->flow_mutex);
1664 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1665 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 1666 }
1c1e46ed 1667 ovs_mutex_unlock(&pmd->flow_mutex);
72865317
BP
1668}
1669
1670static int
1671dpif_netdev_flow_flush(struct dpif *dpif)
1672{
1673 struct dp_netdev *dp = get_dp_netdev(dpif);
1c1e46ed
AW
1674 struct dp_netdev_pmd_thread *pmd;
1675
1676 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1677 dp_netdev_pmd_flow_flush(pmd);
1678 }
5279f8fd 1679
72865317
BP
1680 return 0;
1681}
1682
b0ec0f27 1683struct dp_netdev_port_state {
e9985d6a 1684 struct hmap_position position;
4c738a8d 1685 char *name;
b0ec0f27
BP
1686};
1687
1688static int
1689dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1690{
1691 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1692 return 0;
1693}
1694
72865317 1695static int
b0ec0f27 1696dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
4c738a8d 1697 struct dpif_port *dpif_port)
72865317 1698{
b0ec0f27 1699 struct dp_netdev_port_state *state = state_;
72865317 1700 struct dp_netdev *dp = get_dp_netdev(dpif);
e9985d6a 1701 struct hmap_node *node;
ff073a71 1702 int retval;
72865317 1703
e9985d6a
DDP
1704 ovs_mutex_lock(&dp->port_mutex);
1705 node = hmap_at_position(&dp->ports, &state->position);
ff073a71
BP
1706 if (node) {
1707 struct dp_netdev_port *port;
5279f8fd 1708
ff073a71
BP
1709 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1710
1711 free(state->name);
1712 state->name = xstrdup(netdev_get_name(port->netdev));
1713 dpif_port->name = state->name;
1714 dpif_port->type = port->type;
35303d71 1715 dpif_port->port_no = port->port_no;
ff073a71
BP
1716
1717 retval = 0;
1718 } else {
1719 retval = EOF;
72865317 1720 }
e9985d6a 1721 ovs_mutex_unlock(&dp->port_mutex);
5279f8fd 1722
ff073a71 1723 return retval;
b0ec0f27
BP
1724}
1725
1726static int
4c738a8d 1727dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
b0ec0f27 1728{
4c738a8d
BP
1729 struct dp_netdev_port_state *state = state_;
1730 free(state->name);
b0ec0f27
BP
1731 free(state);
1732 return 0;
72865317
BP
1733}
1734
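/* A minimal usage sketch: 'port_dump_example' is a hypothetical helper that
 * shows how the three dump callbacks above are typically driven.  _start()
 * allocates the cursor, _next() fills one 'struct dpif_port' per call until
 * it returns EOF, and _done() releases the cursor.  'dpif' is assumed to be
 * an already-open userspace datapath handle. */
static void
port_dump_example(const struct dpif *dpif)
{
    struct dpif_port port;
    void *state;

    dpif_netdev_port_dump_start(dpif, &state);
    while (!dpif_netdev_port_dump_next(dpif, state, &port)) {
        /* 'port.name', 'port.type' and 'port.port_no' are valid here.
         * 'port.name' points into 'state' and is reused on the next call. */
    }
    dpif_netdev_port_dump_done(dpif, state);
}
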
1735static int
67a4917b 1736dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
72865317
BP
1737{
1738 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
d33ed218 1739 uint64_t new_port_seq;
5279f8fd
BP
1740 int error;
1741
d33ed218
BP
1742 new_port_seq = seq_read(dpif->dp->port_seq);
1743 if (dpif->last_port_seq != new_port_seq) {
1744 dpif->last_port_seq = new_port_seq;
5279f8fd 1745 error = ENOBUFS;
72865317 1746 } else {
5279f8fd 1747 error = EAGAIN;
72865317 1748 }
5279f8fd
BP
1749
1750 return error;
72865317
BP
1751}
1752
1753static void
1754dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1755{
1756 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
5279f8fd 1757
d33ed218 1758 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
8a4e3a85
BP
1759}
1760
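/* A short, illustrative sketch of the seq-based change detection implemented
 * above ('port_change_poll_example' is a hypothetical caller): poll for port
 * changes, then register to be woken up when 'port_seq' changes again. */
static void
port_change_poll_example(const struct dpif *dpif)
{
    char *devname = NULL;

    if (dpif_netdev_port_poll(dpif, &devname) == ENOBUFS) {
        /* At least one port was added or removed since the previous call. */
    }
    dpif_netdev_port_poll_wait(dpif);  /* Wakes the caller on the next change. */
}
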
1761static struct dp_netdev_flow *
0de8783a 1762dp_netdev_flow_cast(const struct dpcls_rule *cr)
8a4e3a85
BP
1763{
1764 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
72865317
BP
1765}
1766
9bbf1c3d
DDP
1767static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1768{
1769 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1770}
1771
79df317f
DDP
1772/* netdev_flow_key utilities.
1773 *
1774 * netdev_flow_key is basically a miniflow. We use these functions
1775 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1776 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1777 *
1778 * - Since we are dealing exclusively with miniflows created by
1779 * miniflow_extract(), if the map is different the miniflow is different.
1780 * Therefore we can be faster by comparing the map and the miniflow in a
1781 * single memcmp().
5fcff47b 1782 * - These functions can be inlined by the compiler. */
79df317f 1783
361d808d 1784/* Given the number of bits set in miniflow's maps, returns the size of the
caeb4906 1785 * 'netdev_flow_key.mf' */
361d808d
JR
1786static inline size_t
1787netdev_flow_key_size(size_t flow_u64s)
79df317f 1788{
361d808d 1789 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
79df317f
DDP
1790}
1791
79df317f
DDP
1792static inline bool
1793netdev_flow_key_equal(const struct netdev_flow_key *a,
0de8783a
JR
1794 const struct netdev_flow_key *b)
1795{
caeb4906
JR
1796 /* 'b->len' may not be set yet. */
1797 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
0de8783a
JR
1798}
1799
1800/* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
d79a39fe 1801 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
0de8783a
JR
1802 * generated by miniflow_extract. */
1803static inline bool
1804netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1805 const struct miniflow *mf)
79df317f 1806{
caeb4906 1807 return !memcmp(&key->mf, mf, key->len);
79df317f
DDP
1808}
1809
1810static inline void
1811netdev_flow_key_clone(struct netdev_flow_key *dst,
0de8783a
JR
1812 const struct netdev_flow_key *src)
1813{
caeb4906
JR
1814 memcpy(dst, src,
1815 offsetof(struct netdev_flow_key, mf) + src->len);
0de8783a
JR
1816}
1817
1818/* Slow. */
1819static void
1820netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1821 const struct flow *src)
1822{
cf62fa4c 1823 struct dp_packet packet;
0de8783a 1824 uint64_t buf_stub[512 / 8];
0de8783a 1825
cf62fa4c
PS
1826 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1827 pkt_metadata_from_flow(&packet.md, src);
0de8783a 1828 flow_compose(&packet, src);
cf62fa4c
PS
1829 miniflow_extract(&packet, &dst->mf);
1830 dp_packet_uninit(&packet);
0de8783a 1831
361d808d 1832 dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
0de8783a
JR
1833 dst->hash = 0; /* Not computed yet. */
1834}
1835
1836/* Initialize a netdev_flow_key 'mask' from 'match'. */
1837static inline void
1838netdev_flow_mask_init(struct netdev_flow_key *mask,
1839 const struct match *match)
1840{
09b0fa9c 1841 uint64_t *dst = miniflow_values(&mask->mf);
5fcff47b 1842 struct flowmap fmap;
0de8783a 1843 uint32_t hash = 0;
5fcff47b 1844 size_t idx;
0de8783a
JR
1845
1846 /* Only check masks that make sense for the flow. */
5fcff47b
JR
1847 flow_wc_map(&match->flow, &fmap);
1848 flowmap_init(&mask->mf.map);
0de8783a 1849
5fcff47b
JR
1850 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1851 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
0de8783a 1852
5fcff47b
JR
1853 if (mask_u64) {
1854 flowmap_set(&mask->mf.map, idx, 1);
1855 *dst++ = mask_u64;
1856 hash = hash_add64(hash, mask_u64);
0de8783a 1857 }
0de8783a
JR
1858 }
1859
5fcff47b 1860 map_t map;
0de8783a 1861
5fcff47b
JR
1862 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1863 hash = hash_add64(hash, map);
1864 }
0de8783a 1865
5fcff47b 1866 size_t n = dst - miniflow_get_values(&mask->mf);
0de8783a 1867
d70e8c28 1868 mask->hash = hash_finish(hash, n * 8);
0de8783a
JR
1869 mask->len = netdev_flow_key_size(n);
1870}
1871
361d808d 1872/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
0de8783a
JR
1873static inline void
1874netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1875 const struct flow *flow,
1876 const struct netdev_flow_key *mask)
79df317f 1877{
09b0fa9c
JR
1878 uint64_t *dst_u64 = miniflow_values(&dst->mf);
1879 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
0de8783a 1880 uint32_t hash = 0;
d70e8c28 1881 uint64_t value;
0de8783a
JR
1882
1883 dst->len = mask->len;
361d808d 1884 dst->mf = mask->mf; /* Copy maps. */
0de8783a 1885
5fcff47b 1886 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
d70e8c28
JR
1887 *dst_u64 = value & *mask_u64++;
1888 hash = hash_add64(hash, *dst_u64++);
0de8783a 1889 }
09b0fa9c
JR
1890 dst->hash = hash_finish(hash,
1891 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
0de8783a
JR
1892}
1893
5fcff47b
JR
1894/* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
1895#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
1896 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
0de8783a
JR
1897
1898/* Returns a hash value for the bits of 'key' where there are 1-bits in
1899 * 'mask'. */
1900static inline uint32_t
1901netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1902 const struct netdev_flow_key *mask)
1903{
09b0fa9c 1904 const uint64_t *p = miniflow_get_values(&mask->mf);
0de8783a 1905 uint32_t hash = 0;
5fcff47b 1906 uint64_t value;
0de8783a 1907
5fcff47b
JR
1908 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
1909 hash = hash_add64(hash, value & *p++);
0de8783a
JR
1910 }
1911
09b0fa9c 1912 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
79df317f
DDP
1913}
1914
9bbf1c3d
DDP
1915static inline bool
1916emc_entry_alive(struct emc_entry *ce)
1917{
1918 return ce->flow && !ce->flow->dead;
1919}
1920
1921static void
1922emc_clear_entry(struct emc_entry *ce)
1923{
1924 if (ce->flow) {
1925 dp_netdev_flow_unref(ce->flow);
1926 ce->flow = NULL;
1927 }
1928}
1929
1930static inline void
1931emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
0de8783a 1932 const struct netdev_flow_key *key)
9bbf1c3d
DDP
1933{
1934 if (ce->flow != flow) {
1935 if (ce->flow) {
1936 dp_netdev_flow_unref(ce->flow);
1937 }
1938
1939 if (dp_netdev_flow_ref(flow)) {
1940 ce->flow = flow;
1941 } else {
1942 ce->flow = NULL;
1943 }
1944 }
0de8783a
JR
1945 if (key) {
1946 netdev_flow_key_clone(&ce->key, key);
9bbf1c3d
DDP
1947 }
1948}
1949
1950static inline void
0de8783a 1951emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
9bbf1c3d
DDP
1952 struct dp_netdev_flow *flow)
1953{
1954 struct emc_entry *to_be_replaced = NULL;
1955 struct emc_entry *current_entry;
1956
0de8783a
JR
1957 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1958 if (netdev_flow_key_equal(&current_entry->key, key)) {
9bbf1c3d 1959 /* We found the entry with the 'mf' miniflow */
0de8783a 1960 emc_change_entry(current_entry, flow, NULL);
9bbf1c3d
DDP
1961 return;
1962 }
1963
1964 /* Replacement policy: put the flow in an empty (not alive) entry, or
1965 * in the first entry where it can be */
1966 if (!to_be_replaced
1967 || (emc_entry_alive(to_be_replaced)
1968 && !emc_entry_alive(current_entry))
0de8783a 1969 || current_entry->key.hash < to_be_replaced->key.hash) {
9bbf1c3d
DDP
1970 to_be_replaced = current_entry;
1971 }
1972 }
1973 /* We didn't find the miniflow in the cache.
1974 * The 'to_be_replaced' entry is where the new flow will be stored */
1975
0de8783a 1976 emc_change_entry(to_be_replaced, flow, key);
9bbf1c3d
DDP
1977}
1978
1979static inline struct dp_netdev_flow *
0de8783a 1980emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
9bbf1c3d
DDP
1981{
1982 struct emc_entry *current_entry;
1983
0de8783a
JR
1984 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1985 if (current_entry->key.hash == key->hash
1986 && emc_entry_alive(current_entry)
1987 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
9bbf1c3d 1988
0de8783a 1989 /* We found the entry with the 'key->mf' miniflow */
9bbf1c3d
DDP
1990 return current_entry->flow;
1991 }
1992 }
1993
1994 return NULL;
1995}
1996
72865317 1997static struct dp_netdev_flow *
3453b4d6
JS
1998dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
1999 const struct netdev_flow_key *key,
2000 int *lookup_num_p)
2c0ea78f 2001{
3453b4d6 2002 struct dpcls *cls;
0de8783a 2003 struct dpcls_rule *rule;
3453b4d6
JS
2004 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, in_port));
2005 struct dp_netdev_flow *netdev_flow = NULL;
2c0ea78f 2006
3453b4d6
JS
2007 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2008 if (OVS_LIKELY(cls)) {
2009 dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
2010 netdev_flow = dp_netdev_flow_cast(rule);
2011 }
8a4e3a85 2012 return netdev_flow;
2c0ea78f
GS
2013}
2014
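/* An illustrative sketch of how the exact match cache and the per-port
 * classifier defined above are commonly combined; 'emc_lookup_or_classify'
 * is a hypothetical helper, not the datapath's actual receive path, and
 * 'key' is assumed to carry a valid 'hash' and 'len'.  Probe the EMC first,
 * fall back to dp_netdev_pmd_lookup_flow() on a miss, and remember a hit so
 * later packets with the same miniflow are served from the EMC. */
static inline struct dp_netdev_flow *
emc_lookup_or_classify(struct dp_netdev_pmd_thread *pmd,
                       const struct netdev_flow_key *key)
{
    struct dp_netdev_flow *flow = emc_lookup(&pmd->flow_cache, key);

    if (!flow) {
        /* EMC miss: fall back to the (more expensive) dpcls lookup. */
        flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
        if (flow) {
            /* Cache the result for subsequent packets with the same key. */
            emc_insert(&pmd->flow_cache, key, flow);
        }
    }
    return flow;
}
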
2015static struct dp_netdev_flow *
1c1e46ed
AW
2016dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2017 const ovs_u128 *ufidp, const struct nlattr *key,
2018 size_t key_len)
72865317 2019{
1763b4b8 2020 struct dp_netdev_flow *netdev_flow;
70e5ed6f
JS
2021 struct flow flow;
2022 ovs_u128 ufid;
2023
2024 /* If a UFID is not provided, determine one based on the key. */
2025 if (!ufidp && key && key_len
2026 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
1c1e46ed 2027 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
70e5ed6f
JS
2028 ufidp = &ufid;
2029 }
72865317 2030
70e5ed6f
JS
2031 if (ufidp) {
2032 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1c1e46ed 2033 &pmd->flow_table) {
2ff8484b 2034 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
70e5ed6f
JS
2035 return netdev_flow;
2036 }
72865317
BP
2037 }
2038 }
8a4e3a85 2039
72865317
BP
2040 return NULL;
2041}
2042
2043static void
eb94da30 2044get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1763b4b8 2045 struct dpif_flow_stats *stats)
feebdea2 2046{
eb94da30
DDP
2047 struct dp_netdev_flow *netdev_flow;
2048 unsigned long long n;
2049 long long used;
2050 uint16_t flags;
2051
2052 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2053
2054 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2055 stats->n_packets = n;
2056 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2057 stats->n_bytes = n;
2058 atomic_read_relaxed(&netdev_flow->stats.used, &used);
2059 stats->used = used;
2060 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2061 stats->tcp_flags = flags;
72865317
BP
2062}
2063
7af12bd7
JS
2064/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2065 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2066 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2067 * protect them. */
6fe09f8c 2068static void
70e5ed6f 2069dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
7af12bd7 2070 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
64bb477f 2071 struct dpif_flow *flow, bool terse)
6fe09f8c 2072{
64bb477f
JS
2073 if (terse) {
2074 memset(flow, 0, sizeof *flow);
2075 } else {
2076 struct flow_wildcards wc;
2077 struct dp_netdev_actions *actions;
2078 size_t offset;
5262eea1
JG
2079 struct odp_flow_key_parms odp_parms = {
2080 .flow = &netdev_flow->flow,
2081 .mask = &wc.masks,
2494ccd7 2082 .support = dp_netdev_support,
5262eea1 2083 };
64bb477f
JS
2084
2085 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2086
2087 /* Key */
6fd6ed71 2088 offset = key_buf->size;
64bb477f 2089 flow->key = ofpbuf_tail(key_buf);
5262eea1 2090 odp_flow_key_from_flow(&odp_parms, key_buf);
6fd6ed71 2091 flow->key_len = key_buf->size - offset;
64bb477f
JS
2092
2093 /* Mask */
6fd6ed71 2094 offset = mask_buf->size;
64bb477f 2095 flow->mask = ofpbuf_tail(mask_buf);
ec1f6f32 2096 odp_parms.key_buf = key_buf;
5262eea1 2097 odp_flow_key_from_mask(&odp_parms, mask_buf);
6fd6ed71 2098 flow->mask_len = mask_buf->size - offset;
64bb477f
JS
2099
2100 /* Actions */
2101 actions = dp_netdev_flow_get_actions(netdev_flow);
2102 flow->actions = actions->actions;
2103 flow->actions_len = actions->size;
2104 }
6fe09f8c 2105
70e5ed6f
JS
2106 flow->ufid = netdev_flow->ufid;
2107 flow->ufid_present = true;
1c1e46ed 2108 flow->pmd_id = netdev_flow->pmd_id;
6fe09f8c
JS
2109 get_dpif_flow_stats(netdev_flow, &flow->stats);
2110}
2111
36956a7d 2112static int
8c301900
JR
2113dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2114 const struct nlattr *mask_key,
2115 uint32_t mask_key_len, const struct flow *flow,
9f861c91 2116 struct flow_wildcards *wc)
8c301900 2117{
ca8d3442
DDP
2118 enum odp_key_fitness fitness;
2119
8d8ab6c2 2120 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
ca8d3442
DDP
2121 if (fitness) {
2122 /* This should not happen: it indicates that
2123 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2124 * disagree on the acceptable form of a mask. Log the problem
2125 * as an error, with enough details to enable debugging. */
2126 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2127
2128 if (!VLOG_DROP_ERR(&rl)) {
2129 struct ds s;
8c301900 2130
ca8d3442
DDP
2131 ds_init(&s);
2132 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2133 true);
2134 VLOG_ERR("internal error parsing flow mask %s (%s)",
2135 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2136 ds_destroy(&s);
8c301900 2137 }
ca8d3442
DDP
2138
2139 return EINVAL;
8c301900
JR
2140 }
2141
2142 return 0;
2143}
2144
2145static int
2146dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2147 struct flow *flow)
36956a7d 2148{
586ddea5
BP
2149 odp_port_t in_port;
2150
8d8ab6c2 2151 if (odp_flow_key_to_flow(key, key_len, flow)) {
36956a7d 2152 /* This should not happen: it indicates that odp_flow_key_from_flow()
8c301900
JR
2153 * and odp_flow_key_to_flow() disagree on the acceptable form of a
2154 * flow. Log the problem as an error, with enough details to enable
2155 * debugging. */
36956a7d
BP
2156 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2157
2158 if (!VLOG_DROP_ERR(&rl)) {
2159 struct ds s;
2160
2161 ds_init(&s);
8c301900 2162 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
36956a7d
BP
2163 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2164 ds_destroy(&s);
2165 }
2166
2167 return EINVAL;
2168 }
2169
586ddea5
BP
2170 in_port = flow->in_port.odp_port;
2171 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
18886b60
BP
2172 return EINVAL;
2173 }
2174
5cf3edb3 2175 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
07659514
JS
2176 return EINVAL;
2177 }
2178
36956a7d
BP
2179 return 0;
2180}
2181
72865317 2182static int
6fe09f8c 2183dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
72865317
BP
2184{
2185 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2186 struct dp_netdev_flow *netdev_flow;
1c1e46ed 2187 struct dp_netdev_pmd_thread *pmd;
c673049c
IM
2188 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2189 struct hmapx_node *node;
2190 int error = EINVAL;
2191
2192 if (get->pmd_id == PMD_ID_NULL) {
2193 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2194 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2195 dp_netdev_pmd_unref(pmd);
2196 }
2197 }
2198 } else {
2199 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2200 if (!pmd) {
2201 goto out;
2202 }
2203 hmapx_add(&to_find, pmd);
1c1e46ed
AW
2204 }
2205
c673049c
IM
2206 if (!hmapx_count(&to_find)) {
2207 goto out;
72865317 2208 }
1c1e46ed 2209
c673049c
IM
2210 HMAPX_FOR_EACH (node, &to_find) {
2211 pmd = (struct dp_netdev_pmd_thread *) node->data;
2212 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2213 get->key_len);
2214 if (netdev_flow) {
2215 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2216 get->flow, false);
2217 error = 0;
2218 break;
2219 } else {
2220 error = ENOENT;
2221 }
2222 }
bc4a05c6 2223
c673049c
IM
2224 HMAPX_FOR_EACH (node, &to_find) {
2225 pmd = (struct dp_netdev_pmd_thread *) node->data;
2226 dp_netdev_pmd_unref(pmd);
2227 }
2228out:
2229 hmapx_destroy(&to_find);
5279f8fd 2230 return error;
72865317
BP
2231}
2232
0de8783a 2233static struct dp_netdev_flow *
1c1e46ed
AW
2234dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2235 struct match *match, const ovs_u128 *ufid,
ae2ceebd 2236 const struct nlattr *actions, size_t actions_len)
1c1e46ed 2237 OVS_REQUIRES(pmd->flow_mutex)
72865317 2238{
0de8783a
JR
2239 struct dp_netdev_flow *flow;
2240 struct netdev_flow_key mask;
3453b4d6
JS
2241 struct dpcls *cls;
2242 odp_port_t in_port = match->flow.in_port.odp_port;
ed79f89a 2243
0de8783a
JR
2244 netdev_flow_mask_init(&mask, match);
2245 /* Make sure wc does not have metadata. */
5fcff47b
JR
2246 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2247 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
679ba04c 2248
0de8783a 2249 /* Do not allocate extra space. */
caeb4906 2250 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
1c1e46ed 2251 memset(&flow->stats, 0, sizeof flow->stats);
0de8783a 2252 flow->dead = false;
11e5cf1f 2253 flow->batch = NULL;
bd5131ba 2254 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
0de8783a 2255 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
70e5ed6f 2256 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
0de8783a 2257 ovs_refcount_init(&flow->ref_cnt);
0de8783a 2258 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2c0ea78f 2259
0de8783a 2260 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3453b4d6
JS
2261
2262 /* Select dpcls for in_port. Relies on in_port being an exact match. */
2263 ovs_assert(match->wc.masks.in_port.odp_port == ODP_PORT_C(UINT32_MAX));
2264 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
2265 dpcls_insert(cls, &flow->cr, &mask);
72865317 2266
4c75aaab
EJ
2267 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2268 dp_netdev_flow_hash(&flow->ufid));
2269
623540e4
EJ
2270 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
2271 struct ds ds = DS_EMPTY_INITIALIZER;
9044f2c1
JG
2272 struct ofpbuf key_buf, mask_buf;
2273 struct odp_flow_key_parms odp_parms = {
2274 .flow = &match->flow,
2275 .mask = &match->wc.masks,
2276 .support = dp_netdev_support,
2277 };
2278
2279 ofpbuf_init(&key_buf, 0);
2280 ofpbuf_init(&mask_buf, 0);
623540e4 2281
9044f2c1
JG
2282 odp_flow_key_from_flow(&odp_parms, &key_buf);
2283 odp_parms.key_buf = &key_buf;
2284 odp_flow_key_from_mask(&odp_parms, &mask_buf);
0de8783a 2285
623540e4 2286 ds_put_cstr(&ds, "flow_add: ");
70e5ed6f
JS
2287 odp_format_ufid(ufid, &ds);
2288 ds_put_cstr(&ds, " ");
9044f2c1
JG
2289 odp_flow_format(key_buf.data, key_buf.size,
2290 mask_buf.data, mask_buf.size,
2291 NULL, &ds, false);
623540e4
EJ
2292 ds_put_cstr(&ds, ", actions:");
2293 format_odp_actions(&ds, actions, actions_len);
2294
2295 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2296
9044f2c1
JG
2297 ofpbuf_uninit(&key_buf);
2298 ofpbuf_uninit(&mask_buf);
623540e4
EJ
2299 ds_destroy(&ds);
2300 }
2301
0de8783a 2302 return flow;
72865317
BP
2303}
2304
72865317 2305static int
89625d1e 2306dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
72865317
BP
2307{
2308 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2309 struct dp_netdev_flow *netdev_flow;
0de8783a 2310 struct netdev_flow_key key;
1c1e46ed 2311 struct dp_netdev_pmd_thread *pmd;
ae2ceebd 2312 struct match match;
70e5ed6f 2313 ovs_u128 ufid;
bd5131ba
DDP
2314 unsigned pmd_id = put->pmd_id == PMD_ID_NULL
2315 ? NON_PMD_CORE_ID : put->pmd_id;
36956a7d
BP
2316 int error;
2317
ae2ceebd 2318 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
8c301900
JR
2319 if (error) {
2320 return error;
2321 }
2322 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2323 put->mask, put->mask_len,
9f861c91 2324 &match.flow, &match.wc);
36956a7d
BP
2325 if (error) {
2326 return error;
2327 }
0de8783a 2328
1c1e46ed
AW
2329 pmd = dp_netdev_get_pmd(dp, pmd_id);
2330 if (!pmd) {
2331 return EINVAL;
2332 }
2333
0de8783a
JR
2334 /* Must produce a netdev_flow_key for lookup.
2335 * This interface is no longer performance critical, since it is not used
2336 * for upcall processing any more. */
2337 netdev_flow_key_from_flow(&key, &match.flow);
72865317 2338
70e5ed6f
JS
2339 if (put->ufid) {
2340 ufid = *put->ufid;
2341 } else {
2342 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2343 }
2344
1c1e46ed 2345 ovs_mutex_lock(&pmd->flow_mutex);
3453b4d6 2346 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key, NULL);
1763b4b8 2347 if (!netdev_flow) {
89625d1e 2348 if (put->flags & DPIF_FP_CREATE) {
1c1e46ed 2349 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
89625d1e
BP
2350 if (put->stats) {
2351 memset(put->stats, 0, sizeof *put->stats);
feebdea2 2352 }
1c1e46ed 2353 dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
70e5ed6f 2354 put->actions_len);
0de8783a 2355 error = 0;
72865317 2356 } else {
5279f8fd 2357 error = EFBIG;
72865317
BP
2358 }
2359 } else {
5279f8fd 2360 error = ENOENT;
72865317
BP
2361 }
2362 } else {
2c0ea78f 2363 if (put->flags & DPIF_FP_MODIFY
ae2ceebd 2364 && flow_equal(&match.flow, &netdev_flow->flow)) {
8a4e3a85
BP
2365 struct dp_netdev_actions *new_actions;
2366 struct dp_netdev_actions *old_actions;
2367
2368 new_actions = dp_netdev_actions_create(put->actions,
2369 put->actions_len);
2370
61e7deb1
BP
2371 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2372 ovsrcu_set(&netdev_flow->actions, new_actions);
679ba04c 2373
a84cb64a
BP
2374 if (put->stats) {
2375 get_dpif_flow_stats(netdev_flow, put->stats);
2376 }
2377 if (put->flags & DPIF_FP_ZERO_STATS) {
97447f55
DDP
2378 /* XXX: The userspace datapath uses thread local statistics
2379 * (for flows), which should be updated only by the owning
2380 * thread. Since we cannot write to the stats memory here,
2381 * we choose not to support this flag. Please note:
2382 * - This feature is currently used only by dpctl commands with
2383 * option --clear.
2384 * - Should the need arise, this operation can be implemented
2385 * by keeping a base value (to be updated here) for each
2386 * counter, and subtracting it before outputting the stats */
2387 error = EOPNOTSUPP;
72865317 2388 }
8a4e3a85 2389
61e7deb1 2390 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2c0ea78f 2391 } else if (put->flags & DPIF_FP_CREATE) {
5279f8fd 2392 error = EEXIST;
2c0ea78f
GS
2393 } else {
2394 /* Overlapping flow. */
2395 error = EINVAL;
72865317
BP
2396 }
2397 }
1c1e46ed
AW
2398 ovs_mutex_unlock(&pmd->flow_mutex);
2399 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2400
2401 return error;
72865317
BP
2402}
2403
72865317 2404static int
b99d3cee 2405dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
72865317
BP
2406{
2407 struct dp_netdev *dp = get_dp_netdev(dpif);
1763b4b8 2408 struct dp_netdev_flow *netdev_flow;
1c1e46ed 2409 struct dp_netdev_pmd_thread *pmd;
bd5131ba
DDP
2410 unsigned pmd_id = del->pmd_id == PMD_ID_NULL
2411 ? NON_PMD_CORE_ID : del->pmd_id;
70e5ed6f 2412 int error = 0;
72865317 2413
1c1e46ed
AW
2414 pmd = dp_netdev_get_pmd(dp, pmd_id);
2415 if (!pmd) {
2416 return EINVAL;
2417 }
2418
2419 ovs_mutex_lock(&pmd->flow_mutex);
2420 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2421 del->key_len);
1763b4b8 2422 if (netdev_flow) {
b99d3cee 2423 if (del->stats) {
1763b4b8 2424 get_dpif_flow_stats(netdev_flow, del->stats);
feebdea2 2425 }
1c1e46ed 2426 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
72865317 2427 } else {
5279f8fd 2428 error = ENOENT;
72865317 2429 }
1c1e46ed
AW
2430 ovs_mutex_unlock(&pmd->flow_mutex);
2431 dp_netdev_pmd_unref(pmd);
5279f8fd
BP
2432
2433 return error;
72865317
BP
2434}
2435
ac64794a
BP
2436struct dpif_netdev_flow_dump {
2437 struct dpif_flow_dump up;
1c1e46ed
AW
2438 struct cmap_position poll_thread_pos;
2439 struct cmap_position flow_pos;
2440 struct dp_netdev_pmd_thread *cur_pmd;
d2ad7ef1
JS
2441 int status;
2442 struct ovs_mutex mutex;
e723fd32
JS
2443};
2444
ac64794a
BP
2445static struct dpif_netdev_flow_dump *
2446dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
72865317 2447{
ac64794a 2448 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
e723fd32
JS
2449}
2450
ac64794a 2451static struct dpif_flow_dump *
64bb477f 2452dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
e723fd32 2453{
ac64794a 2454 struct dpif_netdev_flow_dump *dump;
e723fd32 2455
1c1e46ed 2456 dump = xzalloc(sizeof *dump);
ac64794a 2457 dpif_flow_dump_init(&dump->up, dpif_);
64bb477f 2458 dump->up.terse = terse;
ac64794a
BP
2459 ovs_mutex_init(&dump->mutex);
2460
2461 return &dump->up;
e723fd32
JS
2462}
2463
2464static int
ac64794a 2465dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
e723fd32 2466{
ac64794a 2467 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
e723fd32 2468
ac64794a
BP
2469 ovs_mutex_destroy(&dump->mutex);
2470 free(dump);
704a1e09
BP
2471 return 0;
2472}
2473
ac64794a
BP
2474struct dpif_netdev_flow_dump_thread {
2475 struct dpif_flow_dump_thread up;
2476 struct dpif_netdev_flow_dump *dump;
8bb113da
RW
2477 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2478 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
2479};
2480
2481static struct dpif_netdev_flow_dump_thread *
2482dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2483{
2484 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2485}
2486
2487static struct dpif_flow_dump_thread *
2488dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2489{
2490 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2491 struct dpif_netdev_flow_dump_thread *thread;
2492
2493 thread = xmalloc(sizeof *thread);
2494 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2495 thread->dump = dump;
2496 return &thread->up;
2497}
2498
2499static void
2500dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2501{
2502 struct dpif_netdev_flow_dump_thread *thread
2503 = dpif_netdev_flow_dump_thread_cast(thread_);
2504
2505 free(thread);
2506}
2507
704a1e09 2508static int
ac64794a 2509dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
8bb113da 2510 struct dpif_flow *flows, int max_flows)
ac64794a
BP
2511{
2512 struct dpif_netdev_flow_dump_thread *thread
2513 = dpif_netdev_flow_dump_thread_cast(thread_);
2514 struct dpif_netdev_flow_dump *dump = thread->dump;
8bb113da 2515 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
8bb113da
RW
2516 int n_flows = 0;
2517 int i;
14608a15 2518
ac64794a 2519 ovs_mutex_lock(&dump->mutex);
8bb113da 2520 if (!dump->status) {
1c1e46ed
AW
2521 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2522 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2523 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2524 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2525
2526 /* On the first call to dump_next(), extracts the first pmd thread.
2527 * If there is no pmd thread, returns immediately. */
2528 if (!pmd) {
2529 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2530 if (!pmd) {
2531 ovs_mutex_unlock(&dump->mutex);
2532 return n_flows;
8bb113da 2533
8bb113da 2534 }
d2ad7ef1 2535 }
1c1e46ed
AW
2536
2537 do {
2538 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2539 struct cmap_node *node;
2540
2541 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2542 if (!node) {
2543 break;
2544 }
2545 netdev_flows[n_flows] = CONTAINER_OF(node,
2546 struct dp_netdev_flow,
2547 node);
2548 }
2549 /* When finished dumping the current pmd thread, moves on to
2550 * the next one. */
2551 if (n_flows < flow_limit) {
2552 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2553 dp_netdev_pmd_unref(pmd);
2554 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2555 if (!pmd) {
2556 dump->status = EOF;
2557 break;
2558 }
2559 }
2560 /* Keeps the reference for the next call. */
2561 dump->cur_pmd = pmd;
2562
2563 /* If the current dump is empty, do not exit the loop, since the
2564 * remaining pmds could have flows to be dumped. Just dumps again
2565 * on the new 'pmd'. */
2566 } while (!n_flows);
8a4e3a85 2567 }
ac64794a 2568 ovs_mutex_unlock(&dump->mutex);
ac64794a 2569
8bb113da
RW
2570 for (i = 0; i < n_flows; i++) {
2571 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2572 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2573 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2574 struct dpif_flow *f = &flows[i];
7af12bd7 2575 struct ofpbuf key, mask;
8bb113da 2576
7af12bd7
JS
2577 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2578 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
64bb477f
JS
2579 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2580 dump->up.terse);
8bb113da 2581 }
feebdea2 2582
8bb113da 2583 return n_flows;
72865317
BP
2584}
2585
2586static int
758c456d 2587dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 2588 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
2589{
2590 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 2591 struct dp_netdev_pmd_thread *pmd;
1895cc8d 2592 struct dp_packet_batch pp;
72865317 2593
cf62fa4c
PS
2594 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2595 dp_packet_size(execute->packet) > UINT16_MAX) {
72865317
BP
2596 return EINVAL;
2597 }
2598
65f13b50
AW
2599 /* Tries finding the 'pmd'. If NULL is returned, that means
2600 * the current thread is a non-pmd thread and should use
b19befae 2601 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
65f13b50
AW
2602 pmd = ovsthread_getspecific(dp->per_pmd_key);
2603 if (!pmd) {
b19befae 2604 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
546e57d4
DDP
2605 if (!pmd) {
2606 return EBUSY;
2607 }
65f13b50
AW
2608 }
2609
2610 /* If the current thread is non-pmd thread, acquires
2611 * the 'non_pmd_mutex'. */
2612 if (pmd->core_id == NON_PMD_CORE_ID) {
2613 ovs_mutex_lock(&dp->non_pmd_mutex);
2614 }
1c1e46ed 2615
36d8de17
DDP
2616 /* The action processing expects the RSS hash to be valid, because
2617 * it's always initialized at the beginning of datapath processing.
2618 * In this case, though, 'execute->packet' may not have gone through
2619 * the datapath at all, it may have been generated by the upper layer
2620 * (OpenFlow packet-out, BFD frame, ...). */
2621 if (!dp_packet_rss_valid(execute->packet)) {
2622 dp_packet_set_rss_hash(execute->packet,
2623 flow_hash_5tuple(execute->flow, 0));
2624 }
2625
1895cc8d 2626 packet_batch_init_packet(&pp, execute->packet);
66e4ad8a
DDP
2627 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
2628 execute->actions, execute->actions_len,
2629 time_msec());
36d8de17 2630
65f13b50
AW
2631 if (pmd->core_id == NON_PMD_CORE_ID) {
2632 ovs_mutex_unlock(&dp->non_pmd_mutex);
e9985d6a 2633 dp_netdev_pmd_unref(pmd);
65f13b50 2634 }
8a4e3a85 2635
758c456d 2636 return 0;
72865317
BP
2637}
2638
1a0c894a
BP
2639static void
2640dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2641{
2642 size_t i;
2643
2644 for (i = 0; i < n_ops; i++) {
2645 struct dpif_op *op = ops[i];
2646
2647 switch (op->type) {
2648 case DPIF_OP_FLOW_PUT:
2649 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2650 break;
2651
2652 case DPIF_OP_FLOW_DEL:
2653 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2654 break;
2655
2656 case DPIF_OP_EXECUTE:
2657 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2658 break;
6fe09f8c
JS
2659
2660 case DPIF_OP_FLOW_GET:
2661 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2662 break;
1a0c894a
BP
2663 }
2664 }
2665}
2666
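/* A minimal sketch of batching requests through the dispatcher above;
 * 'operate_example' is a hypothetical helper and the 'execute' request is
 * assumed to have been filled in elsewhere.  Only the per-op wrapping and
 * error collection are shown. */
static int
operate_example(struct dpif *dpif, const struct dpif_execute *execute)
{
    struct dpif_op op = {
        .type = DPIF_OP_EXECUTE,
        .u.execute = *execute,
    };
    struct dpif_op *ops[] = { &op };

    dpif_netdev_operate(dpif, ops, 1);
    return op.error;    /* Per-op status, e.g. 0 or EINVAL. */
}
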
6e3c6fa4
DDP
2667/* Changes the number or the affinity of pmd threads. The changes are actually
2668 * applied in dpif_netdev_run(). */
f2eee189 2669static int
a14b8947 2670dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
f2eee189
AW
2671{
2672 struct dp_netdev *dp = get_dp_netdev(dpif);
2673
a6a426d6
IM
2674 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
2675 free(dp->pmd_cmask);
2676 dp->pmd_cmask = nullable_xstrdup(cmask);
2677 dp_netdev_request_reconfigure(dp);
f2eee189
AW
2678 }
2679
2680 return 0;
2681}
2682
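/* A minimal sketch, assuming the core mask string uses the same hex format
 * as the datapath's "pmd-cpu-mask" option; the value "6" (0b110) and the
 * helper name are made up for illustration.  Nothing happens immediately:
 * the pmd threads are recreated later, from dpif_netdev_run(). */
static void
pmd_cmask_example(struct dpif *dpif)
{
    dpif_netdev_pmd_set(dpif, "6");   /* Request pmd threads on cores 1 and 2. */
}
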
3eb67853
IM
2683/* Parses affinity list and returns result in 'core_ids'. */
2684static int
2685parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
2686{
2687 unsigned i;
2688 char *list, *copy, *key, *value;
2689 int error = 0;
2690
2691 for (i = 0; i < n_rxq; i++) {
2692 core_ids[i] = -1;
2693 }
2694
2695 if (!affinity_list) {
2696 return 0;
2697 }
2698
2699 list = copy = xstrdup(affinity_list);
2700
2701 while (ofputil_parse_key_value(&list, &key, &value)) {
2702 int rxq_id, core_id;
2703
2704 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
2705 || !str_to_int(value, 0, &core_id) || core_id < 0) {
2706 error = EINVAL;
2707 break;
2708 }
2709
2710 if (rxq_id < n_rxq) {
2711 core_ids[rxq_id] = core_id;
2712 }
2713 }
2714
2715 free(copy);
2716 return error;
2717}
2718
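/* A usage sketch for the parser above, assuming the "<rxq-id>:<core-id>"
 * pair syntax accepted by ofputil_parse_key_value() (the same format as a
 * port's "pmd-rxq-affinity" option).  The literal string, the queue count
 * and the helper name are made up for illustration. */
static void
affinity_list_example(void)
{
    unsigned core_ids[4];

    if (!parse_affinity_list("0:3,1:7", core_ids, 4)) {
        /* core_ids[0] == 3 and core_ids[1] == 7; queues 2 and 3 stay at
         * (unsigned) -1, i.e. not pinned to any particular core. */
    }
}
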
2719/* Parses 'affinity_list' and applies configuration if it is valid. */
2720static int
2721dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
2722 const char *affinity_list)
2723{
2724 unsigned *core_ids, i;
2725 int error = 0;
2726
2727 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
2728 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
2729 error = EINVAL;
2730 goto exit;
2731 }
2732
2733 for (i = 0; i < port->n_rxq; i++) {
2734 port->rxqs[i].core_id = core_ids[i];
2735 }
2736
2737exit:
2738 free(core_ids);
2739 return error;
2740}
2741
2742/* Changes the affinity of port's rx queues. The changes are actually applied
2743 * in dpif_netdev_run(). */
2744static int
2745dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
2746 const struct smap *cfg)
2747{
2748 struct dp_netdev *dp = get_dp_netdev(dpif);
2749 struct dp_netdev_port *port;
2750 int error = 0;
2751 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
2752
2753 ovs_mutex_lock(&dp->port_mutex);
2754 error = get_port_by_number(dp, port_no, &port);
2755 if (error || !netdev_is_pmd(port->netdev)
2756 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
2757 goto unlock;
2758 }
2759
2760 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
2761 if (error) {
2762 goto unlock;
2763 }
2764 free(port->rxq_affinity_list);
2765 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
2766
2767 dp_netdev_request_reconfigure(dp);
2768unlock:
2769 ovs_mutex_unlock(&dp->port_mutex);
2770 return error;
2771}
2772
5bf93d67
EJ
2773static int
2774dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2775 uint32_t queue_id, uint32_t *priority)
2776{
2777 *priority = queue_id;
2778 return 0;
2779}
2780
72865317 2781\f
9ff55ae2
DDP
2782/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2783 * a copy of the 'size' bytes of 'actions'. */
a84cb64a
BP
2784struct dp_netdev_actions *
2785dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2786{
2787 struct dp_netdev_actions *netdev_actions;
2788
9ff55ae2
DDP
2789 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2790 memcpy(netdev_actions->actions, actions, size);
a84cb64a
BP
2791 netdev_actions->size = size;
2792
2793 return netdev_actions;
2794}
2795
a84cb64a 2796struct dp_netdev_actions *
61e7deb1 2797dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 2798{
61e7deb1 2799 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
2800}
2801
61e7deb1
BP
2802static void
2803dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 2804{
61e7deb1 2805 free(actions);
a84cb64a
BP
2806}
2807\f
55e3ca97
DDP
2808static inline unsigned long long
2809cycles_counter(void)
2810{
2811#ifdef DPDK_NETDEV
2812 return rte_get_tsc_cycles();
2813#else
2814 return 0;
2815#endif
2816}
2817
2818/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2819extern struct ovs_mutex cycles_counter_fake_mutex;
2820
2821/* Start counting cycles. Must be followed by 'cycles_count_end()' */
2822static inline void
2823cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2824 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2825 OVS_NO_THREAD_SAFETY_ANALYSIS
2826{
2827 pmd->last_cycles = cycles_counter();
2828}
2829
2830/* Stop counting cycles and add them to the counter 'type' */
2831static inline void
2832cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2833 enum pmd_cycles_counter_type type)
2834 OVS_RELEASES(&cycles_counter_fake_mutex)
2835 OVS_NO_THREAD_SAFETY_ANALYSIS
2836{
2837 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2838
2839 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2840}
e4cfed38 2841
5794e276 2842static void
65f13b50 2843dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2844 struct dp_netdev_port *port,
2845 struct netdev_rxq *rxq)
e4cfed38 2846{
1895cc8d
PS
2847 struct dp_packet_batch batch;
2848 int error;
e4cfed38 2849
1895cc8d 2850 dp_packet_batch_init(&batch);
55e3ca97 2851 cycles_count_start(pmd);
1895cc8d 2852 error = netdev_rxq_recv(rxq, &batch);
55e3ca97 2853 cycles_count_end(pmd, PMD_CYCLES_POLLING);
e4cfed38 2854 if (!error) {
3c33f0ff 2855 *recirc_depth_get() = 0;
41ccaa24 2856
55e3ca97 2857 cycles_count_start(pmd);
1895cc8d 2858 dp_netdev_input(pmd, &batch, port->port_no);
55e3ca97 2859 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
e4cfed38 2860 } else if (error != EAGAIN && error != EOPNOTSUPP) {
3c33f0ff 2861 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
e4cfed38
PS
2862
2863 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3c33f0ff 2864 netdev_get_name(port->netdev), ovs_strerror(error));
e4cfed38
PS
2865 }
2866}
2867
dc36593c
DDP
2868static int
2869port_reconfigure(struct dp_netdev_port *port)
2870{
2871 struct netdev *netdev = port->netdev;
dc36593c
DDP
2872 int i, err;
2873
050c60bf 2874 if (!netdev_is_reconf_required(netdev)) {
dc36593c
DDP
2875 return 0;
2876 }
2877
2878 /* Closes the existing 'rxq's. */
2879 for (i = 0; i < port->n_rxq; i++) {
3eb67853
IM
2880 netdev_rxq_close(port->rxqs[i].rxq);
2881 port->rxqs[i].rxq = NULL;
dc36593c
DDP
2882 }
2883 port->n_rxq = 0;
2884
050c60bf
DDP
2885 /* Allows 'netdev' to apply the pending configuration changes. */
2886 err = netdev_reconfigure(netdev);
dc36593c 2887 if (err && (err != EOPNOTSUPP)) {
050c60bf
DDP
2888 VLOG_ERR("Failed to set interface %s new configuration",
2889 netdev_get_name(netdev));
dc36593c
DDP
2890 return err;
2891 }
050c60bf 2892 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3eb67853
IM
2893 port->rxqs = xrealloc(port->rxqs,
2894 sizeof *port->rxqs * netdev_n_rxq(netdev));
324c8374
IM
2895 /* Realloc 'used' counters for tx queues. */
2896 free(port->txq_used);
2897 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
2898
dc36593c 2899 for (i = 0; i < netdev_n_rxq(netdev); i++) {
3eb67853 2900 err = netdev_rxq_open(netdev, &port->rxqs[i].rxq, i);
dc36593c
DDP
2901 if (err) {
2902 return err;
2903 }
2904 port->n_rxq++;
2905 }
2906
3eb67853
IM
2907 /* Parse affinity list to apply configuration for new queues. */
2908 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
2909
dc36593c
DDP
2910 return 0;
2911}
2912
6e3c6fa4
DDP
2913static void
2914reconfigure_pmd_threads(struct dp_netdev *dp)
2915 OVS_REQUIRES(dp->port_mutex)
2916{
dc36593c 2917 struct dp_netdev_port *port, *next;
324c8374 2918 int n_cores;
6e3c6fa4 2919
a6a426d6
IM
2920 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
2921
6e3c6fa4
DDP
2922 dp_netdev_destroy_all_pmds(dp);
2923
324c8374 2924 /* Reconfigures the cpu mask. */
a6a426d6 2925 ovs_numa_set_cpu_mask(dp->pmd_cmask);
324c8374
IM
2926
2927 n_cores = ovs_numa_get_n_cores();
2928 if (n_cores == OVS_CORE_UNSPEC) {
2929 VLOG_ERR("Cannot get cpu core info");
2930 return;
2931 }
2932
dc36593c
DDP
2933 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
2934 int err;
6e3c6fa4 2935
dc36593c
DDP
2936 err = port_reconfigure(port);
2937 if (err) {
2938 hmap_remove(&dp->ports, &port->node);
2939 seq_change(dp->port_seq);
2940 port_destroy(port);
324c8374
IM
2941 } else {
2942 port->dynamic_txqs = netdev_n_txq(port->netdev) < n_cores + 1;
6e3c6fa4
DDP
2943 }
2944 }
6e3c6fa4
DDP
2945 /* Restores the non-pmd thread. */
2946 dp_netdev_set_nonpmd(dp);
2947 /* Restores all pmd threads. */
2948 dp_netdev_reset_pmd_threads(dp);
2949}
2950
050c60bf
DDP
2951/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
2952static bool
2953ports_require_restart(const struct dp_netdev *dp)
2954 OVS_REQUIRES(dp->port_mutex)
2955{
2956 struct dp_netdev_port *port;
2957
2958 HMAP_FOR_EACH (port, node, &dp->ports) {
2959 if (netdev_is_reconf_required(port->netdev)) {
2960 return true;
2961 }
2962 }
2963
2964 return false;
2965}
2966
a36de779
PS
2967/* Returns true if the datapath flows need to be revalidated. */
2968static bool
e4cfed38
PS
2969dpif_netdev_run(struct dpif *dpif)
2970{
2971 struct dp_netdev_port *port;
2972 struct dp_netdev *dp = get_dp_netdev(dpif);
546e57d4 2973 struct dp_netdev_pmd_thread *non_pmd;
a36de779 2974 uint64_t new_tnl_seq;
e4cfed38 2975
e9985d6a 2976 ovs_mutex_lock(&dp->port_mutex);
546e57d4
DDP
2977 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
2978 if (non_pmd) {
2979 ovs_mutex_lock(&dp->non_pmd_mutex);
2980 HMAP_FOR_EACH (port, node, &dp->ports) {
2981 if (!netdev_is_pmd(port->netdev)) {
2982 int i;
55c955bd 2983
546e57d4
DDP
2984 for (i = 0; i < port->n_rxq; i++) {
2985 dp_netdev_process_rxq_port(non_pmd, port,
2986 port->rxqs[i].rxq);
2987 }
55c955bd 2988 }
e4cfed38 2989 }
546e57d4
DDP
2990 dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
2991 ovs_mutex_unlock(&dp->non_pmd_mutex);
6e3c6fa4 2992
546e57d4
DDP
2993 dp_netdev_pmd_unref(non_pmd);
2994 }
1c1e46ed 2995
a6a426d6 2996 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
6e3c6fa4
DDP
2997 reconfigure_pmd_threads(dp);
2998 }
2999 ovs_mutex_unlock(&dp->port_mutex);
3000
53902038 3001 tnl_neigh_cache_run();
7f9b8504 3002 tnl_port_map_run();
a36de779
PS
3003 new_tnl_seq = seq_read(tnl_conf_seq);
3004
3005 if (dp->last_tnl_conf_seq != new_tnl_seq) {
3006 dp->last_tnl_conf_seq = new_tnl_seq;
3007 return true;
3008 }
3009 return false;
e4cfed38
PS
3010}
3011
3012static void
3013dpif_netdev_wait(struct dpif *dpif)
3014{
3015 struct dp_netdev_port *port;
3016 struct dp_netdev *dp = get_dp_netdev(dpif);
3017
59e6d833 3018 ovs_mutex_lock(&dp_netdev_mutex);
e9985d6a
DDP
3019 ovs_mutex_lock(&dp->port_mutex);
3020 HMAP_FOR_EACH (port, node, &dp->ports) {
050c60bf 3021 netdev_wait_reconf_required(port->netdev);
55c955bd
PS
3022 if (!netdev_is_pmd(port->netdev)) {
3023 int i;
3024
490e82af 3025 for (i = 0; i < port->n_rxq; i++) {
3eb67853 3026 netdev_rxq_wait(port->rxqs[i].rxq);
55c955bd 3027 }
e4cfed38
PS
3028 }
3029 }
e9985d6a 3030 ovs_mutex_unlock(&dp->port_mutex);
59e6d833 3031 ovs_mutex_unlock(&dp_netdev_mutex);
a36de779 3032 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
e4cfed38
PS
3033}
3034
d0cca6c3
DDP
3035static void
3036pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
3037{
3038 struct tx_port *tx_port_cached;
3039
324c8374
IM
3040 /* Free all used tx queue ids. */
3041 dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
3042
d0cca6c3
DDP
3043 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->port_cache) {
3044 free(tx_port_cached);
3045 }
3046}
3047
3048/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
3049 * 'pmd->port_cache' (thread local) */
3050static void
3051pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
3052 OVS_REQUIRES(pmd->port_mutex)
3053{
3054 struct tx_port *tx_port, *tx_port_cached;
3055
3056 pmd_free_cached_ports(pmd);
3057 hmap_shrink(&pmd->port_cache);
3058
3059 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
3060 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
3061 hmap_insert(&pmd->port_cache, &tx_port_cached->node,
324c8374 3062 hash_port_no(tx_port_cached->port->port_no));
d0cca6c3
DDP
3063 }
3064}
3065
e4cfed38 3066static int
d0cca6c3
DDP
3067pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
3068 struct rxq_poll **ppoll_list)
e4cfed38 3069{
f7791740 3070 struct rxq_poll *poll_list = *ppoll_list;
ae7ad0a1
IM
3071 struct rxq_poll *poll;
3072 int i;
e4cfed38 3073
d0cca6c3 3074 ovs_mutex_lock(&pmd->port_mutex);
ae7ad0a1 3075 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
a1fdee13 3076
ae7ad0a1
IM
3077 i = 0;
3078 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
ae7ad0a1 3079 poll_list[i++] = *poll;
e4cfed38 3080 }
d0cca6c3
DDP
3081
3082 pmd_load_cached_ports(pmd);
3083
3084 ovs_mutex_unlock(&pmd->port_mutex);
e4cfed38 3085
e4cfed38 3086 *ppoll_list = poll_list;
d42f9307 3087 return i;
e4cfed38
PS
3088}
3089
6c3eee82 3090static void *
e4cfed38 3091pmd_thread_main(void *f_)
6c3eee82 3092{
65f13b50 3093 struct dp_netdev_pmd_thread *pmd = f_;
e4cfed38 3094 unsigned int lc = 0;
f7791740 3095 struct rxq_poll *poll_list;
84067a4c 3096 unsigned int port_seq = PMD_INITIAL_SEQ;
d42f9307 3097 bool exiting;
e4cfed38
PS
3098 int poll_cnt;
3099 int i;
6c3eee82 3100
e4cfed38
PS
3101 poll_list = NULL;
3102
65f13b50
AW
3103 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
3104 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6930c7e0
DDP
3105 ovs_numa_thread_setaffinity_core(pmd->core_id);
3106 dpdk_set_lcore_id(pmd->core_id);
d0cca6c3 3107 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
e4cfed38 3108reload:
65f13b50 3109 emc_cache_init(&pmd->flow_cache);
ae7ad0a1 3110
7dd671f0
MK
3111 /* List port/core affinity */
3112 for (i = 0; i < poll_cnt; i++) {
ce179f11
IM
3113 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
3114 pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
3115 netdev_rxq_get_queue_id(poll_list[i].rx));
7dd671f0
MK
3116 }
3117
e4cfed38 3118 for (;;) {
e4cfed38 3119 for (i = 0; i < poll_cnt; i++) {
65f13b50 3120 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
e4cfed38
PS
3121 }
3122
3123 if (lc++ > 1024) {
84067a4c 3124 unsigned int seq;
6c3eee82 3125
e4cfed38 3126 lc = 0;
84067a4c 3127
fbe0962b 3128 coverage_try_clear();
3453b4d6 3129 dp_netdev_pmd_try_optimize(pmd);
9dede5cf
FL
3130 if (!ovsrcu_try_quiesce()) {
3131 emc_cache_slow_sweep(&pmd->flow_cache);
3132 }
84067a4c 3133
65f13b50 3134 atomic_read_relaxed(&pmd->change_seq, &seq);
84067a4c
JR
3135 if (seq != port_seq) {
3136 port_seq = seq;
6c3eee82
BP
3137 break;
3138 }
3139 }
e4cfed38 3140 }
6c3eee82 3141
d0cca6c3 3142 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
d42f9307
DDP
3143 exiting = latch_is_set(&pmd->exit_latch);
3144 /* Signal here to make sure the pmd finishes
3145 * reloading the updated configuration. */
3146 dp_netdev_pmd_reload_done(pmd);
3147
65f13b50 3148 emc_cache_uninit(&pmd->flow_cache);
9bbf1c3d 3149
d42f9307 3150 if (!exiting) {
e4cfed38
PS
3151 goto reload;
3152 }
6c3eee82 3153
e4cfed38 3154 free(poll_list);
d0cca6c3 3155 pmd_free_cached_ports(pmd);
6c3eee82
BP
3156 return NULL;
3157}
3158
6b31e073
RW
3159static void
3160dp_netdev_disable_upcall(struct dp_netdev *dp)
3161 OVS_ACQUIRES(dp->upcall_rwlock)
3162{
3163 fat_rwlock_wrlock(&dp->upcall_rwlock);
3164}
3165
3166static void
3167dpif_netdev_disable_upcall(struct dpif *dpif)
3168 OVS_NO_THREAD_SAFETY_ANALYSIS
3169{
3170 struct dp_netdev *dp = get_dp_netdev(dpif);
3171 dp_netdev_disable_upcall(dp);
3172}
3173
3174static void
3175dp_netdev_enable_upcall(struct dp_netdev *dp)
3176 OVS_RELEASES(dp->upcall_rwlock)
3177{
3178 fat_rwlock_unlock(&dp->upcall_rwlock);
3179}
3180
3181static void
3182dpif_netdev_enable_upcall(struct dpif *dpif)
3183 OVS_NO_THREAD_SAFETY_ANALYSIS
3184{
3185 struct dp_netdev *dp = get_dp_netdev(dpif);
3186 dp_netdev_enable_upcall(dp);
3187}
3188
ae7ad0a1 3189static void
accf8626
AW
3190dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
3191{
3192 ovs_mutex_lock(&pmd->cond_mutex);
3193 xpthread_cond_signal(&pmd->cond);
3194 ovs_mutex_unlock(&pmd->cond_mutex);
3195}
3196
1c1e46ed 3197/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
546e57d4
DDP
3198 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
3199 * 'core_id' is NON_PMD_CORE_ID).
1c1e46ed
AW
3200 *
3201 * Caller must unref the returned reference. */
65f13b50 3202static struct dp_netdev_pmd_thread *
bd5131ba 3203dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
65f13b50
AW
3204{
3205 struct dp_netdev_pmd_thread *pmd;
55847abe 3206 const struct cmap_node *pnode;
65f13b50 3207
b19befae 3208 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
1c1e46ed
AW
3209 if (!pnode) {
3210 return NULL;
3211 }
65f13b50
AW
3212 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
3213
1c1e46ed 3214 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
65f13b50
AW
3215}
3216
f2eee189
AW
3217/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
3218static void
3219dp_netdev_set_nonpmd(struct dp_netdev *dp)
e9985d6a 3220 OVS_REQUIRES(dp->port_mutex)
f2eee189
AW
3221{
3222 struct dp_netdev_pmd_thread *non_pmd;
d0cca6c3 3223 struct dp_netdev_port *port;
f2eee189
AW
3224
3225 non_pmd = xzalloc(sizeof *non_pmd);
00873463 3226 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
d0cca6c3 3227
e9985d6a 3228 HMAP_FOR_EACH (port, node, &dp->ports) {
d0cca6c3
DDP
3229 dp_netdev_add_port_tx_to_pmd(non_pmd, port);
3230 }
3231
3232 dp_netdev_reload_pmd__(non_pmd);
f2eee189
AW
3233}
3234
1c1e46ed
AW
3235/* Caller must have valid pointer to 'pmd'. */
3236static bool
3237dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
3238{
3239 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
3240}
3241
3242static void
3243dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
3244{
3245 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
3246 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
3247 }
3248}
3249
3250/* Given cmap position 'pos', tries to ref the next node. If try_ref()
3251 * fails, keeps checking for the next node until reaching the end of the cmap.
3252 *
3253 * Caller must unref the returned reference. */
3254static struct dp_netdev_pmd_thread *
3255dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
3256{
3257 struct dp_netdev_pmd_thread *next;
3258
3259 do {
3260 struct cmap_node *node;
3261
3262 node = cmap_next_position(&dp->poll_threads, pos);
3263 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
3264 : NULL;
3265 } while (next && !dp_netdev_pmd_try_ref(next));
3266
3267 return next;
3268}
3269
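/* A short sketch of walking every pmd thread with the helper above;
 * 'pmd_walk_example' is a hypothetical caller.  Each returned pmd carries a
 * reference that the caller must drop with dp_netdev_pmd_unref(). */
static void
pmd_walk_example(struct dp_netdev *dp)
{
    struct cmap_position pos;
    struct dp_netdev_pmd_thread *pmd;

    memset(&pos, 0, sizeof pos);
    while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
        /* ... e.g. look at pmd->core_id or pmd->numa_id ... */
        dp_netdev_pmd_unref(pmd);
    }
}
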
65f13b50 3270/* Configures the 'pmd' based on the input argument. */
6c3eee82 3271static void
65f13b50 3272dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
00873463 3273 unsigned core_id, int numa_id)
65f13b50
AW
3274{
3275 pmd->dp = dp;
65f13b50
AW
3276 pmd->core_id = core_id;
3277 pmd->numa_id = numa_id;
ae7ad0a1 3278 pmd->poll_cnt = 0;
1c1e46ed 3279
324c8374 3280 atomic_init(&pmd->static_tx_qid,
347ba9bb
IM
3281 (core_id == NON_PMD_CORE_ID)
3282 ? ovs_numa_get_n_cores()
3283 : get_n_pmd_threads(dp));
3284
1c1e46ed 3285 ovs_refcount_init(&pmd->ref_cnt);
65f13b50
AW
3286 latch_init(&pmd->exit_latch);
3287 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
accf8626
AW
3288 xpthread_cond_init(&pmd->cond, NULL);
3289 ovs_mutex_init(&pmd->cond_mutex);
1c1e46ed 3290 ovs_mutex_init(&pmd->flow_mutex);
d0cca6c3 3291 ovs_mutex_init(&pmd->port_mutex);
1c1e46ed 3292 cmap_init(&pmd->flow_table);
3453b4d6
JS
3293 cmap_init(&pmd->classifiers);
3294 pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
417e7e66 3295 ovs_list_init(&pmd->poll_list);
d0cca6c3
DDP
3296 hmap_init(&pmd->tx_ports);
3297 hmap_init(&pmd->port_cache);
65f13b50
AW
3298 /* init the 'flow_cache' since there is no
3299 * actual thread created for NON_PMD_CORE_ID. */
3300 if (core_id == NON_PMD_CORE_ID) {
3301 emc_cache_init(&pmd->flow_cache);
3302 }
3303 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
3304 hash_int(core_id, 0));
3305}
3306
1c1e46ed
AW
3307static void
3308dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
3309{
3453b4d6
JS
3310 struct dpcls *cls;
3311
1c1e46ed 3312 dp_netdev_pmd_flow_flush(pmd);
d0cca6c3
DDP
3313 hmap_destroy(&pmd->port_cache);
3314 hmap_destroy(&pmd->tx_ports);
3453b4d6
JS
3315 /* All flows (including their dpcls_rules) have been deleted already */
3316 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
3317 dpcls_destroy(cls);
3318 }
3319 cmap_destroy(&pmd->classifiers);
1c1e46ed
AW
3320 cmap_destroy(&pmd->flow_table);
3321 ovs_mutex_destroy(&pmd->flow_mutex);
3322 latch_destroy(&pmd->exit_latch);
3323 xpthread_cond_destroy(&pmd->cond);
3324 ovs_mutex_destroy(&pmd->cond_mutex);
d0cca6c3 3325 ovs_mutex_destroy(&pmd->port_mutex);
1c1e46ed
AW
3326 free(pmd);
3327}
3328
3329/* Stops the pmd thread, removes it from the 'dp->poll_threads',
3330 * and unrefs the struct. */
65f13b50 3331static void
e4e74c3a 3332dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6c3eee82 3333{
d0cca6c3
DDP
3334 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
3335 * but extra cleanup is necessary */
65f13b50
AW
3336 if (pmd->core_id == NON_PMD_CORE_ID) {
3337 emc_cache_uninit(&pmd->flow_cache);
d0cca6c3 3338 pmd_free_cached_ports(pmd);
65f13b50
AW
3339 } else {
3340 latch_set(&pmd->exit_latch);
3341 dp_netdev_reload_pmd__(pmd);
3342 ovs_numa_unpin_core(pmd->core_id);
3343 xpthread_join(pmd->thread, NULL);
3344 }
ae7ad0a1 3345
d0cca6c3 3346 dp_netdev_pmd_clear_ports(pmd);
ae7ad0a1 3347
e4e74c3a
AW
3348 /* Purges the 'pmd''s flows after stopping the thread, but before
3349 * destroying the flows, so that the flow stats can be collected. */
3350 if (dp->dp_purge_cb) {
3351 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
3352 }
65f13b50 3353 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
1c1e46ed 3354 dp_netdev_pmd_unref(pmd);
65f13b50 3355}
6c3eee82 3356
65f13b50
AW
3357/* Destroys all pmd threads. */
3358static void
3359dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
3360{
3361 struct dp_netdev_pmd_thread *pmd;
d916785c
DDP
3362 struct dp_netdev_pmd_thread **pmd_list;
3363 size_t k = 0, n_pmds;
3364
3365 n_pmds = cmap_count(&dp->poll_threads);
3366 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
65f13b50
AW
3367
3368 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d916785c
DDP
3369 /* We cannot call dp_netdev_del_pmd(), since it alters
3370 * 'dp->poll_threads' (while we're iterating it) and it
3371 * might quiesce. */
3372 ovs_assert(k < n_pmds);
3373 pmd_list[k++] = pmd;
6c3eee82 3374 }
d916785c
DDP
3375
3376 for (size_t i = 0; i < k; i++) {
3377 dp_netdev_del_pmd(dp, pmd_list[i]);
3378 }
3379 free(pmd_list);
65f13b50 3380}
6c3eee82 3381
347ba9bb 3382/* Deletes all pmd threads on numa node 'numa_id' and
324c8374 3383 * fixes static_tx_qids of other threads to keep them sequential. */
65f13b50
AW
3384static void
3385dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3386{
3387 struct dp_netdev_pmd_thread *pmd;
347ba9bb
IM
3388 int n_pmds_on_numa, n_pmds;
3389 int *free_idx, k = 0;
d916785c 3390 struct dp_netdev_pmd_thread **pmd_list;
347ba9bb
IM
3391
3392 n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
d916785c
DDP
3393 free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
3394 pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);
6c3eee82 3395
65f13b50 3396 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d916785c
DDP
3397 /* We cannot call dp_netdev_del_pmd(), since it alters
3398 * 'dp->poll_threads' (while we're iterating it) and it
3399 * might quiesce. */
65f13b50 3400 if (pmd->numa_id == numa_id) {
324c8374 3401 atomic_read_relaxed(&pmd->static_tx_qid, &free_idx[k]);
d916785c
DDP
3402 pmd_list[k] = pmd;
3403 ovs_assert(k < n_pmds_on_numa);
347ba9bb 3404 k++;
65f13b50 3405 }
6c3eee82 3406 }
347ba9bb 3407
d916785c
DDP
3408 for (int i = 0; i < k; i++) {
3409 dp_netdev_del_pmd(dp, pmd_list[i]);
3410 }
3411
347ba9bb
IM
3412 n_pmds = get_n_pmd_threads(dp);
3413 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3414 int old_tx_qid;
3415
324c8374 3416 atomic_read_relaxed(&pmd->static_tx_qid, &old_tx_qid);
347ba9bb
IM
3417
3418 if (old_tx_qid >= n_pmds) {
3419 int new_tx_qid = free_idx[--k];
3420
324c8374 3421 atomic_store_relaxed(&pmd->static_tx_qid, new_tx_qid);
347ba9bb
IM
3422 }
3423 }
3424
d916785c 3425 free(pmd_list);
347ba9bb 3426 free(free_idx);
65f13b50 3427}
6c3eee82 3428
d0cca6c3
DDP
3429/* Deletes all rx queues from pmd->poll_list and all the ports from
3430 * pmd->tx_ports. */
cc245ce8 3431static void
d0cca6c3 3432dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
3433{
3434 struct rxq_poll *poll;
d0cca6c3 3435 struct tx_port *port;
cc245ce8 3436
d0cca6c3 3437 ovs_mutex_lock(&pmd->port_mutex);
cc245ce8 3438 LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
cc245ce8
IM
3439 free(poll);
3440 }
3441 pmd->poll_cnt = 0;
d0cca6c3
DDP
3442 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
3443 free(port);
3444 }
3445 ovs_mutex_unlock(&pmd->port_mutex);
cc245ce8
IM
3446}
3447
d0cca6c3
DDP
3448static struct tx_port *
3449tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3450{
3451 struct tx_port *tx;
3452
3453 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
324c8374 3454 if (tx->port->port_no == port_no) {
d0cca6c3
DDP
3455 return tx;
3456 }
3457 }
3458
3459 return NULL;
3460}
3461
3462/* Deletes all rx queues of 'port' from 'poll_list', and the 'port' from
3463 * 'tx_ports' of 'pmd' thread. Returns true if 'port' was found in 'pmd'
3464 * (therefore a restart is required). */
b68872d8
DDP
3465static bool
3466dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
3467 struct dp_netdev_pmd_thread *pmd)
cc245ce8
IM
3468{
3469 struct rxq_poll *poll, *next;
d0cca6c3 3470 struct tx_port *tx;
cc245ce8
IM
3471 bool found = false;
3472
d0cca6c3 3473 ovs_mutex_lock(&pmd->port_mutex);
cc245ce8
IM
3474 LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
3475 if (poll->port == port) {
3476 found = true;
417e7e66 3477 ovs_list_remove(&poll->node);
cc245ce8
IM
3478 pmd->poll_cnt--;
3479 free(poll);
3480 }
3481 }
d0cca6c3
DDP
3482
3483 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
3484 if (tx) {
3485 hmap_remove(&pmd->tx_ports, &tx->node);
3486 free(tx);
3487 found = true;
3488 }
3489 ovs_mutex_unlock(&pmd->port_mutex);
b68872d8
DDP
3490
3491 return found;
3492}
3493
d0cca6c3
DDP
3494/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3495 * threads. The pmd threads that need to be restarted are inserted in
3496 * 'to_reload'. */
b68872d8
DDP
3497static void
3498dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
3499 struct dp_netdev_port *port,
3500 struct hmapx *to_reload)
3501{
b68872d8
DDP
3502 struct dp_netdev_pmd_thread *pmd;
3503
3504 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
d0cca6c3 3505 bool found;
b68872d8 3506
d0cca6c3 3507 found = dp_netdev_del_port_from_pmd__(port, pmd);
b68872d8 3508
d0cca6c3
DDP
3509 if (found) {
3510 hmapx_add(to_reload, pmd);
3511 }
cc245ce8
IM
3512 }
3513}
3514
d0cca6c3
DDP
3515/* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3516 * threads. Reloads the threads if needed. */
cc245ce8
IM
3517static void
3518dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
3519 struct dp_netdev_port *port)
3520{
cc245ce8 3521 struct dp_netdev_pmd_thread *pmd;
b68872d8
DDP
3522 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3523 struct hmapx_node *node;
cc245ce8 3524
b68872d8
DDP
3525 dp_netdev_del_port_from_all_pmds__(dp, port, &to_reload);
3526
3527 HMAPX_FOR_EACH (node, &to_reload) {
3528 pmd = (struct dp_netdev_pmd_thread *) node->data;
3529 dp_netdev_reload_pmd__(pmd);
cc245ce8 3530 }
b68872d8
DDP
3531
3532 hmapx_destroy(&to_reload);
cc245ce8
IM
3533}
3534
b68872d8 3535
3eb67853
IM
3536/* Returns the non-isolated PMD thread from this numa node with the fewest
3537 * rx queues to poll.  Returns NULL if there are no non-isolated PMD threads
3538 * on this numa node.  Can be called safely only by the main thread. */
ae7ad0a1
IM
3539static struct dp_netdev_pmd_thread *
3540dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
3541{
3542 int min_cnt = -1;
3543 struct dp_netdev_pmd_thread *pmd, *res = NULL;
3544
3545 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3eb67853 3546 if (!pmd->isolated && pmd->numa_id == numa_id
ae7ad0a1
IM
3547 && (min_cnt > pmd->poll_cnt || res == NULL)) {
3548 min_cnt = pmd->poll_cnt;
3549 res = pmd;
3550 }
3551 }
3552
3553 return res;
3554}
3555
3556/* Adds rx queue to poll_list of PMD thread. */
3557static void
3558dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
3559 struct dp_netdev_port *port, struct netdev_rxq *rx)
d0cca6c3 3560 OVS_REQUIRES(pmd->port_mutex)
ae7ad0a1
IM
3561{
3562 struct rxq_poll *poll = xmalloc(sizeof *poll);
3563
ae7ad0a1
IM
3564 poll->port = port;
3565 poll->rx = rx;
3566
417e7e66 3567 ovs_list_push_back(&pmd->poll_list, &poll->node);
ae7ad0a1
IM
3568 pmd->poll_cnt++;
3569}
3570
d0cca6c3
DDP
3571/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
3572 * changes to take effect. */
cc245ce8 3573static void
d0cca6c3
DDP
3574dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
3575 struct dp_netdev_port *port)
3576{
3577 struct tx_port *tx = xzalloc(sizeof *tx);
3578
324c8374
IM
3579 tx->port = port;
3580 tx->qid = -1;
d0cca6c3
DDP
3581
3582 ovs_mutex_lock(&pmd->port_mutex);
324c8374 3583 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
d0cca6c3
DDP
3584 ovs_mutex_unlock(&pmd->port_mutex);
3585}
3586
3eb67853
IM
3587/* Distribute all {pinned|non-pinned} rx queues of 'port' between PMD
3588 * threads in 'dp'. The pmd threads that need to be restarted are inserted
3589 * in 'to_reload'. PMD threads that receive pinned queues are marked as isolated. */
d0cca6c3
DDP
3590static void
3591dp_netdev_add_port_rx_to_pmds(struct dp_netdev *dp,
3592 struct dp_netdev_port *port,
3eb67853 3593 struct hmapx *to_reload, bool pinned)
cc245ce8
IM
3594{
3595 int numa_id = netdev_get_numa_id(port->netdev);
3eb67853 3596 struct dp_netdev_pmd_thread *pmd;
cc245ce8
IM
3597 int i;
3598
d0cca6c3
DDP
3599 if (!netdev_is_pmd(port->netdev)) {
3600 return;
3601 }
cc245ce8 3602
490e82af 3603 for (i = 0; i < port->n_rxq; i++) {
3eb67853
IM
3604 if (pinned) {
3605 if (port->rxqs[i].core_id == -1) {
3606 continue;
3607 }
3608 pmd = dp_netdev_get_pmd(dp, port->rxqs[i].core_id);
3609 if (!pmd) {
3610 VLOG_WARN("There is no PMD thread on core %d. "
3611 "Queue %d on port \'%s\' will not be polled.",
3612 port->rxqs[i].core_id, i,
3613 netdev_get_name(port->netdev));
3614 continue;
3615 }
3616 pmd->isolated = true;
3617 dp_netdev_pmd_unref(pmd);
3618 } else {
3619 if (port->rxqs[i].core_id != -1) {
3620 continue;
3621 }
3622 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
3623 if (!pmd) {
3624 VLOG_WARN("There's no available pmd thread on numa node %d",
3625 numa_id);
3626 break;
3627 }
cc245ce8
IM
3628 }
3629
d0cca6c3 3630 ovs_mutex_lock(&pmd->port_mutex);
3eb67853 3631 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxqs[i].rxq);
d0cca6c3 3632 ovs_mutex_unlock(&pmd->port_mutex);
cc245ce8 3633
b68872d8 3634 hmapx_add(to_reload, pmd);
cc245ce8 3635 }
b68872d8
DDP
3636}
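
/* Example of the distribution above (illustrative, not from the original
 * source): for a port with rxqs[0].core_id == 3 and rxqs[1].core_id == -1,
 * a call with 'pinned' == true handles only queue 0, attaching it to the pmd
 * thread on core 3 and marking that thread isolated; a call with
 * 'pinned' == false handles only queue 1, giving it to the least loaded
 * non-isolated pmd thread on the port's numa node. */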
3637
3eb67853
IM
3638/* Distributes all non-pinned rx queues of 'port' between all PMD threads
3639 * in 'dp' and inserts 'port' in the PMD threads 'tx_ports'. The pmd threads
3640 * that need to be restarted are inserted in 'to_reload'. */
d0cca6c3
DDP
3641static void
3642dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
3643 struct hmapx *to_reload)
3644{
3645 struct dp_netdev_pmd_thread *pmd;
3646
3eb67853 3647 dp_netdev_add_port_rx_to_pmds(dp, port, to_reload, false);
d0cca6c3
DDP
3648
3649 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3650 dp_netdev_add_port_tx_to_pmd(pmd, port);
3651 hmapx_add(to_reload, pmd);
3652 }
3653}
3654
3eb67853
IM
3655/* Distributes all non-pinned rx queues of 'port' between all PMD threads
3656 * in 'dp', inserts 'port' in the PMD threads 'tx_ports' and reloads them,
3657 * if needed. */
b68872d8
DDP
3658static void
3659dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
3660{
3661 struct dp_netdev_pmd_thread *pmd;
3662 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3663 struct hmapx_node *node;
3664
3665 dp_netdev_add_port_to_pmds__(dp, port, &to_reload);
cc245ce8
IM
3666
3667 HMAPX_FOR_EACH (node, &to_reload) {
3668 pmd = (struct dp_netdev_pmd_thread *) node->data;
3669 dp_netdev_reload_pmd__(pmd);
3670 }
3671
3672 hmapx_destroy(&to_reload);
3673}
3674
d0cca6c3
DDP
3675/* Starts pmd threads for the numa node 'numa_id', if not already started.
3676 * The function takes care of filling the threads' tx port cache. */
65f13b50
AW
3677static void
3678dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
e9985d6a 3679 OVS_REQUIRES(dp->port_mutex)
65f13b50
AW
3680{
3681 int n_pmds;
e4cfed38 3682
65f13b50 3683 if (!ovs_numa_numa_id_is_valid(numa_id)) {
d0cca6c3
DDP
3684 VLOG_WARN("Cannot create pmd threads due to numa id (%d) invalid",
3685 numa_id);
3686 return;
65f13b50
AW
3687 }
3688
3689 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
3690
3691 /* If there are already pmd threads created for the numa node
3692 * in which 'netdev' is on, do nothing. Else, creates the
3693 * pmd threads for the numa node. */
3694 if (!n_pmds) {
d0cca6c3 3695 int can_have, n_unpinned, i;
65f13b50
AW
3696
3697 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
3698 if (!n_unpinned) {
d0cca6c3
DDP
3699 VLOG_WARN("Cannot create pmd threads due to out of unpinned "
3700 "cores on numa node %d", numa_id);
65f13b50
AW
3701 return;
3702 }
6c3eee82 3703
f2eee189
AW
3704 /* If cpu mask is specified, uses all unpinned cores, otherwise
3705 * tries creating NR_PMD_THREADS pmd threads. */
3706 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
65f13b50 3707 for (i = 0; i < can_have; i++) {
bd5131ba 3708 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
d0cca6c3
DDP
3709 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
3710 struct dp_netdev_port *port;
ae7ad0a1 3711
d0cca6c3
DDP
3712 dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
3713
e9985d6a 3714 HMAP_FOR_EACH (port, node, &dp->ports) {
d0cca6c3 3715 dp_netdev_add_port_tx_to_pmd(pmd, port);
ae7ad0a1 3716 }
ae7ad0a1 3717
d0cca6c3 3718 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
65f13b50
AW
3719 }
3720 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
6c3eee82
BP
3721 }
3722}
e4cfed38 3723
6c3eee82 3724\f
f2eee189
AW
3725/* Called after the pmd thread configuration changes.  Restarts the pmd
3726 * threads with the new configuration. */
3727static void
3728dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
e9985d6a 3729 OVS_REQUIRES(dp->port_mutex)
f2eee189 3730{
d0cca6c3
DDP
3731 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3732 struct dp_netdev_pmd_thread *pmd;
f2eee189 3733 struct dp_netdev_port *port;
d0cca6c3 3734 struct hmapx_node *node;
f2eee189 3735
e9985d6a 3736 HMAP_FOR_EACH (port, node, &dp->ports) {
f2eee189
AW
3737 if (netdev_is_pmd(port->netdev)) {
3738 int numa_id = netdev_get_numa_id(port->netdev);
3739
3740 dp_netdev_set_pmds_on_numa(dp, numa_id);
3741 }
3eb67853
IM
3742 /* Distribute the pinned rx queues first, to mark their threads as isolated. */
3743 dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, true);
3744 }
3745
3746 /* Distribute remaining non-pinned rx queues to non-isolated PMD threads. */
3747 HMAP_FOR_EACH (port, node, &dp->ports) {
3748 dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload, false);
d0cca6c3
DDP
3749 }
3750
3751 HMAPX_FOR_EACH (node, &to_reload) {
3752 pmd = (struct dp_netdev_pmd_thread *) node->data;
3753 dp_netdev_reload_pmd__(pmd);
f2eee189 3754 }
d0cca6c3
DDP
3755
3756 hmapx_destroy(&to_reload);
f2eee189
AW
3757}
3758
b5cbbcf6
AZ
3759static char *
3760dpif_netdev_get_datapath_version(void)
3761{
3762 return xstrdup("<built-in>");
3763}
3764
72865317 3765static void
1c1e46ed 3766dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
11bfdadd 3767 uint16_t tcp_flags, long long now)
72865317 3768{
eb94da30 3769 uint16_t flags;
72865317 3770
eb94da30
DDP
3771 atomic_store_relaxed(&netdev_flow->stats.used, now);
3772 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
3773 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
3774 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3775 flags |= tcp_flags;
3776 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
51852a57
BP
3777}
3778
3779static void
1c1e46ed
AW
3780dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
3781 enum dp_stat_type type, int cnt)
51852a57 3782{
eb94da30 3783 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
51852a57
BP
3784}
3785
623540e4 3786static int
e14deea0 3787dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7af12bd7 3788 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
623540e4
EJ
3789 enum dpif_upcall_type type, const struct nlattr *userdata,
3790 struct ofpbuf *actions, struct ofpbuf *put_actions)
3791{
1c1e46ed 3792 struct dp_netdev *dp = pmd->dp;
623540e4 3793
623540e4
EJ
3794 if (OVS_UNLIKELY(!dp->upcall_cb)) {
3795 return ENODEV;
3796 }
3797
3798 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
3799 struct ds ds = DS_EMPTY_INITIALIZER;
623540e4 3800 char *packet_str;
cf62fa4c 3801 struct ofpbuf key;
5262eea1
JG
3802 struct odp_flow_key_parms odp_parms = {
3803 .flow = flow,
3804 .mask = &wc->masks,
2494ccd7 3805 .support = dp_netdev_support,
5262eea1 3806 };
623540e4
EJ
3807
3808 ofpbuf_init(&key, 0);
5262eea1 3809 odp_flow_key_from_flow(&odp_parms, &key);
cf62fa4c
PS
3810 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
3811 dp_packet_size(packet_));
623540e4 3812
6fd6ed71 3813 odp_flow_key_format(key.data, key.size, &ds);
623540e4
EJ
3814
3815 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
3816 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
3817
3818 ofpbuf_uninit(&key);
3819 free(packet_str);
6fd6ed71 3820
623540e4
EJ
3821 ds_destroy(&ds);
3822 }
3823
8d8ab6c2
JG
3824 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
3825 actions, wc, put_actions, dp->upcall_aux);
623540e4
EJ
3826}
3827
9bbf1c3d 3828static inline uint32_t
048963aa
DDP
3829dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
3830 const struct miniflow *mf)
9bbf1c3d 3831{
048963aa 3832 uint32_t hash, recirc_depth;
9bbf1c3d 3833
f2f44f5d
DDP
3834 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
3835 hash = dp_packet_get_rss_hash(packet);
3836 } else {
9bbf1c3d 3837 hash = miniflow_hash_5tuple(mf, 0);
2bc1bbd2 3838 dp_packet_set_rss_hash(packet, hash);
9bbf1c3d 3839 }
048963aa
DDP
3840
3841 /* The RSS hash must account for the recirculation depth to avoid
3842 * collisions in the exact match cache. */
3843 recirc_depth = *recirc_depth_get_unsafe();
3844 if (OVS_UNLIKELY(recirc_depth)) {
3845 hash = hash_finish(hash, recirc_depth);
3846 dp_packet_set_rss_hash(packet, hash);
3847 }
9bbf1c3d
DDP
3848 return hash;
3849}
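
/* Example (illustrative, not from the original source): a packet that hashes
 * to H at recirculation depth 0 is re-hashed to hash_finish(H, 1) after one
 * recirculation, so the two passes use different EMC entries instead of
 * colliding on the same 5-tuple. */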
3850
f7ce4811 3851struct packet_batch_per_flow {
8cbf4f47
DDP
3852 unsigned int byte_count;
3853 uint16_t tcp_flags;
8cbf4f47
DDP
3854 struct dp_netdev_flow *flow;
3855
1895cc8d 3856 struct dp_packet_batch array;
8cbf4f47
DDP
3857};
3858
3859static inline void
f7ce4811
PS
3860packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
3861 struct dp_packet *packet,
3862 const struct miniflow *mf)
8cbf4f47 3863{
cf62fa4c 3864 batch->byte_count += dp_packet_size(packet);
1895cc8d
PS
3865 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
3866 batch->array.packets[batch->array.count++] = packet;
8cbf4f47
DDP
3867}
3868
3869static inline void
f7ce4811
PS
3870packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
3871 struct dp_netdev_flow *flow)
8cbf4f47 3872{
11e5cf1f 3873 flow->batch = batch;
8cbf4f47 3874
11e5cf1f 3875 batch->flow = flow;
1895cc8d 3876 dp_packet_batch_init(&batch->array);
8cbf4f47
DDP
3877 batch->byte_count = 0;
3878 batch->tcp_flags = 0;
8cbf4f47
DDP
3879}
3880
3881static inline void
f7ce4811
PS
3882packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
3883 struct dp_netdev_pmd_thread *pmd,
3884 long long now)
8cbf4f47
DDP
3885{
3886 struct dp_netdev_actions *actions;
3887 struct dp_netdev_flow *flow = batch->flow;
3888
1895cc8d 3889 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
11bfdadd 3890 batch->tcp_flags, now);
8cbf4f47
DDP
3891
3892 actions = dp_netdev_flow_get_actions(flow);
3893
66e4ad8a 3894 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
324c8374 3895 actions->actions, actions->size, now);
8cbf4f47
DDP
3896}
3897
8aaa125d 3898static inline void
e14deea0 3899dp_netdev_queue_batches(struct dp_packet *pkt,
9bbf1c3d 3900 struct dp_netdev_flow *flow, const struct miniflow *mf,
f7ce4811 3901 struct packet_batch_per_flow *batches, size_t *n_batches)
9bbf1c3d 3902{
f7ce4811 3903 struct packet_batch_per_flow *batch = flow->batch;
11e5cf1f 3904
f9fe365b
AZ
3905 if (OVS_UNLIKELY(!batch)) {
3906 batch = &batches[(*n_batches)++];
f7ce4811 3907 packet_batch_per_flow_init(batch, flow);
9bbf1c3d
DDP
3908 }
3909
f7ce4811 3910 packet_batch_per_flow_update(batch, pkt, mf);
9bbf1c3d
DDP
3911}
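
/* Net effect of the batching above: all packets of one input burst that map
 * to the same dp_netdev_flow share a single packet_batch_per_flow, so the
 * flow's actions are executed once per flow per burst (see
 * packet_batch_per_flow_execute()) rather than once per packet. */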
3912
9bbf1c3d 3913/* Tries to process all of the 'cnt' packets using only the exact match cache
a90ed026 3914 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
8aaa125d
DDP
3915 * miniflow is copied into 'keys' and the packet pointer is moved to the
3916 * beginning of the 'packets' array.
9bbf1c3d
DDP
3917 *
3918 * The function returns the number of packets that need to be processed in the
3919 * 'packets' array (they have been moved to the beginning of the vector).
a90ed026
DDP
3920 *
3921 * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must be
3922 * initialized by this function using 'port_no'.
9bbf1c3d
DDP
3923 */
3924static inline size_t
1895cc8d
PS
3925emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
3926 struct netdev_flow_key *keys,
f7ce4811 3927 struct packet_batch_per_flow batches[], size_t *n_batches,
a90ed026 3928 bool md_is_valid, odp_port_t port_no)
72865317 3929{
65f13b50 3930 struct emc_cache *flow_cache = &pmd->flow_cache;
b89c678b 3931 struct netdev_flow_key *key = &keys[0];
3d88a620 3932 size_t i, n_missed = 0, n_dropped = 0;
1895cc8d
PS
3933 struct dp_packet **packets = packets_->packets;
3934 int cnt = packets_->count;
8cbf4f47 3935
84d6d5eb 3936 for (i = 0; i < cnt; i++) {
9bbf1c3d 3937 struct dp_netdev_flow *flow;
5a2fed48 3938 struct dp_packet *packet = packets[i];
9bbf1c3d 3939
5a2fed48
AZ
3940 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
3941 dp_packet_delete(packet);
3d88a620 3942 n_dropped++;
84d6d5eb
EJ
3943 continue;
3944 }
8cbf4f47 3945
72a5e2b8 3946 if (i != cnt - 1) {
a90ed026 3947 /* Prefetch next packet data and metadata. */
72a5e2b8 3948 OVS_PREFETCH(dp_packet_data(packets[i+1]));
a90ed026 3949 pkt_metadata_prefetch_init(&packets[i+1]->md);
72a5e2b8
DDP
3950 }
3951
a90ed026
DDP
3952 if (!md_is_valid) {
3953 pkt_metadata_init(&packet->md, port_no);
3954 }
5a2fed48 3955 miniflow_extract(packet, &key->mf);
d262ac2c 3956 key->len = 0; /* Not computed yet. */
5a2fed48 3957 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
9bbf1c3d 3958
d262ac2c 3959 flow = emc_lookup(flow_cache, key);
8aaa125d 3960 if (OVS_LIKELY(flow)) {
5a2fed48 3961 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
8aaa125d
DDP
3962 n_batches);
3963 } else {
d1aa0b94
AZ
3964 /* Exact match cache missed. Group missed packets together at
3965 * the beginning of the 'packets' array. */
b89c678b 3966 packets[n_missed] = packet;
400486f7
DDP
3967 /* 'keys[n_missed]' contains the key of the current packet and it
3968 * must be returned to the caller. The next key should be extracted
3969 * to 'keys[n_missed + 1]'. */
3970 key = &keys[++n_missed];
9bbf1c3d
DDP
3971 }
3972 }
3973
3d88a620 3974 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);
4f150744 3975
3d88a620 3976 return n_missed;
9bbf1c3d
DDP
3977}
3978
a260d966
PS
3979static inline void
3980handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
3981 const struct netdev_flow_key *key,
3982 struct ofpbuf *actions, struct ofpbuf *put_actions,
324c8374 3983 int *lost_cnt, long long now)
a260d966
PS
3984{
3985 struct ofpbuf *add_actions;
3986 struct dp_packet_batch b;
3987 struct match match;
3988 ovs_u128 ufid;
3989 int error;
3990
3991 match.tun_md.valid = false;
3992 miniflow_expand(&key->mf, &match.flow);
3993
3994 ofpbuf_clear(actions);
3995 ofpbuf_clear(put_actions);
3996
3997 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
3998 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
3999 &ufid, DPIF_UC_MISS, NULL, actions,
4000 put_actions);
4001 if (OVS_UNLIKELY(error && error != ENOSPC)) {
4002 dp_packet_delete(packet);
4003 (*lost_cnt)++;
4004 return;
4005 }
4006
4007 /* The Netlink encoding of datapath flow keys cannot express
4008 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
4009 * tag is interpreted as exact match on the fact that there is no
4010 * VLAN. Unless we refactor a lot of code that translates between
4011 * Netlink and struct flow representations, we have to do the same
4012 * here. */
4013 if (!match.wc.masks.vlan_tci) {
4014 match.wc.masks.vlan_tci = htons(0xffff);
4015 }
4016
4017 /* We can't allow the packet batching in the next loop to execute
4018 * the actions. Otherwise, if there are any slow path actions,
4019 * we'll send the packet up twice. */
4020 packet_batch_init_packet(&b, packet);
66e4ad8a 4021 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
324c8374 4022 actions->data, actions->size, now);
a260d966
PS
4023
4024 add_actions = put_actions->size ? put_actions : actions;
4025 if (OVS_LIKELY(error != ENOSPC)) {
4026 struct dp_netdev_flow *netdev_flow;
4027
4028 /* XXX: There's a race window where a flow covering this packet
4029 * could have already been installed since we last did the flow
4030 * lookup before upcall. This could be solved by moving the
4031 * mutex lock outside the loop, but that's an awful long time
4032 * to be locking everyone out of making flow installs. If we
4033 * move to a per-core classifier, it would be reasonable. */
4034 ovs_mutex_lock(&pmd->flow_mutex);
3453b4d6 4035 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
a260d966
PS
4036 if (OVS_LIKELY(!netdev_flow)) {
4037 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
4038 add_actions->data,
4039 add_actions->size);
4040 }
4041 ovs_mutex_unlock(&pmd->flow_mutex);
4042
4043 emc_insert(&pmd->flow_cache, key, netdev_flow);
4044 }
4045}
4046
9bbf1c3d 4047static inline void
65f13b50 4048fast_path_processing(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4049 struct dp_packet_batch *packets_,
8aaa125d 4050 struct netdev_flow_key *keys,
324c8374 4051 struct packet_batch_per_flow batches[], size_t *n_batches,
3453b4d6 4052 odp_port_t in_port,
324c8374 4053 long long now)
9bbf1c3d 4054{
1895cc8d 4055 int cnt = packets_->count;
1a0d5831 4056#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
4057 const size_t PKT_ARRAY_SIZE = cnt;
4058#else
1a0d5831 4059 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 4060 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d 4061#endif
1895cc8d 4062 struct dp_packet **packets = packets_->packets;
3453b4d6 4063 struct dpcls *cls;
0de8783a 4064 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
65f13b50
AW
4065 struct dp_netdev *dp = pmd->dp;
4066 struct emc_cache *flow_cache = &pmd->flow_cache;
8aaa125d 4067 int miss_cnt = 0, lost_cnt = 0;
3453b4d6 4068 int lookup_cnt = 0, add_lookup_cnt;
9bbf1c3d 4069 bool any_miss;
8aaa125d 4070 size_t i;
9bbf1c3d
DDP
4071
4072 for (i = 0; i < cnt; i++) {
0de8783a 4073 /* Key length is needed in all the cases, hash computed on demand. */
361d808d 4074 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
9bbf1c3d 4075 }
3453b4d6
JS
4076 /* Get the classifier for the in_port */
4077 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
4078 if (OVS_LIKELY(cls)) {
4079 any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
4080 } else {
4081 any_miss = true;
4082 memset(rules, 0, sizeof(rules));
4083 }
623540e4
EJ
4084 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
4085 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
4086 struct ofpbuf actions, put_actions;
623540e4
EJ
4087
4088 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
4089 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
4090
4091 for (i = 0; i < cnt; i++) {
0de8783a 4092 struct dp_netdev_flow *netdev_flow;
623540e4 4093
0de8783a 4094 if (OVS_LIKELY(rules[i])) {
623540e4
EJ
4095 continue;
4096 }
4097
4098 /* It's possible that an earlier slow path execution installed
0de8783a 4099 * a rule covering this flow. In this case, it's a lot cheaper
623540e4 4100 * to catch it here than execute a miss. */
3453b4d6
JS
4101 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
4102 &add_lookup_cnt);
623540e4 4103 if (netdev_flow) {
3453b4d6 4104 lookup_cnt += add_lookup_cnt;
0de8783a 4105 rules[i] = &netdev_flow->cr;
623540e4
EJ
4106 continue;
4107 }
4108
60fc3b7b 4109 miss_cnt++;
324c8374
IM
4110 handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
4111 &put_actions, &lost_cnt, now);
623540e4
EJ
4112 }
4113
4114 ofpbuf_uninit(&actions);
4115 ofpbuf_uninit(&put_actions);
4116 fat_rwlock_unlock(&dp->upcall_rwlock);
60fc3b7b 4117 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
ac8c2081 4118 } else if (OVS_UNLIKELY(any_miss)) {
ac8c2081 4119 for (i = 0; i < cnt; i++) {
0de8783a 4120 if (OVS_UNLIKELY(!rules[i])) {
e14deea0 4121 dp_packet_delete(packets[i]);
8aaa125d
DDP
4122 lost_cnt++;
4123 miss_cnt++;
ac8c2081
DDP
4124 }
4125 }
623540e4 4126 }
84d6d5eb 4127
8cbf4f47 4128 for (i = 0; i < cnt; i++) {
e14deea0 4129 struct dp_packet *packet = packets[i];
84d6d5eb 4130 struct dp_netdev_flow *flow;
8cbf4f47 4131
0de8783a 4132 if (OVS_UNLIKELY(!rules[i])) {
84d6d5eb
EJ
4133 continue;
4134 }
4135
84d6d5eb 4136 flow = dp_netdev_flow_cast(rules[i]);
0de8783a 4137
0de8783a 4138 emc_insert(flow_cache, &keys[i], flow);
8aaa125d 4139 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
8cbf4f47
DDP
4140 }
4141
8aaa125d 4142 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
3453b4d6 4143 dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
8aaa125d
DDP
4144 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
4145 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
72865317
BP
4146}
4147
a90ed026
DDP
4148/* Packets enter the datapath from a port (or from recirculation) here.
4149 *
4150 * For performance reasons a caller may choose not to initialize the metadata
4151 * in 'packets': in this case 'md_is_valid' is false and this function needs to
4152 * initialize it using 'port_no'. If the metadata in 'packets' is already
4153 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
adcf00ba 4154static void
a90ed026 4155dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4156 struct dp_packet_batch *packets,
a90ed026 4157 bool md_is_valid, odp_port_t port_no)
9bbf1c3d 4158{
1895cc8d 4159 int cnt = packets->count;
1a0d5831 4160#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
4161 const size_t PKT_ARRAY_SIZE = cnt;
4162#else
1a0d5831 4163 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 4164 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
9bbf1c3d
DDP
4165#endif
4166 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
f7ce4811 4167 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
11bfdadd 4168 long long now = time_msec();
8aaa125d 4169 size_t newcnt, n_batches, i;
3453b4d6 4170 odp_port_t in_port;
9bbf1c3d 4171
8aaa125d 4172 n_batches = 0;
1895cc8d 4173 newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
a90ed026 4174 md_is_valid, port_no);
9bbf1c3d 4175 if (OVS_UNLIKELY(newcnt)) {
1895cc8d 4176 packets->count = newcnt;
3453b4d6
JS
4177 /* Get ingress port from first packet's metadata. */
4178 in_port = packets->packets[0]->md.in_port.odp_port;
4179 fast_path_processing(pmd, packets, keys, batches, &n_batches, in_port, now);
8aaa125d
DDP
4180 }
4181
603f2ce0
EJ
4182 for (i = 0; i < n_batches; i++) {
4183 batches[i].flow->batch = NULL;
4184 }
4185
8aaa125d 4186 for (i = 0; i < n_batches; i++) {
f7ce4811 4187 packet_batch_per_flow_execute(&batches[i], pmd, now);
9bbf1c3d
DDP
4188 }
4189}
4190
a90ed026
DDP
4191static void
4192dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4193 struct dp_packet_batch *packets,
a90ed026
DDP
4194 odp_port_t port_no)
4195{
3453b4d6 4196 dp_netdev_input__(pmd, packets, false, port_no);
a90ed026
DDP
4197}
4198
4199static void
4200dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4201 struct dp_packet_batch *packets)
a90ed026 4202{
3453b4d6 4203 dp_netdev_input__(pmd, packets, true, 0);
a90ed026
DDP
4204}
4205
9080a111 4206struct dp_netdev_execute_aux {
65f13b50 4207 struct dp_netdev_pmd_thread *pmd;
324c8374 4208 long long now;
66e4ad8a 4209 const struct flow *flow;
9080a111
JR
4210};
4211
e4e74c3a
AW
4212static void
4213dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
4214 void *aux)
4215{
4216 struct dp_netdev *dp = get_dp_netdev(dpif);
4217 dp->dp_purge_aux = aux;
4218 dp->dp_purge_cb = cb;
4219}
4220
6b31e073 4221static void
623540e4
EJ
4222dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
4223 void *aux)
6b31e073
RW
4224{
4225 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 4226 dp->upcall_aux = aux;
6b31e073
RW
4227 dp->upcall_cb = cb;
4228}
4229
324c8374
IM
4230static void
4231dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
4232 long long now, bool purge)
4233{
4234 struct tx_port *tx;
4235 struct dp_netdev_port *port;
4236 long long interval;
4237
4238 HMAP_FOR_EACH (tx, node, &pmd->port_cache) {
9f7a3035 4239 if (!tx->port->dynamic_txqs) {
324c8374
IM
4240 continue;
4241 }
4242 interval = now - tx->last_used;
4243 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
4244 port = tx->port;
4245 ovs_mutex_lock(&port->txq_used_mutex);
4246 port->txq_used[tx->qid]--;
4247 ovs_mutex_unlock(&port->txq_used_mutex);
4248 tx->qid = -1;
4249 }
4250 }
4251}
4252
4253static int
4254dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
4255 struct tx_port *tx, long long now)
4256{
4257 struct dp_netdev_port *port;
4258 long long interval;
4259 int i, min_cnt, min_qid;
4260
4261 if (OVS_UNLIKELY(!now)) {
4262 now = time_msec();
4263 }
4264
4265 interval = now - tx->last_used;
4266 tx->last_used = now;
4267
4268 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
4269 return tx->qid;
4270 }
4271
4272 port = tx->port;
4273
4274 ovs_mutex_lock(&port->txq_used_mutex);
4275 if (tx->qid >= 0) {
4276 port->txq_used[tx->qid]--;
4277 tx->qid = -1;
4278 }
4279
4280 min_cnt = -1;
4281 min_qid = 0;
4282 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
4283 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
4284 min_cnt = port->txq_used[i];
4285 min_qid = i;
4286 }
4287 }
4288
4289 port->txq_used[min_qid]++;
4290 tx->qid = min_qid;
4291
4292 ovs_mutex_unlock(&port->txq_used_mutex);
4293
4294 dpif_netdev_xps_revalidate_pmd(pmd, now, false);
4295
4296 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
4297 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
4298 return min_qid;
4299}
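
/* Worked example (illustrative, not from the original source): a port with
 * netdev_n_txq() == 2 shared by three pmd threads.  The first two callers
 * get qids 0 and 1 (txq_used becomes {1, 1}); the third caller picks the
 * least used queue, qid 0, making txq_used {2, 1}.  A cached qid is reused
 * as long as the thread keeps transmitting within XPS_TIMEOUT_MS; after
 * that it is released and re-balanced on the next send. */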
4300
d0cca6c3
DDP
4301static struct tx_port *
4302pmd_tx_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
4303 odp_port_t port_no)
4304{
4305 return tx_port_lookup(&pmd->port_cache, port_no);
4306}
4307
a36de779 4308static int
d0cca6c3 4309push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
1895cc8d
PS
4310 const struct nlattr *attr,
4311 struct dp_packet_batch *batch)
a36de779 4312{
d0cca6c3 4313 struct tx_port *tun_port;
a36de779 4314 const struct ovs_action_push_tnl *data;
4c742796 4315 int err;
a36de779
PS
4316
4317 data = nl_attr_get(attr);
4318
d0cca6c3 4319 tun_port = pmd_tx_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
a36de779 4320 if (!tun_port) {
4c742796
PS
4321 err = -EINVAL;
4322 goto error;
a36de779 4323 }
324c8374 4324 err = netdev_push_header(tun_port->port->netdev, batch, data);
4c742796
PS
4325 if (!err) {
4326 return 0;
4327 }
4328error:
4329 dp_packet_delete_batch(batch, true);
4330 return err;
a36de779
PS
4331}
4332
66525ef3
PS
4333static void
4334dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
4335 struct dp_packet *packet, bool may_steal,
4336 struct flow *flow, ovs_u128 *ufid,
4337 struct ofpbuf *actions,
324c8374 4338 const struct nlattr *userdata, long long now)
66525ef3
PS
4339{
4340 struct dp_packet_batch b;
4341 int error;
4342
4343 ofpbuf_clear(actions);
4344
4345 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
4346 DPIF_UC_ACTION, userdata, actions,
4347 NULL);
4348 if (!error || error == ENOSPC) {
4349 packet_batch_init_packet(&b, packet);
66e4ad8a 4350 dp_netdev_execute_actions(pmd, &b, may_steal, flow,
324c8374 4351 actions->data, actions->size, now);
66525ef3
PS
4352 } else if (may_steal) {
4353 dp_packet_delete(packet);
4354 }
4355}
4356
a36de779 4357static void
1895cc8d 4358dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
09f9da0b 4359 const struct nlattr *a, bool may_steal)
9080a111
JR
4360{
4361 struct dp_netdev_execute_aux *aux = aux_;
623540e4 4362 uint32_t *depth = recirc_depth_get();
28e2fa02
DDP
4363 struct dp_netdev_pmd_thread *pmd = aux->pmd;
4364 struct dp_netdev *dp = pmd->dp;
09f9da0b 4365 int type = nl_attr_type(a);
324c8374 4366 long long now = aux->now;
d0cca6c3 4367 struct tx_port *p;
9080a111 4368
09f9da0b
JR
4369 switch ((enum ovs_action_attr)type) {
4370 case OVS_ACTION_ATTR_OUTPUT:
d0cca6c3 4371 p = pmd_tx_port_cache_lookup(pmd, u32_to_odp(nl_attr_get_u32(a)));
26a5075b 4372 if (OVS_LIKELY(p)) {
347ba9bb 4373 int tx_qid;
324c8374 4374 bool dynamic_txqs;
347ba9bb 4375
324c8374
IM
4376 dynamic_txqs = p->port->dynamic_txqs;
4377 if (dynamic_txqs) {
4378 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
4379 } else {
4380 atomic_read_relaxed(&pmd->static_tx_qid, &tx_qid);
4381 }
347ba9bb 4382
324c8374
IM
4383 netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
4384 dynamic_txqs);
ac8c2081 4385 return;
8a4e3a85 4386 }
09f9da0b
JR
4387 break;
4388
a36de779
PS
4389 case OVS_ACTION_ATTR_TUNNEL_PUSH:
4390 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 4391 struct dp_packet_batch tnl_pkt;
aaca4fe0 4392 struct dp_packet_batch *orig_packets_ = packets_;
a36de779
PS
4393 int err;
4394
4395 if (!may_steal) {
1895cc8d
PS
4396 dp_packet_batch_clone(&tnl_pkt, packets_);
4397 packets_ = &tnl_pkt;
aaca4fe0 4398 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
4399 }
4400
aaca4fe0
WT
4401 dp_packet_batch_apply_cutlen(packets_);
4402
d0cca6c3 4403 err = push_tnl_action(pmd, a, packets_);
a36de779
PS
4404 if (!err) {
4405 (*depth)++;
1895cc8d 4406 dp_netdev_recirculate(pmd, packets_);
a36de779 4407 (*depth)--;
a36de779
PS
4408 }
4409 return;
4410 }
4411 break;
4412
4413 case OVS_ACTION_ATTR_TUNNEL_POP:
4414 if (*depth < MAX_RECIRC_DEPTH) {
aaca4fe0 4415 struct dp_packet_batch *orig_packets_ = packets_;
a36de779
PS
4416 odp_port_t portno = u32_to_odp(nl_attr_get_u32(a));
4417
d0cca6c3 4418 p = pmd_tx_port_cache_lookup(pmd, portno);
a36de779 4419 if (p) {
1895cc8d 4420 struct dp_packet_batch tnl_pkt;
9235b479 4421 int i;
a36de779
PS
4422
4423 if (!may_steal) {
aaca4fe0
WT
4424 dp_packet_batch_clone(&tnl_pkt, packets_);
4425 packets_ = &tnl_pkt;
4426 dp_packet_batch_reset_cutlen(orig_packets_);
a36de779
PS
4427 }
4428
aaca4fe0
WT
4429 dp_packet_batch_apply_cutlen(packets_);
4430
324c8374 4431 netdev_pop_header(p->port->netdev, packets_);
1895cc8d 4432 if (!packets_->count) {
1c8f98d9
PS
4433 return;
4434 }
9235b479
PS
4435
4436 for (i = 0; i < packets_->count; i++) {
4437 packets_->packets[i]->md.in_port.odp_port = portno;
a36de779 4438 }
9235b479
PS
4439
4440 (*depth)++;
4441 dp_netdev_recirculate(pmd, packets_);
4442 (*depth)--;
a36de779
PS
4443 return;
4444 }
4445 }
4446 break;
4447
623540e4
EJ
4448 case OVS_ACTION_ATTR_USERSPACE:
4449 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
aaca4fe0 4450 struct dp_packet_batch *orig_packets_ = packets_;
1895cc8d 4451 struct dp_packet **packets = packets_->packets;
623540e4 4452 const struct nlattr *userdata;
aaca4fe0 4453 struct dp_packet_batch usr_pkt;
623540e4
EJ
4454 struct ofpbuf actions;
4455 struct flow flow;
7af12bd7 4456 ovs_u128 ufid;
aaca4fe0 4457 bool clone = false;
1c8f98d9 4458 int i;
4fc65926 4459
623540e4
EJ
4460 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
4461 ofpbuf_init(&actions, 0);
8cbf4f47 4462
aaca4fe0
WT
4463 if (packets_->trunc) {
4464 if (!may_steal) {
4465 dp_packet_batch_clone(&usr_pkt, packets_);
4466 packets_ = &usr_pkt;
4467 packets = packets_->packets;
4468 clone = true;
4469 dp_packet_batch_reset_cutlen(orig_packets_);
4470 }
4471
4472 dp_packet_batch_apply_cutlen(packets_);
4473 }
4474
1895cc8d 4475 for (i = 0; i < packets_->count; i++) {
cf62fa4c 4476 flow_extract(packets[i], &flow);
7af12bd7 4477 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
66525ef3 4478 dp_execute_userspace_action(pmd, packets[i], may_steal, &flow,
324c8374 4479 &ufid, &actions, userdata, now);
db73f716 4480 }
aaca4fe0
WT
4481
4482 if (clone) {
4483 dp_packet_delete_batch(packets_, true);
4484 }
4485
623540e4
EJ
4486 ofpbuf_uninit(&actions);
4487 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 4488
ac8c2081
DDP
4489 return;
4490 }
09f9da0b 4491 break;
572f732a 4492
adcf00ba
AZ
4493 case OVS_ACTION_ATTR_RECIRC:
4494 if (*depth < MAX_RECIRC_DEPTH) {
1895cc8d 4495 struct dp_packet_batch recirc_pkts;
1c8f98d9 4496 int i;
572f732a 4497
28e2fa02 4498 if (!may_steal) {
1895cc8d
PS
4499 dp_packet_batch_clone(&recirc_pkts, packets_);
4500 packets_ = &recirc_pkts;
28e2fa02 4501 }
8cbf4f47 4502
1895cc8d
PS
4503 for (i = 0; i < packets_->count; i++) {
4504 packets_->packets[i]->md.recirc_id = nl_attr_get_u32(a);
8cbf4f47 4505 }
28e2fa02
DDP
4506
4507 (*depth)++;
1895cc8d 4508 dp_netdev_recirculate(pmd, packets_);
adcf00ba
AZ
4509 (*depth)--;
4510
ac8c2081 4511 return;
adcf00ba 4512 }
ac8c2081
DDP
4513
4514 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 4515 break;
572f732a 4516
5cf3edb3
DDP
4517 case OVS_ACTION_ATTR_CT: {
4518 const struct nlattr *b;
4519 bool commit = false;
4520 unsigned int left;
4521 uint16_t zone = 0;
4522 const char *helper = NULL;
4523 const uint32_t *setmark = NULL;
4524 const struct ovs_key_ct_labels *setlabel = NULL;
4525
4526 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
4527 nl_attr_get_size(a)) {
4528 enum ovs_ct_attr sub_type = nl_attr_type(b);
4529
4530 switch(sub_type) {
4531 case OVS_CT_ATTR_COMMIT:
4532 commit = true;
4533 break;
4534 case OVS_CT_ATTR_ZONE:
4535 zone = nl_attr_get_u16(b);
4536 break;
4537 case OVS_CT_ATTR_HELPER:
4538 helper = nl_attr_get_string(b);
4539 break;
4540 case OVS_CT_ATTR_MARK:
4541 setmark = nl_attr_get(b);
4542 break;
4543 case OVS_CT_ATTR_LABELS:
4544 setlabel = nl_attr_get(b);
4545 break;
4546 case OVS_CT_ATTR_NAT:
4547 case OVS_CT_ATTR_UNSPEC:
4548 case __OVS_CT_ATTR_MAX:
4549 OVS_NOT_REACHED();
4550 }
4551 }
4552
66e4ad8a
DDP
4553 conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, commit,
4554 zone, setmark, setlabel, helper);
07659514 4555 break;
5cf3edb3 4556 }
07659514 4557
09f9da0b
JR
4558 case OVS_ACTION_ATTR_PUSH_VLAN:
4559 case OVS_ACTION_ATTR_POP_VLAN:
4560 case OVS_ACTION_ATTR_PUSH_MPLS:
4561 case OVS_ACTION_ATTR_POP_MPLS:
4562 case OVS_ACTION_ATTR_SET:
6d670e7f 4563 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b 4564 case OVS_ACTION_ATTR_SAMPLE:
53e1d6f1 4565 case OVS_ACTION_ATTR_HASH:
09f9da0b 4566 case OVS_ACTION_ATTR_UNSPEC:
aaca4fe0 4567 case OVS_ACTION_ATTR_TRUNC:
09f9da0b
JR
4568 case __OVS_ACTION_ATTR_MAX:
4569 OVS_NOT_REACHED();
da546e07 4570 }
ac8c2081 4571
1895cc8d 4572 dp_packet_delete_batch(packets_, may_steal);
98403001
BP
4573}
4574
4edb9ae9 4575static void
65f13b50 4576dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
1895cc8d 4577 struct dp_packet_batch *packets,
66e4ad8a 4578 bool may_steal, const struct flow *flow,
324c8374
IM
4579 const struct nlattr *actions, size_t actions_len,
4580 long long now)
72865317 4581{
66e4ad8a 4582 struct dp_netdev_execute_aux aux = { pmd, now, flow };
9080a111 4583
1895cc8d 4584 odp_execute_actions(&aux, packets, may_steal, actions,
8cbf4f47 4585 actions_len, dp_execute_cb);
72865317
BP
4586}
4587
4d4e68ed
DDP
4588struct dp_netdev_ct_dump {
4589 struct ct_dpif_dump_state up;
4590 struct conntrack_dump dump;
4591 struct conntrack *ct;
4592 struct dp_netdev *dp;
4593};
4594
4595static int
4596dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
4597 const uint16_t *pzone)
4598{
4599 struct dp_netdev *dp = get_dp_netdev(dpif);
4600 struct dp_netdev_ct_dump *dump;
4601
4602 dump = xzalloc(sizeof *dump);
4603 dump->dp = dp;
4604 dump->ct = &dp->conntrack;
4605
4606 conntrack_dump_start(&dp->conntrack, &dump->dump, pzone);
4607
4608 *dump_ = &dump->up;
4609
4610 return 0;
4611}
4612
4613static int
4614dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
4615 struct ct_dpif_dump_state *dump_,
4616 struct ct_dpif_entry *entry)
4617{
4618 struct dp_netdev_ct_dump *dump;
4619
4620 INIT_CONTAINER(dump, dump_, up);
4621
4622 return conntrack_dump_next(&dump->dump, entry);
4623}
4624
4625static int
4626dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
4627 struct ct_dpif_dump_state *dump_)
4628{
4629 struct dp_netdev_ct_dump *dump;
4630 int err;
4631
4632 INIT_CONTAINER(dump, dump_, up);
4633
4634 err = conntrack_dump_done(&dump->dump);
4635
4636 free(dump);
4637
4638 return err;
4639}
4640
5d9cbb4c
DDP
4641static int
4642dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
4643{
4644 struct dp_netdev *dp = get_dp_netdev(dpif);
4645
4646 return conntrack_flush(&dp->conntrack, zone);
4647}
4648
72865317 4649const struct dpif_class dpif_netdev_class = {
72865317 4650 "netdev",
6553d06b 4651 dpif_netdev_init,
2197d7ab 4652 dpif_netdev_enumerate,
0aeaabc8 4653 dpif_netdev_port_open_type,
72865317
BP
4654 dpif_netdev_open,
4655 dpif_netdev_close,
7dab847a 4656 dpif_netdev_destroy,
e4cfed38
PS
4657 dpif_netdev_run,
4658 dpif_netdev_wait,
72865317 4659 dpif_netdev_get_stats,
72865317
BP
4660 dpif_netdev_port_add,
4661 dpif_netdev_port_del,
3eb67853 4662 dpif_netdev_port_set_config,
72865317
BP
4663 dpif_netdev_port_query_by_number,
4664 dpif_netdev_port_query_by_name,
98403001 4665 NULL, /* port_get_pid */
b0ec0f27
BP
4666 dpif_netdev_port_dump_start,
4667 dpif_netdev_port_dump_next,
4668 dpif_netdev_port_dump_done,
72865317
BP
4669 dpif_netdev_port_poll,
4670 dpif_netdev_port_poll_wait,
72865317 4671 dpif_netdev_flow_flush,
ac64794a
BP
4672 dpif_netdev_flow_dump_create,
4673 dpif_netdev_flow_dump_destroy,
4674 dpif_netdev_flow_dump_thread_create,
4675 dpif_netdev_flow_dump_thread_destroy,
704a1e09 4676 dpif_netdev_flow_dump_next,
1a0c894a 4677 dpif_netdev_operate,
6b31e073
RW
4678 NULL, /* recv_set */
4679 NULL, /* handlers_set */
f2eee189 4680 dpif_netdev_pmd_set,
5bf93d67 4681 dpif_netdev_queue_to_priority,
6b31e073
RW
4682 NULL, /* recv */
4683 NULL, /* recv_wait */
4684 NULL, /* recv_purge */
e4e74c3a 4685 dpif_netdev_register_dp_purge_cb,
6b31e073
RW
4686 dpif_netdev_register_upcall_cb,
4687 dpif_netdev_enable_upcall,
4688 dpif_netdev_disable_upcall,
b5cbbcf6 4689 dpif_netdev_get_datapath_version,
4d4e68ed
DDP
4690 dpif_netdev_ct_dump_start,
4691 dpif_netdev_ct_dump_next,
4692 dpif_netdev_ct_dump_done,
5d9cbb4c 4693 dpif_netdev_ct_flush,
72865317 4694};
614c4892 4695
74cc3969
BP
4696static void
4697dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
4698 const char *argv[], void *aux OVS_UNUSED)
4699{
e9985d6a 4700 struct dp_netdev_port *port;
74cc3969 4701 struct dp_netdev *dp;
ff073a71 4702 odp_port_t port_no;
74cc3969 4703
8a4e3a85 4704 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
4705 dp = shash_find_data(&dp_netdevs, argv[1]);
4706 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 4707 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
4708 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
4709 return;
4710 }
8a4e3a85
BP
4711 ovs_refcount_ref(&dp->ref_cnt);
4712 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 4713
59e6d833 4714 ovs_mutex_lock(&dp->port_mutex);
e9985d6a 4715 if (get_port_by_name(dp, argv[2], &port)) {
74cc3969 4716 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 4717 goto exit;
74cc3969
BP
4718 }
4719
ff073a71
BP
4720 port_no = u32_to_odp(atoi(argv[3]));
4721 if (!port_no || port_no == ODPP_NONE) {
74cc3969 4722 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 4723 goto exit;
74cc3969 4724 }
ff073a71 4725 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 4726 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 4727 goto exit;
74cc3969 4728 }
59e6d833 4729
e9985d6a
DDP
4730 /* Remove port. */
4731 hmap_remove(&dp->ports, &port->node);
4732 dp_netdev_del_port_from_all_pmds(dp, port);
59e6d833 4733
e9985d6a
DDP
4734 /* Reinsert with new port number. */
4735 port->port_no = port_no;
4736 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
4737 dp_netdev_add_port_to_pmds(dp, port);
59e6d833 4738
d33ed218 4739 seq_change(dp->port_seq);
74cc3969 4740 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
4741
4742exit:
59e6d833 4743 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 4744 dp_netdev_unref(dp);
74cc3969
BP
4745}
4746
0cbfe35d
BP
4747static void
4748dpif_dummy_register__(const char *type)
4749{
4750 struct dpif_class *class;
4751
4752 class = xmalloc(sizeof *class);
4753 *class = dpif_netdev_class;
4754 class->type = xstrdup(type);
4755 dp_register_provider(class);
4756}
4757
8420c7ad
BP
4758static void
4759dpif_dummy_override(const char *type)
4760{
65d43fdc
YT
4761 int error;
4762
4763 /*
4764 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
4765 * a userland-only build. It's useful for testsuite.
4766 */
4767 error = dp_unregister_provider(type);
4768 if (error == 0 || error == EAFNOSUPPORT) {
8420c7ad
BP
4769 dpif_dummy_register__(type);
4770 }
4771}
4772
614c4892 4773void
8420c7ad 4774dpif_dummy_register(enum dummy_level level)
614c4892 4775{
8420c7ad 4776 if (level == DUMMY_OVERRIDE_ALL) {
0cbfe35d
BP
4777 struct sset types;
4778 const char *type;
4779
4780 sset_init(&types);
4781 dp_enumerate_types(&types);
4782 SSET_FOR_EACH (type, &types) {
8420c7ad 4783 dpif_dummy_override(type);
0cbfe35d
BP
4784 }
4785 sset_destroy(&types);
8420c7ad
BP
4786 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
4787 dpif_dummy_override("system");
614c4892 4788 }
0cbfe35d
BP
4789
4790 dpif_dummy_register__("dummy");
74cc3969
BP
4791
4792 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 4793 "dp port new-number",
74cc3969 4794 3, 3, dpif_dummy_change_port_number, NULL);
614c4892 4795}
0de8783a
JR
4796\f
4797/* Datapath Classifier. */
4798
4799/* A set of rules that all have the same fields wildcarded. */
4800struct dpcls_subtable {
4801 /* The fields are only used by writers. */
4802 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
4803
4804 /* These fields are accessed by readers. */
4805 struct cmap rules; /* Contains "struct dpcls_rule"s. */
3453b4d6
JS
4806 uint32_t hit_cnt; /* Number of match hits in subtable in current
4807 optimization interval. */
0de8783a
JR
4808 struct netdev_flow_key mask; /* Wildcards for fields (const). */
4809 /* 'mask' must be the last field, additional space is allocated here. */
4810};
4811
4812/* Initializes 'cls' as a classifier that initially contains no classification
4813 * rules. */
4814static void
4815dpcls_init(struct dpcls *cls)
4816{
4817 cmap_init(&cls->subtables_map);
da9cfca6 4818 pvector_init(&cls->subtables);
0de8783a
JR
4819}
4820
4821static void
4822dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
4823{
3453b4d6 4824 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
da9cfca6 4825 pvector_remove(&cls->subtables, subtable);
0de8783a
JR
4826 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
4827 subtable->mask.hash);
4828 cmap_destroy(&subtable->rules);
4829 ovsrcu_postpone(free, subtable);
4830}
4831
4832/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
4833 * caller's responsibility.
4834 * May only be called after all the readers have been terminated. */
4835static void
4836dpcls_destroy(struct dpcls *cls)
4837{
4838 if (cls) {
4839 struct dpcls_subtable *subtable;
4840
4841 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
361d808d 4842 ovs_assert(cmap_count(&subtable->rules) == 0);
0de8783a
JR
4843 dpcls_destroy_subtable(cls, subtable);
4844 }
4845 cmap_destroy(&cls->subtables_map);
da9cfca6 4846 pvector_destroy(&cls->subtables);
0de8783a
JR
4847 }
4848}
4849
4850static struct dpcls_subtable *
4851dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4852{
4853 struct dpcls_subtable *subtable;
4854
4855 /* Need to add one. */
caeb4906
JR
4856 subtable = xmalloc(sizeof *subtable
4857 - sizeof subtable->mask.mf + mask->len);
0de8783a 4858 cmap_init(&subtable->rules);
3453b4d6 4859 subtable->hit_cnt = 0;
0de8783a
JR
4860 netdev_flow_key_clone(&subtable->mask, mask);
4861 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
3453b4d6 4862 /* Add the new subtable at the end of the pvector (with no hits yet) */
da9cfca6 4863 pvector_insert(&cls->subtables, subtable, 0);
84dbfb2b 4864 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
3453b4d6 4865 cmap_count(&cls->subtables_map), subtable, cls->in_port);
da9cfca6 4866 pvector_publish(&cls->subtables);
0de8783a
JR
4867
4868 return subtable;
4869}
4870
4871static inline struct dpcls_subtable *
4872dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4873{
4874 struct dpcls_subtable *subtable;
4875
4876 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
4877 &cls->subtables_map) {
4878 if (netdev_flow_key_equal(&subtable->mask, mask)) {
4879 return subtable;
4880 }
4881 }
4882 return dpcls_create_subtable(cls, mask);
4883}
4884
3453b4d6
JS
4885
4886/* Periodically sort the dpcls subtable vectors according to hit counts */
4887static void
4888dpcls_sort_subtable_vector(struct dpcls *cls)
4889{
4890 struct pvector *pvec = &cls->subtables;
4891 struct dpcls_subtable *subtable;
4892
4893 PVECTOR_FOR_EACH (subtable, pvec) {
4894 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
4895 subtable->hit_cnt = 0;
4896 }
4897 pvector_publish(pvec);
4898}
4899
4900static inline void
4901dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd)
4902{
4903 struct dpcls *cls;
4904 long long int now = time_msec();
4905
4906 if (now > pmd->next_optimization) {
4907 /* Try to obtain the flow lock to block out revalidator threads.
4908 * If not possible, just try next time. */
4909 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
4910 /* Optimize each classifier */
4911 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
4912 dpcls_sort_subtable_vector(cls);
4913 }
4914 ovs_mutex_unlock(&pmd->flow_mutex);
4915 /* Start new measuring interval */
4916 pmd->next_optimization = now + DPCLS_OPTIMIZATION_INTERVAL;
4917 }
4918 }
4919}
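
/* Example of the effect (illustrative, not from the original source): if a
 * classifier holds subtables A and B with hit_cnt 900 and 100 at the end of
 * an interval, the sort gives A the higher pvector priority, so
 * dpcls_lookup() probes A before B during the next interval, and both hit
 * counters restart from zero. */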
4920
0de8783a
JR
4921/* Insert 'rule' into 'cls'. */
4922static void
4923dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
4924 const struct netdev_flow_key *mask)
4925{
4926 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
4927
3453b4d6 4928 /* Refer to subtable's mask, also for later removal. */
0de8783a
JR
4929 rule->mask = &subtable->mask;
4930 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
4931}
4932
4933/* Removes 'rule' from 'cls', also destructing the 'rule'. */
4934static void
4935dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
4936{
4937 struct dpcls_subtable *subtable;
4938
4939 ovs_assert(rule->mask);
4940
3453b4d6 4941 /* Get subtable from reference in rule->mask. */
0de8783a 4942 INIT_CONTAINER(subtable, rule->mask, mask);
0de8783a
JR
4943 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
4944 == 0) {
3453b4d6 4945 /* Delete empty subtable. */
0de8783a 4946 dpcls_destroy_subtable(cls, subtable);
da9cfca6 4947 pvector_publish(&cls->subtables);
0de8783a
JR
4948 }
4949}
4950
361d808d
JR
4951/* Returns true if 'target' satisfies 'key' in 'mask', that is, if for each
4952 * 1-bit in 'mask' the corresponding bits in 'key' and 'target' are the same. */
4953static inline bool
4954dpcls_rule_matches_key(const struct dpcls_rule *rule,
4955 const struct netdev_flow_key *target)
4956{
4957 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
4958 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5fcff47b 4959 uint64_t value;
0de8783a 4960
4961 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
4962 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
4963 return false;
4964 }
4965 }
4966 return true;
4967}
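/* Illustrative sketch, not part of this file: the loop above works because a
 * rule's flow values are stored already masked, so a candidate matches when
 * "(target & mask) == flow" holds for every 64-bit word the rule cares
 * about.  The hypothetical 'toy_masked_match' below spells out that
 * invariant for plain uint64_t arrays. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* 'flow' must already satisfy flow[i] == (original_flow[i] & mask[i]);
 * 'target' is the unmasked search key. */
bool
toy_masked_match(const uint64_t flow[], const uint64_t mask[],
                 const uint64_t target[], size_t n_words)
{
    for (size_t i = 0; i < n_words; i++) {
        if ((target[i] & mask[i]) != flow[i]) {
            return false;
        }
    }
    return true;
}

/* Example: with mask[0] == 0xffff and flow[0] == 0x0800, any target whose low
 * 16 bits are 0x0800 matches, e.g. toy_masked_match() returns true for
 * target[0] == 0xdeadbeef00000800ULL. */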
4968
4969/* For each miniflow in 'keys' performs a classifier lookup writing the result
4970 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
4971 * NULL it is skipped.
4972 *
4973 * This function is optimized for use in the userspace datapath and therefore
4974 * does not implement a lot of features available in the standard
4975 * classifier_lookup() function. Specifically, it does not implement
4976 * priorities, instead returning any rule which matches the flow.
4977 *
5b1c9c78 4978 * Returns true if all miniflows found a corresponding rule. */
0de8783a 4979static bool
4980dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
4981 struct dpcls_rule **rules, const size_t cnt,
4982 int *num_lookups_p)
0de8783a 4983{
 4984 /* The received 'cnt' miniflows are the search-keys that will be processed
 4985 * in batches of 16 elements. N_MAPS is the number of these 16-element
 4986 * batches, e.g. for 'cnt' = 32, N_MAPS is 2. A batch size of 16 was
 4987 * experimentally found to be faster than 8 or 32. */
4988 typedef uint16_t map_type;
4989#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
4990
4991#if !defined(__CHECKER__) && !defined(_WIN32)
4992 const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
4993#else
cd159f1a 4994 enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
4995#endif
4996 map_type maps[N_MAPS];
4997 struct dpcls_subtable *subtable;
4998
4999 memset(maps, 0xff, sizeof maps);
5000 if (cnt % MAP_BITS) {
5001 maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
5002 }
5003 memset(rules, 0, cnt * sizeof *rules);
5004
5005 int lookups_match = 0, subtable_pos = 1;
5006
5007 /* The Datapath classifier - aka dpcls - is composed of subtables.
5008 * Subtables are dynamically created as needed when new rules are inserted.
5009 * Each subtable collects rules with matches on a specific subset of packet
5010 * fields as defined by the subtable's mask. We proceed to process every
5011 * search-key against each subtable, but when a match is found for a
5012 * search-key, the search for that key can stop because the rules are
5013 * non-overlapping. */
da9cfca6 5014 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
5015 const struct netdev_flow_key *mkeys = keys;
5016 struct dpcls_rule **mrules = rules;
5017 map_type remains = 0;
5018 int m;
5019
5020 BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
5021
5b1c9c78 5022 /* Loops on each batch of 16 search-keys. */
5023 for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
5024 uint32_t hashes[MAP_BITS];
5025 const struct cmap_node *nodes[MAP_BITS];
5026 unsigned long map = maps[m];
5027 int i;
5028
5029 if (!map) {
5030 continue; /* Skip empty maps. */
5031 }
5032
5033 /* Compute hashes for the remaining keys. Each search-key is
5034 * masked with the subtable's mask to avoid hashing the wildcarded
5035 * bits. */
3ee6026a 5036 ULLONG_FOR_EACH_1(i, map) {
5037 hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
5038 &subtable->mask);
5039 }
5040 /* Lookup. */
5041 map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
5042 /* Check results. When the i-th bit of map is set, it means that a
5043 * set of nodes with a matching hash value was found for the i-th
5044 * search-key. Due to possible hash collisions we need to check
5045 * which of the found rules, if any, really matches our masked
5046 * search-key. */
3ee6026a 5047 ULLONG_FOR_EACH_1(i, map) {
5048 struct dpcls_rule *rule;
5049
5050 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
5051 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
5052 mrules[i] = rule;
 5053 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
 5054 * within the one-second optimization interval. */
5055 subtable->hit_cnt++;
5056 lookups_match += subtable_pos;
5057 goto next;
5058 }
5059 }
5060 /* None of the found rules was a match. Reset the i-th bit to
5061 * keep searching in the next subtable. */
3ee6026a 5062 ULLONG_SET0(map, i); /* Did not match. */
5063 next:
5064 ; /* Keep Sparse happy. */
5065 }
5066 maps[m] &= ~map; /* Clear the found rules. */
5067 remains |= maps[m];
5068 }
5069 if (!remains) {
5070 if (num_lookups_p) {
5071 *num_lookups_p = lookups_match;
5072 }
5073 return true; /* All found. */
5074 }
5075 subtable_pos++;
5076 }
5077 if (num_lookups_p) {
5078 *num_lookups_p = lookups_match;
5079 }
5080 return false; /* Some misses. */
5081}
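/* Illustrative sketch, not part of this file: dpcls_lookup() above keeps a
 * per-batch bitmap of still-unresolved search-keys, clears a key's bit as
 * soon as some subtable yields a match, and stops visiting further subtables
 * once the bitmap is empty.  The hypothetical toy below reproduces that
 * control flow for a batch of up to 16 integer keys, with "subtables" that
 * are just divisibility tests. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_BATCH 16

struct div_subtable {
    int divisor;           /* A key "matches" if it is divisible by this. */
};

/* Fills results[i] with the first subtable matching keys[i]; returns true if
 * every key found a match. */
bool
toy_lookup_batch(const struct div_subtable subtables[], int n_subtables,
                 const int keys[], int cnt,
                 const struct div_subtable *results[])
{
    uint16_t pending = cnt < TOY_BATCH ? (uint16_t) ((1u << cnt) - 1)
                                       : UINT16_MAX;

    for (int s = 0; s < n_subtables && pending; s++) {
        for (int i = 0; i < cnt; i++) {
            if ((pending & (1u << i)) && keys[i] % subtables[s].divisor == 0) {
                results[i] = &subtables[s];
                pending &= ~(1u << i);  /* Resolved: stop searching this key. */
            }
        }
    }
    return !pending;
}

int
main(void)
{
    const struct div_subtable subtables[] = { { 5 }, { 3 }, { 2 } };
    const int keys[] = { 10, 9, 7, 4 };
    const struct div_subtable *results[TOY_BATCH] = { NULL };

    bool all = toy_lookup_batch(subtables, 3, keys, 4, results);
    for (int i = 0; i < 4; i++) {
        printf("key %d -> %s\n", keys[i], results[i] ? "matched" : "no match");
    }
    printf("all matched: %s\n", all ? "yes" : "no");
    return 0;
}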