/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "dpif-netdev.h"

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>

#include "classifier.h"
#include "cmap.h"
#include "csum.h"
#include "dpif.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "dynamic-string.h"
#include "fat-rwlock.h"
#include "flow.h"
#include "latch.h"
#include "list.h"
#include "meta-flow.h"
#include "netdev.h"
#include "netdev-dpdk.h"
#include "netdev-vport.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "ofp-print.h"
#include "ofpbuf.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "packet-dpif.h"
#include "packets.h"
#include "poll-loop.h"
#include "random.h"
#include "seq.h"
#include "shash.h"
#include "sset.h"
#include "timeval.h"
#include "unixctl.h"
#include "util.h"
#include "vlog.h"

VLOG_DEFINE_THIS_MODULE(dpif_netdev);

/* By default, choose a priority in the middle. */
#define NETDEV_RULE_PRIORITY 0x8000

#define FLOW_DUMP_MAX_BATCH 50
/* Use a per-thread recirc_depth to prevent recirculation loops. */
#define MAX_RECIRC_DEPTH 5
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)

/* Configuration parameters. */
enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */

/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dp_netdev's. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
    = SHASH_INITIALIZER(&dp_netdevs);

static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);

/* Stores a miniflow with inline values. */

/* There are fields in the flow structure that we never use, so we can save a
 * few words of memory by not storing them. */
#define NETDEV_KEY_BUF_SIZE_U32 (FLOW_U32S                 \
                                 - MINI_N_INLINE           \
                                 - FLOW_U32_SIZE(regs)     \
                                 - FLOW_U32_SIZE(metadata) \
                                )
struct netdev_flow_key {
    struct miniflow flow;
    uint32_t buf[NETDEV_KEY_BUF_SIZE_U32];
};

/* Exact match cache for frequently used flows
 *
 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 * search its entries for a miniflow that matches exactly the miniflow of the
 * packet.  It stores the 'cls_rule' (rule) that matches the miniflow.
 *
 * A cache entry holds a reference to its 'dp_netdev_flow'.
 *
 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 * entries.  The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away).
 * Each value is the index of a cache entry where the miniflow could be.
 *
 *
 * Thread-safety
 * =============
 *
 * Each pmd_thread has its own private exact match cache.
 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 */

#define EM_FLOW_HASH_SHIFT 10
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2

struct emc_entry {
    uint32_t hash;
    uint32_t mf_len;
    struct netdev_flow_key mf;
    struct dp_netdev_flow *flow;
};

struct emc_cache {
    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
};

/* Iterate in the exact match cache through every entry that might contain a
 * miniflow with hash 'HASH'. */
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
         i__ < EM_FLOW_HASH_SEGS;                                            \
         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)

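/* Worked example (an illustration, not part of the original source): with
 * EM_FLOW_HASH_SHIFT == 10 and EM_FLOW_HASH_SEGS == 2, a packet hash of
 * 0xdeadbeef yields two candidate slots:
 *
 *     seg 0:  0xdeadbeef        & 0x3ff  ==  0x2ef
 *     seg 1: (0xdeadbeef >> 10) & 0x3ff  ==  0x36f
 *
 * so the miniflow may live in entries[0x2ef] or entries[0x36f], and the
 * remaining 12 bits of the hash are thrown away. */
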
/* Datapath based on the network device interface from netdev.h.
 *
 *
 * Thread-safety
 * =============
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 *
 * Acquisition order is, from outermost to innermost:
 *
 *    dp_netdev_mutex (global)
 *    port_mutex
 *    flow_mutex
 */
struct dp_netdev {
    const struct dpif_class *const class;
    const char *const name;
    struct dpif *dpif;
    struct ovs_refcount ref_cnt;
    atomic_flag destroyed;

    /* Flows.
     *
     * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
     * changes to 'cls' must be made while still holding the 'flow_mutex'.
     */
    struct ovs_mutex flow_mutex;
    struct classifier cls;
    struct cmap flow_table OVS_GUARDED; /* Flow table. */

    /* Statistics.
     *
     * ovsthread_stats is internally synchronized. */
    struct ovsthread_stats stats; /* Contains 'struct dp_netdev_stats *'. */

    /* Ports.
     *
     * Protected by RCU.  Take the mutex to add or remove ports. */
    struct ovs_mutex port_mutex;
    struct cmap ports;
    struct seq *port_seq;       /* Incremented whenever a port changes. */

    /* Protects access to ofproto-dpif-upcall interface during revalidator
     * thread synchronization. */
    struct fat_rwlock upcall_rwlock;
    upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
    void *upcall_aux;

    /* Stores all 'struct dp_netdev_pmd_thread's. */
    struct cmap poll_threads;

    /* Protects access to the 'struct dp_netdev_pmd_thread' instance for
     * non-pmd threads. */
    struct ovs_mutex non_pmd_mutex;

    /* Each pmd thread will store its pointer to
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
    ovsthread_key_t per_pmd_key;

    /* Number of rx queues for each dpdk interface and the cpu mask
     * for pinning pmd threads. */
    size_t n_dpdk_rxqs;
    char *pmd_cmask;
};

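/* Illustrative sketch (not in the original source): code that needs more than
 * one of these locks must take them in the order given above, e.g.:
 *
 *     ovs_mutex_lock(&dp->port_mutex);     (outermost of the two)
 *     ovs_mutex_lock(&dp->flow_mutex);     (innermost)
 *     ...
 *     ovs_mutex_unlock(&dp->flow_mutex);
 *     ovs_mutex_unlock(&dp->port_mutex);
 *
 * Taking them in the opposite order in another thread risks deadlock. */
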
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                    odp_port_t);

enum dp_stat_type {
    DP_STAT_HIT,                /* Packets that matched in the flow table. */
    DP_STAT_MISS,               /* Packets that did not match. */
    DP_STAT_LOST,               /* Packets not passed up to the client. */
    DP_N_STATS
};

/* Contained by struct dp_netdev's 'stats' member. */
struct dp_netdev_stats {
    struct ovs_mutex mutex;     /* Protects 'n'. */

    /* Indexed by DP_STAT_*, protected by 'mutex'. */
    unsigned long long int n[DP_N_STATS] OVS_GUARDED;
};


/* A port in a netdev-based datapath. */
struct dp_netdev_port {
    struct cmap_node node;      /* Node in dp_netdev's 'ports'. */
    odp_port_t port_no;
    struct netdev *netdev;
    struct netdev_saved_flags *sf;
    struct netdev_rxq **rxq;
    struct ovs_refcount ref_cnt;
    char *type;                 /* Port type as requested by user. */
};

/* A flow in dp_netdev's 'flow_table'.
 *
 *
 * Thread-safety
 * =============
 *
 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 * its dp_netdev's classifier.  The text below calls this classifier 'cls'.
 *
 * Motivation
 * ----------
 *
 * The thread safety rules described here for "struct dp_netdev_flow" are
 * motivated by two goals:
 *
 *    - Prevent threads that read members of "struct dp_netdev_flow" from
 *      reading bad data due to changes by some thread concurrently modifying
 *      those members.
 *
 *    - Prevent two threads making changes to members of a given "struct
 *      dp_netdev_flow" from interfering with each other.
 *
 *
 * Rules
 * -----
 *
 * A flow 'flow' may be accessed without a risk of being freed during an RCU
 * grace period.  Code that needs to hold onto a flow for a while
 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 *
 * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 * from modification.
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 */
struct dp_netdev_flow {
    bool dead;
    /* Packet classification. */
    const struct cls_rule cr;   /* In owning dp_netdev's 'cls'. */

    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev's 'flow_table'. */
    const struct flow flow;      /* The flow that created this entry. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    /* Statistics.
     *
     * Reading or writing these members requires 'mutex'. */
    struct ovsthread_stats stats; /* Contains "struct dp_netdev_flow_stats". */

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;
};

static void dp_netdev_flow_unref(struct dp_netdev_flow *);
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);

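/* Usage sketch (an illustration, not in the original source): a reader that
 * wants to keep a flow beyond the current RCU grace period takes its own
 * reference and gives it back when done:
 *
 *     struct dp_netdev_flow *flow = ...looked up under RCU...;
 *
 *     if (flow && dp_netdev_flow_ref(flow)) {
 *         ...use 'flow': it cannot be freed, but it may still be removed
 *            from 'cls' and marked 'dead' concurrently...
 *         dp_netdev_flow_unref(flow);
 *     }
 */
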
/* Contained by struct dp_netdev_flow's 'stats' member. */
struct dp_netdev_flow_stats {
    struct ovs_mutex mutex;         /* Guards all the other members. */

    long long int used OVS_GUARDED; /* Last used time, in monotonic msecs. */
    long long int packet_count OVS_GUARDED; /* Number of packets matched. */
    long long int byte_count OVS_GUARDED;   /* Number of bytes matched. */
    uint16_t tcp_flags OVS_GUARDED; /* Bitwise-OR of seen tcp_flags values. */
};

/* A set of datapath actions within a "struct dp_netdev_flow".
 *
 *
 * Thread-safety
 * =============
 *
 * A struct dp_netdev_actions 'actions' is protected with RCU. */
struct dp_netdev_actions {
    /* These members are immutable: they do not change during the struct's
     * lifetime. */
    struct nlattr *actions;     /* Sequence of OVS_ACTION_ATTR_* attributes. */
    unsigned int size;          /* Size of 'actions', in bytes. */
};

struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
                                                   size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
    const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);

/* PMD: Poll mode drivers.  A PMD accesses devices by polling, to eliminate
 * the performance overhead of interrupt processing.  Therefore netdev cannot
 * implement rx-wait for these devices: dpif-netdev needs to poll them to
 * check for received packets.  Each pmd thread polls the devices assigned to
 * it.
 *
 * DPDK uses PMDs to access NICs.
 *
 * Note: the instance with cpu core id NON_PMD_CORE_ID is reserved for I/O of
 * all non-pmd threads.  No actual thread is created for that instance. */
struct dp_netdev_pmd_thread {
    struct dp_netdev *dp;
    struct cmap_node node;      /* In 'dp->poll_threads'. */
    /* Per thread exact-match cache.  Note, the instance for cpu core
     * NON_PMD_CORE_ID can be accessed by multiple threads, and thus needs
     * to be protected (e.g. by 'dp_netdev_mutex').  Every other instance
     * is accessed only by its own pmd thread. */
    struct emc_cache flow_cache;
    struct latch exit_latch;    /* For terminating the pmd thread. */
    atomic_uint change_seq;     /* For reloading pmd ports. */
    pthread_t thread;
    int index;                  /* Idx of this pmd thread among pmd threads
                                 * on the same numa node. */
    int core_id;                /* CPU core id of this pmd thread. */
    int numa_id;                /* numa node id of this pmd thread. */
};

#define PMD_INITIAL_SEQ 1

/* Interface to netdev-based datapath. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;
    uint64_t last_port_seq;
};

static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                              struct dp_netdev_port **portp);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
                            struct dp_netdev_port **portp);
static void dp_netdev_free(struct dp_netdev *)
    OVS_REQUIRES(dp_netdev_mutex);
static void dp_netdev_flow_flush(struct dp_netdev *);
static int do_add_port(struct dp_netdev *dp, const char *devname,
                       const char *type, odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
    OVS_REQUIRES(dp->port_mutex);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                      struct dpif_packet **, int c,
                                      bool may_steal, struct pkt_metadata *,
                                      const struct nlattr *actions,
                                      size_t actions_len);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dpif_packet **, int cnt,
                            struct pkt_metadata *);
static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, int index,
                                    int core_id, int numa_id);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp);
static struct dp_netdev_pmd_thread *dp_netdev_get_nonpmd(struct dp_netdev *dp);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);

static void emc_clear_entry(struct emc_entry *ce);

static void
emc_cache_init(struct emc_cache *flow_cache)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        flow_cache->entries[i].flow = NULL;
        flow_cache->entries[i].hash = 0;
        flow_cache->entries[i].mf_len = 0;
        miniflow_initialize(&flow_cache->entries[i].mf.flow,
                            flow_cache->entries[i].mf.buf);
    }
}

static void
emc_cache_uninit(struct emc_cache *flow_cache)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
        emc_clear_entry(&flow_cache->entries[i]);
    }
}

static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
{
    ovs_assert(dpif->dpif_class->open == dpif_netdev_open);
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
}

static struct dp_netdev *
get_dp_netdev(const struct dpif *dpif)
{
    return dpif_netdev_cast(dpif)->dp;
}

static int
dpif_netdev_enumerate(struct sset *all_dps,
                      const struct dpif_class *dpif_class)
{
    struct shash_node *node;

    ovs_mutex_lock(&dp_netdev_mutex);
    SHASH_FOR_EACH(node, &dp_netdevs) {
        struct dp_netdev *dp = node->data;
        if (dpif_class != dp->class) {
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
             * If the class doesn't match, skip this dpif. */
            continue;
        }
        sset_add(all_dps, node->name);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return 0;
}

static bool
dpif_netdev_class_is_dummy(const struct dpif_class *class)
{
    return class != &dpif_netdev_class;
}

static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    return strcmp(type, "internal") ? type
           : dpif_netdev_class_is_dummy(class) ? "dummy"
           : "tap";
}

static struct dpif *
create_dpif_netdev(struct dp_netdev *dp)
{
    uint16_t netflow_id = hash_string(dp->name, 0);
    struct dpif_netdev *dpif;

    ovs_refcount_ref(&dp->ref_cnt);

    dpif = xmalloc(sizeof *dpif);
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
    dpif->dp = dp;
    dpif->last_port_seq = seq_read(dp->port_seq);

    return &dpif->dpif;
}

/* Choose an unused, non-zero port number and return it on success.
 * Return ODPP_NONE on failure. */
static odp_port_t
choose_port(struct dp_netdev *dp, const char *name)
    OVS_REQUIRES(dp->port_mutex)
{
    uint32_t port_no;

    if (dp->class != &dpif_netdev_class) {
        const char *p;
        int start_no = 0;

        /* If the port name begins with "br", start the number search at
         * 100 to make writing tests easier. */
        if (!strncmp(name, "br", 2)) {
            start_no = 100;
        }

        /* If the port name contains a number, try to assign that port number.
         * This can make writing unit tests easier because port numbers are
         * predictable. */
        for (p = name; *p != '\0'; p++) {
            if (isdigit((unsigned char) *p)) {
                port_no = start_no + strtol(p, NULL, 10);
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
                    return u32_to_odp(port_no);
                }
                break;
            }
        }
    }

    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
            return u32_to_odp(port_no);
        }
    }

    return ODPP_NONE;
}

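/* Example (an illustration, not in the original source): in a dummy datapath
 * (dp->class != &dpif_netdev_class), a port named "br17" gets start_no = 100
 * because the name begins with "br", plus strtol("17") = 17, so choose_port()
 * proposes port number 117 if it is unused.  A name with no digits falls
 * through to the second loop, which picks the lowest unused number starting
 * from 1. */
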
static int
create_dp_netdev(const char *name, const struct dpif_class *class,
                 struct dp_netdev **dpp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev *dp;
    int error;

    dp = xzalloc(sizeof *dp);
    shash_add(&dp_netdevs, name, dp);

    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
    ovs_refcount_init(&dp->ref_cnt);
    atomic_flag_clear(&dp->destroyed);

    ovs_mutex_init(&dp->flow_mutex);
    classifier_init(&dp->cls, NULL);
    cmap_init(&dp->flow_table);

    ovsthread_stats_init(&dp->stats);

    ovs_mutex_init(&dp->port_mutex);
    cmap_init(&dp->ports);
    dp->port_seq = seq_create();
    fat_rwlock_init(&dp->upcall_rwlock);

    /* Disable upcalls by default. */
    dp_netdev_disable_upcall(dp);
    dp->upcall_aux = NULL;
    dp->upcall_cb = NULL;

    cmap_init(&dp->poll_threads);
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
    ovsthread_key_create(&dp->per_pmd_key, NULL);

    /* Reserves the core NON_PMD_CORE_ID for all non-pmd threads. */
    ovs_numa_try_pin_core_specific(NON_PMD_CORE_ID);
    dp_netdev_set_nonpmd(dp);
    dp->n_dpdk_rxqs = NR_QUEUE;

    ovs_mutex_lock(&dp->port_mutex);
    error = do_add_port(dp, name, "internal", ODPP_LOCAL);
    ovs_mutex_unlock(&dp->port_mutex);
    if (error) {
        dp_netdev_free(dp);
        return error;
    }

    *dpp = dp;
    return 0;
}

static int
dpif_netdev_open(const struct dpif_class *class, const char *name,
                 bool create, struct dpif **dpifp)
{
    struct dp_netdev *dp;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, name);
    if (!dp) {
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
    } else {
        error = (dp->class != class ? EINVAL
                 : create ? EEXIST
                 : 0);
    }
    if (!error) {
        *dpifp = create_dpif_netdev(dp);
        dp->dpif = *dpifp;
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}

static void
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    /* Check that upcalls are disabled, i.e. that the rwlock is taken. */
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));

    /* Before freeing a lock we should release it. */
    fat_rwlock_unlock(&dp->upcall_rwlock);
    fat_rwlock_destroy(&dp->upcall_rwlock);
}

/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
 * through the 'dp_netdevs' shash while freeing 'dp'. */
static void
dp_netdev_free(struct dp_netdev *dp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev_port *port;
    struct dp_netdev_stats *bucket;
    int i;

    shash_find_and_delete(&dp_netdevs, dp->name);

    dp_netdev_destroy_all_pmds(dp);
    ovs_mutex_destroy(&dp->non_pmd_mutex);
    ovsthread_key_delete(dp->per_pmd_key);

    dp_netdev_flow_flush(dp);
    ovs_mutex_lock(&dp->port_mutex);
    CMAP_FOR_EACH (port, node, &dp->ports) {
        do_del_port(dp, port);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &dp->stats) {
        ovs_mutex_destroy(&bucket->mutex);
        free_cacheline(bucket);
    }
    ovsthread_stats_destroy(&dp->stats);

    classifier_destroy(&dp->cls);
    cmap_destroy(&dp->flow_table);
    ovs_mutex_destroy(&dp->flow_mutex);
    seq_destroy(dp->port_seq);
    cmap_destroy(&dp->ports);

    /* Upcalls must be disabled at this point. */
    dp_netdev_destroy_upcall_lock(dp);

    free(dp->pmd_cmask);
    free(CONST_CAST(char *, dp->name));
    free(dp);
}

static void
dp_netdev_unref(struct dp_netdev *dp)
{
    if (dp) {
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
        ovs_mutex_lock(&dp_netdev_mutex);
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
            dp_netdev_free(dp);
        }
        ovs_mutex_unlock(&dp_netdev_mutex);
    }
}

static void
dpif_netdev_close(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    dp_netdev_unref(dp);
    free(dpif);
}

static int
dpif_netdev_destroy(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    if (!atomic_flag_test_and_set(&dp->destroyed)) {
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
            OVS_NOT_REACHED();
        }
    }

    return 0;
}

static int
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_stats *bucket;
    size_t i;

    stats->n_flows = cmap_count(&dp->flow_table);

    stats->n_hit = stats->n_missed = stats->n_lost = 0;
    OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &dp->stats) {
        ovs_mutex_lock(&bucket->mutex);
        stats->n_hit += bucket->n[DP_STAT_HIT];
        stats->n_missed += bucket->n[DP_STAT_MISS];
        stats->n_lost += bucket->n[DP_STAT_LOST];
        ovs_mutex_unlock(&bucket->mutex);
    }
    stats->n_masks = UINT32_MAX;
    stats->n_mask_hit = UINT64_MAX;

    return 0;
}

static void
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
{
    int old_seq;

    atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
}

/* Causes all pmd threads to reload their tx/rx devices.
 * Must be called after adding/removing ports. */
static void
dp_netdev_reload_pmds(struct dp_netdev *dp)
{
    struct dp_netdev_pmd_thread *pmd;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        dp_netdev_reload_pmd__(pmd);
    }
}

static uint32_t
hash_port_no(odp_port_t port_no)
{
    return hash_int(odp_to_u32(port_no), 0);
}

static int
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
            odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex)
{
    struct netdev_saved_flags *sf;
    struct dp_netdev_port *port;
    struct netdev *netdev;
    enum netdev_flags flags;
    const char *open_type;
    int error;
    int i;

    /* XXX reject devices already in some dp_netdev. */

    /* Open and validate network device. */
    open_type = dpif_netdev_port_open_type(dp->class, type);
    error = netdev_open(devname, open_type, &netdev);
    if (error) {
        return error;
    }
    /* XXX reject non-Ethernet devices */

    netdev_get_flags(netdev, &flags);
    if (flags & NETDEV_LOOPBACK) {
        VLOG_ERR("%s: cannot add a loopback device", devname);
        netdev_close(netdev);
        return EINVAL;
    }

    if (netdev_is_pmd(netdev)) {
        int n_cores = ovs_numa_get_n_cores();

        if (n_cores == OVS_CORE_UNSPEC) {
            VLOG_ERR("%s, cannot get cpu core info", devname);
            return ENOENT;
        }
        /* There can only be ovs_numa_get_n_cores() pmd threads, so create a
         * txq for each. */
        error = netdev_set_multiq(netdev, n_cores, dp->n_dpdk_rxqs);
        if (error) {
            VLOG_ERR("%s, cannot set multiq", devname);
            return error;
        }
    }
    port = xzalloc(sizeof *port);
    port->port_no = port_no;
    port->netdev = netdev;
    port->rxq = xmalloc(sizeof *port->rxq * netdev_n_rxq(netdev));
    port->type = xstrdup(type);
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
        error = netdev_rxq_open(netdev, &port->rxq[i], i);
        if (error
            && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
            VLOG_ERR("%s: cannot receive packets on this network device (%s)",
                     devname, ovs_strerror(error));
            netdev_close(netdev);
            free(port->type);
            free(port->rxq);
            free(port);
            return error;
        }
    }

    error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
    if (error) {
        for (i = 0; i < netdev_n_rxq(netdev); i++) {
            netdev_rxq_close(port->rxq[i]);
        }
        netdev_close(netdev);
        free(port->type);
        free(port->rxq);
        free(port);
        return error;
    }
    port->sf = sf;

    if (netdev_is_pmd(netdev)) {
        dp_netdev_set_pmds_on_numa(dp, netdev_get_numa_id(netdev));
        dp_netdev_reload_pmds(dp);
    }
    ovs_refcount_init(&port->ref_cnt);

    cmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
    seq_change(dp->port_seq);

    return 0;
}

static int
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
                     odp_port_t *port_nop)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
    const char *dpif_port;
    odp_port_t port_no;
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
    if (*port_nop != ODPP_NONE) {
        port_no = *port_nop;
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
    } else {
        port_no = choose_port(dp, dpif_port);
        error = port_no == ODPP_NONE ? EFBIG : 0;
    }
    if (!error) {
        *port_nop = port_no;
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static int
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    if (port_no == ODPP_LOCAL) {
        error = EINVAL;
    } else {
        struct dp_netdev_port *port;

        error = get_port_by_number(dp, port_no, &port);
        if (!error) {
            do_del_port(dp, port);
        }
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static bool
is_valid_port_number(odp_port_t port_no)
{
    return port_no != ODPP_NONE;
}

static struct dp_netdev_port *
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
{
    struct dp_netdev_port *port;

    CMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
        if (port->port_no == port_no) {
            return port;
        }
    }
    return NULL;
}

static int
get_port_by_number(struct dp_netdev *dp,
                   odp_port_t port_no, struct dp_netdev_port **portp)
{
    if (!is_valid_port_number(port_no)) {
        *portp = NULL;
        return EINVAL;
    } else {
        *portp = dp_netdev_lookup_port(dp, port_no);
        return *portp ? 0 : ENOENT;
    }
}

static void
port_ref(struct dp_netdev_port *port)
{
    if (port) {
        ovs_refcount_ref(&port->ref_cnt);
    }
}

static bool
port_try_ref(struct dp_netdev_port *port)
{
    if (port) {
        return ovs_refcount_try_ref_rcu(&port->ref_cnt);
    }

    return false;
}

static void
port_destroy__(struct dp_netdev_port *port)
{
    int n_rxq = netdev_n_rxq(port->netdev);
    int i;

    netdev_close(port->netdev);
    netdev_restore_flags(port->sf);

    for (i = 0; i < n_rxq; i++) {
        netdev_rxq_close(port->rxq[i]);
    }
    free(port->rxq);
    free(port->type);
    free(port);
}

static void
port_unref(struct dp_netdev_port *port)
{
    if (port && ovs_refcount_unref_relaxed(&port->ref_cnt) == 1) {
        ovsrcu_postpone(port_destroy__, port);
    }
}

static int
get_port_by_name(struct dp_netdev *dp,
                 const char *devname, struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex)
{
    struct dp_netdev_port *port;

    CMAP_FOR_EACH (port, node, &dp->ports) {
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
            *portp = port;
            return 0;
        }
    }
    return ENOENT;
}

static int
get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
{
    struct dp_netdev_pmd_thread *pmd;
    int n_pmds = 0;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (pmd->numa_id == numa_id) {
            n_pmds++;
        }
    }

    return n_pmds;
}

/* Returns 'true' if there is a port with pmd netdev and the netdev is on
 * numa node 'numa_id'. */
static bool
has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
{
    struct dp_netdev_port *port;

    CMAP_FOR_EACH (port, node, &dp->ports) {
        if (netdev_is_pmd(port->netdev)
            && netdev_get_numa_id(port->netdev) == numa_id) {
            return true;
        }
    }

    return false;
}


static void
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
    OVS_REQUIRES(dp->port_mutex)
{
    cmap_remove(&dp->ports, &port->node, hash_odp_port(port->port_no));
    seq_change(dp->port_seq);
    if (netdev_is_pmd(port->netdev)) {
        int numa_id = netdev_get_numa_id(port->netdev);

        /* If there is no pmd netdev left on the numa node, delete the pmd
         * threads for that node; otherwise, just reload the queues. */
        if (!has_pmd_port_for_numa(dp, numa_id)) {
            dp_netdev_del_pmds_on_numa(dp, numa_id);
        }
        dp_netdev_reload_pmds(dp);
    }

    port_unref(port);
}

static void
answer_port_query(const struct dp_netdev_port *port,
                  struct dpif_port *dpif_port)
{
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
    dpif_port->type = xstrdup(port->type);
    dpif_port->port_no = port->port_no;
}

static int
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
                                 struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    error = get_port_by_number(dp, port_no, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }

    return error;
}

static int
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
                               struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_mutex_lock(&dp->port_mutex);
    error = get_port_by_name(dp, devname, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    return error;
}

static void
dp_netdev_flow_free(struct dp_netdev_flow *flow)
{
    struct dp_netdev_flow_stats *bucket;
    size_t i;

    OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &flow->stats) {
        ovs_mutex_destroy(&bucket->mutex);
        free_cacheline(bucket);
    }
    ovsthread_stats_destroy(&flow->stats);

    cls_rule_destroy(CONST_CAST(struct cls_rule *, &flow->cr));
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
    free(flow);
}

static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
{
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
        ovsrcu_postpone(dp_netdev_flow_free, flow);
    }
}

static void
dp_netdev_remove_flow(struct dp_netdev *dp, struct dp_netdev_flow *flow)
    OVS_REQUIRES(dp->flow_mutex)
{
    struct cls_rule *cr = CONST_CAST(struct cls_rule *, &flow->cr);
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);

    classifier_remove(&dp->cls, cr);
    cmap_remove(&dp->flow_table, node, flow_hash(&flow->flow, 0));
    flow->dead = true;

    dp_netdev_flow_unref(flow);
}

static void
dp_netdev_flow_flush(struct dp_netdev *dp)
{
    struct dp_netdev_flow *netdev_flow;

    ovs_mutex_lock(&dp->flow_mutex);
    CMAP_FOR_EACH (netdev_flow, node, &dp->flow_table) {
        dp_netdev_remove_flow(dp, netdev_flow);
    }
    ovs_mutex_unlock(&dp->flow_mutex);
}

static int
dpif_netdev_flow_flush(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    dp_netdev_flow_flush(dp);
    return 0;
}

struct dp_netdev_port_state {
    struct cmap_position position;
    char *name;
};

static int
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
{
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
    return 0;
}

static int
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
                           struct dpif_port *dpif_port)
{
    struct dp_netdev_port_state *state = state_;
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct cmap_node *node;
    int retval;

    node = cmap_next_position(&dp->ports, &state->position);
    if (node) {
        struct dp_netdev_port *port;

        port = CONTAINER_OF(node, struct dp_netdev_port, node);

        free(state->name);
        state->name = xstrdup(netdev_get_name(port->netdev));
        dpif_port->name = state->name;
        dpif_port->type = port->type;
        dpif_port->port_no = port->port_no;

        retval = 0;
    } else {
        retval = EOF;
    }

    return retval;
}

static int
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
{
    struct dp_netdev_port_state *state = state_;
    free(state->name);
    free(state);
    return 0;
}

static int
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
    uint64_t new_port_seq;
    int error;

    new_port_seq = seq_read(dpif->dp->port_seq);
    if (dpif->last_port_seq != new_port_seq) {
        dpif->last_port_seq = new_port_seq;
        error = ENOBUFS;
    } else {
        error = EAGAIN;
    }

    return error;
}

static void
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);

    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
}

static struct dp_netdev_flow *
dp_netdev_flow_cast(const struct cls_rule *cr)
{
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
}

static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
{
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
}

/* netdev_flow_key utilities.
 *
 * netdev_flow_key is basically a miniflow.  We use these functions
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
 *
 *    - Since we are dealing exclusively with miniflows created by
 *      miniflow_extract(), if the map is different the miniflow is different.
 *      Therefore we can be faster by comparing the map and the miniflow in a
 *      single memcmp().
 *    - netdev_flow_key's miniflow always has inline values.
 *    - These functions can be inlined by the compiler.
 *
 * The following assertions make sure that what we're doing with miniflow is
 * safe.
 */
BUILD_ASSERT_DECL(offsetof(struct miniflow, inline_values)
                  == sizeof(uint64_t));
BUILD_ASSERT_DECL(offsetof(struct netdev_flow_key, flow) == 0);

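/* Illustration (not part of the original source): given the asserts above,
 * a netdev_flow_key begins with the 64-bit miniflow map and is immediately
 * followed by the inline values, so for two keys of the same length a single
 *
 *     memcmp(a, b, netdev_flow_key_size(count_1bits(map)))
 *
 * checks both "same set of fields" (the map) and "same field values" (the
 * inline data) in one pass. */
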
static inline struct netdev_flow_key *
miniflow_to_netdev_flow_key(const struct miniflow *mf)
{
    return (struct netdev_flow_key *) CONST_CAST(struct miniflow *, mf);
}

/* Given the number of bits set in the miniflow map, returns the size of the
 * netdev_flow key. */
static inline uint32_t
netdev_flow_key_size(uint32_t flow_u32s)
{
    return MINIFLOW_VALUES_SIZE(flow_u32s)
           + offsetof(struct miniflow, inline_values);
}

/* Used to compare 'netdev_flow_key's (miniflows) in the exact match cache. */
static inline bool
netdev_flow_key_equal(const struct netdev_flow_key *a,
                      const struct netdev_flow_key *b,
                      uint32_t size)
{
    return !memcmp(a, b, size);
}

static inline void
netdev_flow_key_clone(struct netdev_flow_key *dst,
                      const struct netdev_flow_key *src,
                      uint32_t size)
{
    memcpy(dst, src, size);
}

static inline bool
emc_entry_alive(struct emc_entry *ce)
{
    return ce->flow && !ce->flow->dead;
}

static void
emc_clear_entry(struct emc_entry *ce)
{
    if (ce->flow) {
        dp_netdev_flow_unref(ce->flow);
        ce->flow = NULL;
    }
}

static inline void
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
                 const struct netdev_flow_key *mf, uint32_t hash)
{
    if (ce->flow != flow) {
        if (ce->flow) {
            dp_netdev_flow_unref(ce->flow);
        }

        if (dp_netdev_flow_ref(flow)) {
            ce->flow = flow;
        } else {
            ce->flow = NULL;
        }
    }
    if (mf) {
        uint32_t mf_len = netdev_flow_key_size(count_1bits(mf->flow.map));

        netdev_flow_key_clone(&ce->mf, mf, mf_len);
        ce->hash = hash;
        ce->mf_len = mf_len;
    }
}

static inline void
emc_insert(struct emc_cache *cache, const struct miniflow *mf, uint32_t hash,
           struct dp_netdev_flow *flow)
{
    struct emc_entry *to_be_replaced = NULL;
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, hash) {
        if (current_entry->hash == hash
            && netdev_flow_key_equal(&current_entry->mf,
                                     miniflow_to_netdev_flow_key(mf),
                                     current_entry->mf_len)) {

            /* We found the entry with the 'mf' miniflow. */
            emc_change_entry(current_entry, flow, NULL, 0);
            return;
        }

        /* Replacement policy: put the flow in an empty (not alive) entry,
         * or failing that, in the candidate entry with the lowest hash. */
        if (!to_be_replaced
            || (emc_entry_alive(to_be_replaced)
                && !emc_entry_alive(current_entry))
            || current_entry->hash < to_be_replaced->hash) {
            to_be_replaced = current_entry;
        }
    }
    /* We didn't find the miniflow in the cache.
     * The 'to_be_replaced' entry is where the new flow will be stored. */

    emc_change_entry(to_be_replaced, flow, miniflow_to_netdev_flow_key(mf),
                     hash);
}

static inline struct dp_netdev_flow *
emc_lookup(struct emc_cache *cache, const struct miniflow *mf, uint32_t hash)
{
    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, hash) {
        if (current_entry->hash == hash && emc_entry_alive(current_entry)
            && netdev_flow_key_equal(&current_entry->mf,
                                     miniflow_to_netdev_flow_key(mf),
                                     current_entry->mf_len)) {

            /* We found the entry with the 'mf' miniflow. */
            return current_entry->flow;
        }
    }

    return NULL;
}

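/* Usage sketch (an illustration, not part of the original source): a fast
 * path would consult the EMC before falling back to the classifier, roughly:
 *
 *     struct dp_netdev_flow *flow;
 *
 *     flow = emc_lookup(&pmd->flow_cache, mf, hash);
 *     if (!flow) {
 *         flow = dp_netdev_lookup_flow(dp, mf);
 *         if (flow) {
 *             emc_insert(&pmd->flow_cache, mf, hash, flow);
 *         }
 *     }
 *
 * where 'mf' is the miniflow extracted from the packet and 'hash' is its
 * 32-bit hash (e.g. the RSS hash).  The real wiring lives in the input path
 * further down in this file. */
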
static struct dp_netdev_flow *
dp_netdev_lookup_flow(const struct dp_netdev *dp, const struct miniflow *key)
{
    struct dp_netdev_flow *netdev_flow;
    struct cls_rule *rule;

    classifier_lookup_miniflow_batch(&dp->cls, &key, &rule, 1);
    netdev_flow = dp_netdev_flow_cast(rule);

    return netdev_flow;
}

static struct dp_netdev_flow *
dp_netdev_find_flow(const struct dp_netdev *dp, const struct flow *flow)
{
    struct dp_netdev_flow *netdev_flow;

    CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, flow_hash(flow, 0),
                             &dp->flow_table) {
        if (flow_equal(&netdev_flow->flow, flow)) {
            return netdev_flow;
        }
    }

    return NULL;
}

static void
get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow,
                    struct dpif_flow_stats *stats)
{
    struct dp_netdev_flow_stats *bucket;
    size_t i;

    memset(stats, 0, sizeof *stats);
    OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &netdev_flow->stats) {
        ovs_mutex_lock(&bucket->mutex);
        stats->n_packets += bucket->packet_count;
        stats->n_bytes += bucket->byte_count;
        stats->used = MAX(stats->used, bucket->used);
        stats->tcp_flags |= bucket->tcp_flags;
        ovs_mutex_unlock(&bucket->mutex);
    }
}

static void
dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
                            struct ofpbuf *buffer, struct dpif_flow *flow)
{
    struct flow_wildcards wc;
    struct dp_netdev_actions *actions;

    minimask_expand(&netdev_flow->cr.match.mask, &wc);
    odp_flow_key_from_mask(buffer, &wc.masks, &netdev_flow->flow,
                           odp_to_u32(wc.masks.in_port.odp_port),
                           SIZE_MAX, true);
    flow->mask = ofpbuf_data(buffer);
    flow->mask_len = ofpbuf_size(buffer);

    actions = dp_netdev_flow_get_actions(netdev_flow);
    flow->actions = actions->actions;
    flow->actions_len = actions->size;

    get_dpif_flow_stats(netdev_flow, &flow->stats);
}

static int
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
                              const struct nlattr *mask_key,
                              uint32_t mask_key_len, const struct flow *flow,
                              struct flow *mask)
{
    if (mask_key_len) {
        enum odp_key_fitness fitness;

        fitness = odp_flow_key_to_mask(mask_key, mask_key_len, mask, flow);
        if (fitness) {
            /* This should not happen: it indicates that
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
             * disagree on the acceptable form of a mask.  Log the problem
             * as an error, with enough details to enable debugging. */
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            if (!VLOG_DROP_ERR(&rl)) {
                struct ds s;

                ds_init(&s);
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
                                true);
                VLOG_ERR("internal error parsing flow mask %s (%s)",
                         ds_cstr(&s), odp_key_fitness_to_string(fitness));
                ds_destroy(&s);
            }

            return EINVAL;
        }
    } else {
        enum mf_field_id id;
        /* No mask key, unwildcard everything except fields whose
         * prerequisites are not met. */
        memset(mask, 0x0, sizeof *mask);

        for (id = 0; id < MFF_N_IDS; ++id) {
            /* Skip registers and metadata. */
            if (!(id >= MFF_REG0 && id < MFF_REG0 + FLOW_N_REGS)
                && id != MFF_METADATA) {
                const struct mf_field *mf = mf_from_id(id);
                if (mf_are_prereqs_ok(mf, flow)) {
                    mf_mask_field(mf, mask);
                }
            }
        }
    }

    /* Force unwildcard the in_port.
     *
     * We need to do this even in the case where we unwildcard "everything"
     * above because "everything" only includes the 16-bit OpenFlow port number
     * mask->in_port.ofp_port, which only covers half of the 32-bit datapath
     * port number mask->in_port.odp_port. */
    mask->in_port.odp_port = u32_to_odp(UINT32_MAX);

    return 0;
}

static int
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
                              struct flow *flow)
{
    odp_port_t in_port;

    if (odp_flow_key_to_flow(key, key_len, flow)) {
        /* This should not happen: it indicates that odp_flow_key_from_flow()
         * and odp_flow_key_to_flow() disagree on the acceptable form of a
         * flow.  Log the problem as an error, with enough details to enable
         * debugging. */
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        if (!VLOG_DROP_ERR(&rl)) {
            struct ds s;

            ds_init(&s);
            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
            ds_destroy(&s);
        }

        return EINVAL;
    }

    in_port = flow->in_port.odp_port;
    if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
        return EINVAL;
    }

    return 0;
}

static int
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct flow key;
    int error;

    error = dpif_netdev_flow_from_nlattrs(get->key, get->key_len, &key);
    if (error) {
        return error;
    }

    netdev_flow = dp_netdev_find_flow(dp, &key);

    if (netdev_flow) {
        dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->flow);
    } else {
        error = ENOENT;
    }

    return error;
}

static int
dp_netdev_flow_add(struct dp_netdev *dp, struct match *match,
                   const struct nlattr *actions, size_t actions_len)
    OVS_REQUIRES(dp->flow_mutex)
{
    struct dp_netdev_flow *netdev_flow;

    netdev_flow = xzalloc(sizeof *netdev_flow);
    *CONST_CAST(struct flow *, &netdev_flow->flow) = match->flow;

    ovs_refcount_init(&netdev_flow->ref_cnt);

    ovsthread_stats_init(&netdev_flow->stats);

    ovsrcu_set(&netdev_flow->actions,
               dp_netdev_actions_create(actions, actions_len));

    cls_rule_init(CONST_CAST(struct cls_rule *, &netdev_flow->cr),
                  match, NETDEV_RULE_PRIORITY);
    cmap_insert(&dp->flow_table,
                CONST_CAST(struct cmap_node *, &netdev_flow->node),
                flow_hash(&match->flow, 0));
    classifier_insert(&dp->cls,
                      CONST_CAST(struct cls_rule *, &netdev_flow->cr));

    if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
        struct ds ds = DS_EMPTY_INITIALIZER;

        ds_put_cstr(&ds, "flow_add: ");
        match_format(match, &ds, OFP_DEFAULT_PRIORITY);
        ds_put_cstr(&ds, ", actions:");
        format_odp_actions(&ds, actions, actions_len);

        VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));

        ds_destroy(&ds);
    }

    return 0;
}

static void
clear_stats(struct dp_netdev_flow *netdev_flow)
{
    struct dp_netdev_flow_stats *bucket;
    size_t i;

    OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &netdev_flow->stats) {
        ovs_mutex_lock(&bucket->mutex);
        bucket->used = 0;
        bucket->packet_count = 0;
        bucket->byte_count = 0;
        bucket->tcp_flags = 0;
        ovs_mutex_unlock(&bucket->mutex);
    }
}

static int
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct miniflow miniflow;
    struct match match;
    int error;

    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
    if (error) {
        return error;
    }
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
                                          put->mask, put->mask_len,
                                          &match.flow, &match.wc.masks);
    if (error) {
        return error;
    }
    miniflow_init(&miniflow, &match.flow);

    ovs_mutex_lock(&dp->flow_mutex);
    netdev_flow = dp_netdev_lookup_flow(dp, &miniflow);
    if (!netdev_flow) {
        if (put->flags & DPIF_FP_CREATE) {
            if (cmap_count(&dp->flow_table) < MAX_FLOWS) {
                if (put->stats) {
                    memset(put->stats, 0, sizeof *put->stats);
                }
                error = dp_netdev_flow_add(dp, &match, put->actions,
                                           put->actions_len);
            } else {
                error = EFBIG;
            }
        } else {
            error = ENOENT;
        }
    } else {
        if (put->flags & DPIF_FP_MODIFY
            && flow_equal(&match.flow, &netdev_flow->flow)) {
            struct dp_netdev_actions *new_actions;
            struct dp_netdev_actions *old_actions;

            new_actions = dp_netdev_actions_create(put->actions,
                                                   put->actions_len);

            old_actions = dp_netdev_flow_get_actions(netdev_flow);
            ovsrcu_set(&netdev_flow->actions, new_actions);

            if (put->stats) {
                get_dpif_flow_stats(netdev_flow, put->stats);
            }
            if (put->flags & DPIF_FP_ZERO_STATS) {
                clear_stats(netdev_flow);
            }

            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
        } else if (put->flags & DPIF_FP_CREATE) {
            error = EEXIST;
        } else {
            /* Overlapping flow. */
            error = EINVAL;
        }
    }
    ovs_mutex_unlock(&dp->flow_mutex);
    miniflow_destroy(&miniflow);

    return error;
}

static int
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct flow key;
    int error;

    error = dpif_netdev_flow_from_nlattrs(del->key, del->key_len, &key);
    if (error) {
        return error;
    }

    ovs_mutex_lock(&dp->flow_mutex);
    netdev_flow = dp_netdev_find_flow(dp, &key);
    if (netdev_flow) {
        if (del->stats) {
            get_dpif_flow_stats(netdev_flow, del->stats);
        }
        dp_netdev_remove_flow(dp, netdev_flow);
    } else {
        error = ENOENT;
    }
    ovs_mutex_unlock(&dp->flow_mutex);

    return error;
}

struct dpif_netdev_flow_dump {
    struct dpif_flow_dump up;
    struct cmap_position pos;
    int status;
    struct ovs_mutex mutex;
};

static struct dpif_netdev_flow_dump *
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
{
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
}

static struct dpif_flow_dump *
dpif_netdev_flow_dump_create(const struct dpif *dpif_)
{
    struct dpif_netdev_flow_dump *dump;

    dump = xmalloc(sizeof *dump);
    dpif_flow_dump_init(&dump->up, dpif_);
    memset(&dump->pos, 0, sizeof dump->pos);
    dump->status = 0;
    ovs_mutex_init(&dump->mutex);

    return &dump->up;
}

static int
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
{
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);

    ovs_mutex_destroy(&dump->mutex);
    free(dump);
    return 0;
}

struct dpif_netdev_flow_dump_thread {
    struct dpif_flow_dump_thread up;
    struct dpif_netdev_flow_dump *dump;
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
};

static struct dpif_netdev_flow_dump_thread *
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
{
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
}

static struct dpif_flow_dump_thread *
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
{
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
    struct dpif_netdev_flow_dump_thread *thread;

    thread = xmalloc(sizeof *thread);
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
    thread->dump = dump;
    return &thread->up;
}

static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    struct dpif_netdev_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_cast(thread_);

    free(thread);
}

static int
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
                           struct dpif_flow *flows, int max_flows)
{
    struct dpif_netdev_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_cast(thread_);
    struct dpif_netdev_flow_dump *dump = thread->dump;
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
    int n_flows = 0;
    int i;

    ovs_mutex_lock(&dump->mutex);
    if (!dump->status) {
        for (n_flows = 0; n_flows < MIN(max_flows, FLOW_DUMP_MAX_BATCH);
             n_flows++) {
            struct cmap_node *node;

            node = cmap_next_position(&dp->flow_table, &dump->pos);
8bb113da
RW
1807 if (!node) {
1808 dump->status = EOF;
1809 break;
1810 }
1811 netdev_flows[n_flows] = CONTAINER_OF(node, struct dp_netdev_flow,
1812 node);
d2ad7ef1 1813 }
8a4e3a85 1814 }
ac64794a 1815 ovs_mutex_unlock(&dump->mutex);
ac64794a 1816
8bb113da
RW
1817 for (i = 0; i < n_flows; i++) {
1818 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
1819 struct odputil_keybuf *keybuf = &thread->keybuf[i];
1820 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
1821 struct dpif_flow *f = &flows[i];
1822 struct dp_netdev_actions *dp_actions;
1823 struct flow_wildcards wc;
1824 struct ofpbuf buf;
1825
1826 minimask_expand(&netdev_flow->cr.match.mask, &wc);
1827
1828 /* Key. */
1829 ofpbuf_use_stack(&buf, keybuf, sizeof *keybuf);
1830 odp_flow_key_from_flow(&buf, &netdev_flow->flow, &wc.masks,
1831 netdev_flow->flow.in_port.odp_port, true);
1832 f->key = ofpbuf_data(&buf);
1833 f->key_len = ofpbuf_size(&buf);
1834
1835 /* Mask. */
1836 ofpbuf_use_stack(&buf, maskbuf, sizeof *maskbuf);
1837 odp_flow_key_from_mask(&buf, &wc.masks, &netdev_flow->flow,
1838 odp_to_u32(wc.masks.in_port.odp_port),
1839 SIZE_MAX, true);
1840 f->mask = ofpbuf_data(&buf);
1841 f->mask_len = ofpbuf_size(&buf);
1842
1843 /* Actions. */
1844 dp_actions = dp_netdev_flow_get_actions(netdev_flow);
1845 f->actions = dp_actions->actions;
1846 f->actions_len = dp_actions->size;
1847
1848 /* Stats. */
1849 get_dpif_flow_stats(netdev_flow, &f->stats);
1850 }
feebdea2 1851
8bb113da 1852 return n_flows;
72865317
BP
1853}
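/* Illustrative sketch, not part of the original file: the intended calling
 * pattern for the dump interface above.  A dumper thread calls
 * dpif_netdev_flow_dump_next() until it returns 0; 'dump->status' latches
 * EOF internally so that concurrent dumper threads stop consistently.
 * 'example_dump_all_flows' is a hypothetical helper. */
static void
example_dump_all_flows(struct dpif *dpif)
{
    struct dpif_flow_dump *dump = dpif_netdev_flow_dump_create(dpif);
    struct dpif_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_create(dump);
    struct dpif_flow flows[FLOW_DUMP_MAX_BATCH];
    uint64_t total_packets = 0;
    int n;

    while ((n = dpif_netdev_flow_dump_next(thread, flows,
                                           FLOW_DUMP_MAX_BATCH)) > 0) {
        int i;

        for (i = 0; i < n; i++) {
            /* flows[i].key and flows[i].mask point into this thread's
             * keybuf/maskbuf and stay valid only until the next call. */
            total_packets += flows[i].stats.n_packets;
        }
    }

    dpif_netdev_flow_dump_thread_destroy(thread);
    dpif_netdev_flow_dump_destroy(dump);

    VLOG_DBG("dumped flows totalling %"PRIu64" packets", total_packets);
}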
1854
1855static int
758c456d 1856dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
65f13b50 1857 OVS_NO_THREAD_SAFETY_ANALYSIS
72865317
BP
1858{
1859 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 1860 struct dp_netdev_pmd_thread *pmd;
8cbf4f47 1861 struct dpif_packet packet, *pp;
758c456d 1862 struct pkt_metadata *md = &execute->md;
72865317 1863
1f317cb5
PS
1864 if (ofpbuf_size(execute->packet) < ETH_HEADER_LEN ||
1865 ofpbuf_size(execute->packet) > UINT16_MAX) {
72865317
BP
1866 return EINVAL;
1867 }
1868
91088554 1869 packet.ofpbuf = *execute->packet;
8cbf4f47 1870 pp = &packet;
91088554 1871
65f13b50
AW
1872 /* Tries to find the 'pmd'. If NULL is returned, that means
1873 * the current thread is a non-pmd thread and should use
1874 * dp_netdev_get_nonpmd(). */
1875 pmd = ovsthread_getspecific(dp->per_pmd_key);
1876 if (!pmd) {
1877 pmd = dp_netdev_get_nonpmd(dp);
1878 }
1879
1880 /* If the current thread is a non-pmd thread, acquires
1881 * the 'non_pmd_mutex'. */
1882 if (pmd->core_id == NON_PMD_CORE_ID) {
1883 ovs_mutex_lock(&dp->non_pmd_mutex);
1884 }
1885 dp_netdev_execute_actions(pmd, &pp, 1, false, md, execute->actions,
9bbf1c3d 1886 execute->actions_len);
65f13b50
AW
1887 if (pmd->core_id == NON_PMD_CORE_ID) {
1888 ovs_mutex_unlock(&dp->non_pmd_mutex);
1889 }
8a4e3a85 1890
91088554
DDP
1891 /* Even though may_steal is set to false, some actions could modify or
1892 * reallocate the ofpbuf memory. We need to pass those changes to the
1893 * caller. */
1894 *execute->packet = packet.ofpbuf;
1895
758c456d 1896 return 0;
72865317
BP
1897}
1898
1a0c894a
BP
1899static void
1900dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
1901{
1902 size_t i;
1903
1904 for (i = 0; i < n_ops; i++) {
1905 struct dpif_op *op = ops[i];
1906
1907 switch (op->type) {
1908 case DPIF_OP_FLOW_PUT:
1909 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
1910 break;
1911
1912 case DPIF_OP_FLOW_DEL:
1913 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
1914 break;
1915
1916 case DPIF_OP_EXECUTE:
1917 op->error = dpif_netdev_execute(dpif, &op->u.execute);
1918 break;
6fe09f8c
JS
1919
1920 case DPIF_OP_FLOW_GET:
1921 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
1922 break;
1a0c894a
BP
1923 }
1924 }
1925}
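/* Illustrative sketch, not part of the original file: dpif_netdev_operate()
 * records each result in op->error, so a caller can batch heterogeneous
 * operations and inspect failures individually.  'example_batch_ops' is a
 * hypothetical helper. */
static void
example_batch_ops(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
{
    size_t i;

    dpif_netdev_operate(dpif, ops, n_ops);
    for (i = 0; i < n_ops; i++) {
        if (ops[i]->error) {
            VLOG_WARN("op %"PRIuSIZE" failed: %s",
                      i, ovs_strerror(ops[i]->error));
        }
    }
}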
1926
f2eee189
AW
1927/* Returns true if the configuration for rx queues or cpu mask
1928 * has changed. */
1929static bool
1930pmd_config_changed(const struct dp_netdev *dp, size_t rxqs, const char *cmask)
1931{
1932 if (dp->n_dpdk_rxqs != rxqs) {
1933 return true;
1934 } else {
1935 if (dp->pmd_cmask != NULL && cmask != NULL) {
1936 return strcmp(dp->pmd_cmask, cmask);
1937 } else {
1938 return (dp->pmd_cmask != NULL || cmask != NULL);
1939 }
1940 }
1941}
1942
1943/* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
1944static int
1945dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs, const char *cmask)
1946{
1947 struct dp_netdev *dp = get_dp_netdev(dpif);
1948
1949 if (pmd_config_changed(dp, n_rxqs, cmask)) {
1950 struct dp_netdev_port *port;
1951
1952 dp_netdev_destroy_all_pmds(dp);
1953
1954 CMAP_FOR_EACH (port, node, &dp->ports) {
1955 if (netdev_is_pmd(port->netdev)) {
1956 int i, err;
1957
1958 /* Closes the existing 'rxq's. */
1959 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
1960 netdev_rxq_close(port->rxq[i]);
1961 port->rxq[i] = NULL;
1962 }
1963
1964 /* Sets the new rx queue config. */
1965 err = netdev_set_multiq(port->netdev, ovs_numa_get_n_cores(),
1966 n_rxqs);
1967 if (err) {
1968 VLOG_ERR("Failed to set dpdk interface %s rx_queue to:"
1969 " %u", netdev_get_name(port->netdev),
1970 n_rxqs);
1971 return err;
1972 }
1973
1974 /* If the set_multiq() above succeeds, reopens the 'rxq's. */
1975 port->rxq = xrealloc(port->rxq, sizeof *port->rxq
1976 * netdev_n_rxq(port->netdev));
1977 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
1978 netdev_rxq_open(port->netdev, &port->rxq[i], i);
1979 }
1980 }
1981 }
1982 dp->n_dpdk_rxqs = n_rxqs;
1983
1984 /* Reconfigures the cpu mask. */
1985 ovs_numa_set_cpu_mask(cmask);
1986 free(dp->pmd_cmask);
1987 dp->pmd_cmask = cmask ? xstrdup(cmask) : NULL;
1988
1989 /* Restores the non-pmd thread. */
1990 dp_netdev_set_nonpmd(dp);
1991 /* Restores all pmd threads. */
1992 dp_netdev_reset_pmd_threads(dp);
1993 }
1994
1995 return 0;
1996}
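/* Illustrative note, not part of the original file: in builds of this era
 * the callback above is typically driven from the database, e.g. (values are
 * examples, and the option names are an assumption):
 *
 *   ovs-vsctl set Open_vSwitch . other_config:n-dpdk-rxqs=4
 *   ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x6
 *
 * Changing either value destroys every pmd thread, reopens the dpdk rx
 * queues with the new count, and recreates the threads on the affected numa
 * nodes. */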
1997
5bf93d67
EJ
1998static int
1999dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2000 uint32_t queue_id, uint32_t *priority)
2001{
2002 *priority = queue_id;
2003 return 0;
2004}
2005
72865317 2006\f
a84cb64a
BP
2007/* Creates and returns a new 'struct dp_netdev_actions', with a reference count
2008 * of 1, whose actions are a copy of the 'size' bytes of
2009 * 'actions'. */
2010struct dp_netdev_actions *
2011dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2012{
2013 struct dp_netdev_actions *netdev_actions;
2014
2015 netdev_actions = xmalloc(sizeof *netdev_actions);
a84cb64a
BP
2016 netdev_actions->actions = xmemdup(actions, size);
2017 netdev_actions->size = size;
2018
2019 return netdev_actions;
2020}
2021
a84cb64a 2022struct dp_netdev_actions *
61e7deb1 2023dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
a84cb64a 2024{
61e7deb1 2025 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
a84cb64a
BP
2026}
2027
61e7deb1
BP
2028static void
2029dp_netdev_actions_free(struct dp_netdev_actions *actions)
a84cb64a 2030{
61e7deb1
BP
2031 free(actions->actions);
2032 free(actions);
a84cb64a
BP
2033}
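/* Illustrative sketch, not part of the original file: the RCU replace
 * pattern that dpif_netdev_flow_put() uses with the two helpers above.
 * Readers obtain the actions with dp_netdev_flow_get_actions(); the writer
 * publishes the new pointer and defers the free until every reader has
 * quiesced.  'example_swap_actions' is a hypothetical helper. */
static void
example_swap_actions(struct dp_netdev_flow *netdev_flow,
                     const struct nlattr *actions, size_t size)
{
    struct dp_netdev_actions *new_acts = dp_netdev_actions_create(actions,
                                                                  size);
    struct dp_netdev_actions *old_acts
        = dp_netdev_flow_get_actions(netdev_flow);

    ovsrcu_set(&netdev_flow->actions, new_acts);
    ovsrcu_postpone(dp_netdev_actions_free, old_acts);
}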
2034\f
e4cfed38 2035
5794e276 2036static void
65f13b50 2037dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2038 struct dp_netdev_port *port,
2039 struct netdev_rxq *rxq)
e4cfed38 2040{
8cbf4f47
DDP
2041 struct dpif_packet *packets[NETDEV_MAX_RX_BATCH];
2042 int error, cnt;
e4cfed38 2043
8cbf4f47 2044 error = netdev_rxq_recv(rxq, packets, &cnt);
e4cfed38 2045 if (!error) {
3c33f0ff
JR
2046 struct pkt_metadata md = PKT_METADATA_INITIALIZER(port->port_no);
2047
2048 *recirc_depth_get() = 0;
65f13b50 2049 dp_netdev_input(pmd, packets, cnt, &md);
e4cfed38 2050 } else if (error != EAGAIN && error != EOPNOTSUPP) {
3c33f0ff 2051 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
e4cfed38
PS
2052
2053 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3c33f0ff 2054 netdev_get_name(port->netdev), ovs_strerror(error));
e4cfed38
PS
2055 }
2056}
2057
2058static void
2059dpif_netdev_run(struct dpif *dpif)
2060{
2061 struct dp_netdev_port *port;
2062 struct dp_netdev *dp = get_dp_netdev(dpif);
65f13b50 2063 struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_nonpmd(dp);
e4cfed38 2064
65f13b50 2065 ovs_mutex_lock(&dp->non_pmd_mutex);
a532e683 2066 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2067 if (!netdev_is_pmd(port->netdev)) {
2068 int i;
2069
2070 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
65f13b50 2071 dp_netdev_process_rxq_port(non_pmd, port, port->rxq[i]);
55c955bd 2072 }
e4cfed38
PS
2073 }
2074 }
65f13b50 2075 ovs_mutex_unlock(&dp->non_pmd_mutex);
e4cfed38
PS
2076}
2077
2078static void
2079dpif_netdev_wait(struct dpif *dpif)
2080{
2081 struct dp_netdev_port *port;
2082 struct dp_netdev *dp = get_dp_netdev(dpif);
2083
59e6d833 2084 ovs_mutex_lock(&dp_netdev_mutex);
a532e683 2085 CMAP_FOR_EACH (port, node, &dp->ports) {
55c955bd
PS
2086 if (!netdev_is_pmd(port->netdev)) {
2087 int i;
2088
2089 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2090 netdev_rxq_wait(port->rxq[i]);
2091 }
e4cfed38
PS
2092 }
2093 }
59e6d833 2094 ovs_mutex_unlock(&dp_netdev_mutex);
e4cfed38
PS
2095}
2096
f7791740 2097struct rxq_poll {
e4cfed38 2098 struct dp_netdev_port *port;
55c955bd 2099 struct netdev_rxq *rx;
e4cfed38
PS
2100};
2101
2102static int
65f13b50 2103pmd_load_queues(struct dp_netdev_pmd_thread *pmd,
f7791740 2104 struct rxq_poll **ppoll_list, int poll_cnt)
e4cfed38 2105{
f7791740 2106 struct rxq_poll *poll_list = *ppoll_list;
e4cfed38 2107 struct dp_netdev_port *port;
65f13b50 2108 int n_pmds_on_numa, index, i;
e4cfed38
PS
2109
2110 /* Simple scheduler for netdev rx polling. */
e4cfed38 2111 for (i = 0; i < poll_cnt; i++) {
65f13b50 2112 port_unref(poll_list[i].port);
e4cfed38
PS
2113 }
2114
2115 poll_cnt = 0;
65f13b50 2116 n_pmds_on_numa = get_n_pmd_threads_on_numa(pmd->dp, pmd->numa_id);
e4cfed38
PS
2117 index = 0;
2118
65f13b50 2119 CMAP_FOR_EACH (port, node, &pmd->dp->ports) {
a1fdee13
AW
2120 /* Calls port_try_ref() to prevent the main thread
2121 * from deleting the port. */
2122 if (port_try_ref(port)) {
65f13b50
AW
2123 if (netdev_is_pmd(port->netdev)
2124 && netdev_get_numa_id(port->netdev) == pmd->numa_id) {
a1fdee13
AW
2125 int i;
2126
2127 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
65f13b50 2128 if ((index % n_pmds_on_numa) == pmd->index) {
a1fdee13
AW
2129 poll_list = xrealloc(poll_list,
2130 sizeof *poll_list * (poll_cnt + 1));
2131
2132 port_ref(port);
2133 poll_list[poll_cnt].port = port;
2134 poll_list[poll_cnt].rx = port->rxq[i];
2135 poll_cnt++;
2136 }
2137 index++;
55c955bd 2138 }
e4cfed38 2139 }
a1fdee13
AW
2140 /* Drops the reference taken by port_try_ref(). */
2141 port_unref(port);
e4cfed38
PS
2142 }
2143 }
2144
e4cfed38
PS
2145 *ppoll_list = poll_list;
2146 return poll_cnt;
2147}
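/* Illustrative sketch, not part of the original file: the effect of the
 * round-robin policy in pmd_load_queues() above.  Global rx queue 'index'
 * is polled by the pmd thread whose 'pmd->index' equals
 * index % n_pmds_on_numa.  E.g. with 2 pmd threads and 5 queues, thread 0
 * polls queues 0, 2, 4 and thread 1 polls queues 1, 3.
 * 'example_rxq_belongs_to_pmd' is a hypothetical helper. */
static bool
example_rxq_belongs_to_pmd(int rxq_index, int pmd_index, int n_pmds_on_numa)
{
    return rxq_index % n_pmds_on_numa == pmd_index;
}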
2148
6c3eee82 2149static void *
e4cfed38 2150pmd_thread_main(void *f_)
6c3eee82 2151{
65f13b50 2152 struct dp_netdev_pmd_thread *pmd = f_;
e4cfed38 2153 unsigned int lc = 0;
f7791740 2154 struct rxq_poll *poll_list;
84067a4c 2155 unsigned int port_seq = PMD_INITIAL_SEQ;
e4cfed38
PS
2156 int poll_cnt;
2157 int i;
6c3eee82 2158
e4cfed38
PS
2159 poll_cnt = 0;
2160 poll_list = NULL;
2161
65f13b50
AW
2162 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
2163 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
2164 pmd_thread_setaffinity_cpu(pmd->core_id);
e4cfed38 2165reload:
65f13b50
AW
2166 emc_cache_init(&pmd->flow_cache);
2167 poll_cnt = pmd_load_queues(pmd, &poll_list, poll_cnt);
6c3eee82 2168
e4cfed38 2169 for (;;) {
6c3eee82
BP
2170 int i;
2171
e4cfed38 2172 for (i = 0; i < poll_cnt; i++) {
65f13b50 2173 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
e4cfed38
PS
2174 }
2175
2176 if (lc++ > 1024) {
84067a4c 2177 unsigned int seq;
6c3eee82 2178
e4cfed38 2179 lc = 0;
84067a4c
JR
2180
2181 ovsrcu_quiesce();
2182
65f13b50 2183 atomic_read_relaxed(&pmd->change_seq, &seq);
84067a4c
JR
2184 if (seq != port_seq) {
2185 port_seq = seq;
6c3eee82
BP
2186 break;
2187 }
2188 }
e4cfed38 2189 }
6c3eee82 2190
65f13b50 2191 emc_cache_uninit(&pmd->flow_cache);
9bbf1c3d 2192
65f13b50 2193 if (!latch_is_set(&pmd->exit_latch)){
e4cfed38
PS
2194 goto reload;
2195 }
6c3eee82 2196
e4cfed38
PS
2197 for (i = 0; i < poll_cnt; i++) {
2198 port_unref(poll_list[i].port);
6c3eee82 2199 }
6c3eee82 2200
e4cfed38 2201 free(poll_list);
6c3eee82
BP
2202 return NULL;
2203}
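/* Illustrative sketch, not part of the original file: the cheap
 * reconfiguration check from the loop above, factored out.  The hot path
 * reads 'change_seq' only once per 1024 iterations; the control path bumps
 * it (via dp_netdev_reload_pmd__()) to request a reload.  Hypothetical
 * helper, assuming OVS's 'atomic_uint'. */
static bool
example_needs_reload(atomic_uint *change_seq, unsigned int *last_seq)
{
    unsigned int seq;

    atomic_read_relaxed(change_seq, &seq);
    if (seq != *last_seq) {
        *last_seq = seq;
        return true;
    }
    return false;
}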
2204
6b31e073
RW
2205static void
2206dp_netdev_disable_upcall(struct dp_netdev *dp)
2207 OVS_ACQUIRES(dp->upcall_rwlock)
2208{
2209 fat_rwlock_wrlock(&dp->upcall_rwlock);
2210}
2211
2212static void
2213dpif_netdev_disable_upcall(struct dpif *dpif)
2214 OVS_NO_THREAD_SAFETY_ANALYSIS
2215{
2216 struct dp_netdev *dp = get_dp_netdev(dpif);
2217 dp_netdev_disable_upcall(dp);
2218}
2219
2220static void
2221dp_netdev_enable_upcall(struct dp_netdev *dp)
2222 OVS_RELEASES(dp->upcall_rwlock)
2223{
2224 fat_rwlock_unlock(&dp->upcall_rwlock);
2225}
2226
2227static void
2228dpif_netdev_enable_upcall(struct dpif *dpif)
2229 OVS_NO_THREAD_SAFETY_ANALYSIS
2230{
2231 struct dp_netdev *dp = get_dp_netdev(dpif);
2232 dp_netdev_enable_upcall(dp);
2233}
2234
65f13b50
AW
2235/* Returns the pointer to the dp_netdev_pmd_thread for non-pmd threads. */
2236static struct dp_netdev_pmd_thread *
2237dp_netdev_get_nonpmd(struct dp_netdev *dp)
2238{
2239 struct dp_netdev_pmd_thread *pmd;
2240 struct cmap_node *pnode;
2241
2242 pnode = cmap_find(&dp->poll_threads, hash_int(NON_PMD_CORE_ID, 0));
2243 ovs_assert(pnode);
2244 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
2245
2246 return pmd;
2247}
2248
f2eee189
AW
2249/* Creates and configures the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
2250static void
2251dp_netdev_set_nonpmd(struct dp_netdev *dp)
2252{
2253 struct dp_netdev_pmd_thread *non_pmd;
2254
2255 non_pmd = xzalloc(sizeof *non_pmd);
2256 dp_netdev_configure_pmd(non_pmd, dp, 0, NON_PMD_CORE_ID,
2257 OVS_NUMA_UNSPEC);
2258}
2259
65f13b50 2260/* Configures the 'pmd' based on the input arguments. */
6c3eee82 2261static void
65f13b50
AW
2262dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
2263 int index, int core_id, int numa_id)
2264{
2265 pmd->dp = dp;
2266 pmd->index = index;
2267 pmd->core_id = core_id;
2268 pmd->numa_id = numa_id;
2269 latch_init(&pmd->exit_latch);
2270 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
2271 /* Initializes the 'flow_cache' since there is no
2272 * actual thread created for NON_PMD_CORE_ID. */
2273 if (core_id == NON_PMD_CORE_ID) {
2274 emc_cache_init(&pmd->flow_cache);
2275 }
2276 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
2277 hash_int(core_id, 0));
2278}
2279
2280/* Stops the pmd thread, removes it from the 'dp->poll_threads'
2281 * and destroys the struct. */
2282static void
2283dp_netdev_del_pmd(struct dp_netdev_pmd_thread *pmd)
6c3eee82 2284{
65f13b50
AW
2285 /* Uninitializes the 'flow_cache' since there is
2286 * no actual thread to uninitialize it. */
2287 if (pmd->core_id == NON_PMD_CORE_ID) {
2288 emc_cache_uninit(&pmd->flow_cache);
2289 } else {
2290 latch_set(&pmd->exit_latch);
2291 dp_netdev_reload_pmd__(pmd);
2292 ovs_numa_unpin_core(pmd->core_id);
2293 xpthread_join(pmd->thread, NULL);
2294 }
2295 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
2296 latch_destroy(&pmd->exit_latch);
2297 free(pmd);
2298}
6c3eee82 2299
65f13b50
AW
2300/* Destroys all pmd threads. */
2301static void
2302dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
2303{
2304 struct dp_netdev_pmd_thread *pmd;
2305
2306 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2307 dp_netdev_del_pmd(pmd);
6c3eee82 2308 }
65f13b50 2309}
6c3eee82 2310
65f13b50
AW
2311/* Deletes all pmd threads on numa node 'numa_id'. */
2312static void
2313dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
2314{
2315 struct dp_netdev_pmd_thread *pmd;
6c3eee82 2316
65f13b50
AW
2317 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2318 if (pmd->numa_id == numa_id) {
2319 dp_netdev_del_pmd(pmd);
2320 }
6c3eee82 2321 }
65f13b50 2322}
6c3eee82 2323
65f13b50
AW
2324/* Checks the validity of numa node id 'numa_id' and starts pmd threads
2325 * for that numa node. */
2326static void
2327dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
2328{
2329 int n_pmds;
e4cfed38 2330
65f13b50
AW
2331 if (!ovs_numa_numa_id_is_valid(numa_id)) {
2332 VLOG_ERR("Cannot create pmd threads due to invalid"
2333 " numa id (%d)", numa_id);
2334 return;
2335 }
2336
2337 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
2338
2339 /* If there are already pmd threads created for the numa node
2340 * that 'numa_id' refers to, does nothing. Otherwise, creates
2341 * the pmd threads for the numa node. */
2342 if (!n_pmds) {
2343 int can_have, n_unpinned, i;
2344
2345 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
2346 if (!n_unpinned) {
2347 VLOG_ERR("Cannot create pmd threads: no unpinned "
2348 "cores left on numa node");
2349 return;
2350 }
6c3eee82 2351
f2eee189
AW
2352 /* If a cpu mask is specified, uses all unpinned cores; otherwise,
2353 * tries to create NR_PMD_THREADS pmd threads. */
2354 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
65f13b50
AW
2355 for (i = 0; i < can_have; i++) {
2356 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
2357 int core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
e4cfed38 2358
65f13b50
AW
2359 dp_netdev_configure_pmd(pmd, dp, i, core_id, numa_id);
2360 /* The threads will distribute all devices' rx-queues among
2361 * themselves. */
2362 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
2363 }
2364 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
6c3eee82
BP
2365 }
2366}
e4cfed38 2367
6c3eee82 2368\f
679ba04c
BP
2369static void *
2370dp_netdev_flow_stats_new_cb(void)
2371{
2372 struct dp_netdev_flow_stats *bucket = xzalloc_cacheline(sizeof *bucket);
2373 ovs_mutex_init(&bucket->mutex);
2374 return bucket;
2375}
2376
f2eee189
AW
2377/* Called after the pmd threads' configuration changes. Restarts the
2378 * pmd threads with the new configuration. */
2379static void
2380dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
2381{
2382 struct dp_netdev_port *port;
2383
2384 CMAP_FOR_EACH (port, node, &dp->ports) {
2385 if (netdev_is_pmd(port->netdev)) {
2386 int numa_id = netdev_get_numa_id(port->netdev);
2387
2388 dp_netdev_set_pmds_on_numa(dp, numa_id);
2389 }
2390 }
2391}
2392
72865317 2393static void
1763b4b8 2394dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow,
8cbf4f47
DDP
2395 int cnt, int size,
2396 uint16_t tcp_flags)
72865317 2397{
679ba04c
BP
2398 long long int now = time_msec();
2399 struct dp_netdev_flow_stats *bucket;
2400
2401 bucket = ovsthread_stats_bucket_get(&netdev_flow->stats,
2402 dp_netdev_flow_stats_new_cb);
2403
2404 ovs_mutex_lock(&bucket->mutex);
2405 bucket->used = MAX(now, bucket->used);
8cbf4f47
DDP
2406 bucket->packet_count += cnt;
2407 bucket->byte_count += size;
679ba04c
BP
2408 bucket->tcp_flags |= tcp_flags;
2409 ovs_mutex_unlock(&bucket->mutex);
72865317
BP
2410}
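/* Illustrative sketch, not part of the original file: the per-thread stats
 * buckets updated above avoid one contended lock, at the price of making
 * reads sum over every bucket, as get_dpif_flow_stats() does.  A reduced
 * form for the packet count alone (hypothetical helper): */
static uint64_t
example_sum_packet_count(struct dp_netdev_flow *netdev_flow)
{
    struct dp_netdev_flow_stats *bucket;
    uint64_t total = 0;
    size_t i;

    OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &netdev_flow->stats) {
        ovs_mutex_lock(&bucket->mutex);
        total += bucket->packet_count;
        ovs_mutex_unlock(&bucket->mutex);
    }
    return total;
}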
2411
51852a57
BP
2412static void *
2413dp_netdev_stats_new_cb(void)
2414{
2415 struct dp_netdev_stats *bucket = xzalloc_cacheline(sizeof *bucket);
2416 ovs_mutex_init(&bucket->mutex);
2417 return bucket;
2418}
2419
2420static void
8cbf4f47 2421dp_netdev_count_packet(struct dp_netdev *dp, enum dp_stat_type type, int cnt)
51852a57
BP
2422{
2423 struct dp_netdev_stats *bucket;
2424
2425 bucket = ovsthread_stats_bucket_get(&dp->stats, dp_netdev_stats_new_cb);
2426 ovs_mutex_lock(&bucket->mutex);
8cbf4f47 2427 bucket->n[type] += cnt;
51852a57
BP
2428 ovs_mutex_unlock(&bucket->mutex);
2429}
2430
623540e4
EJ
2431static int
2432dp_netdev_upcall(struct dp_netdev *dp, struct dpif_packet *packet_,
2433 struct flow *flow, struct flow_wildcards *wc,
2434 enum dpif_upcall_type type, const struct nlattr *userdata,
2435 struct ofpbuf *actions, struct ofpbuf *put_actions)
2436{
2437 struct ofpbuf *packet = &packet_->ofpbuf;
2438
2439 if (type == DPIF_UC_MISS) {
2440 dp_netdev_count_packet(dp, DP_STAT_MISS, 1);
2441 }
2442
2443 if (OVS_UNLIKELY(!dp->upcall_cb)) {
2444 return ENODEV;
2445 }
2446
2447 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
2448 struct ds ds = DS_EMPTY_INITIALIZER;
2449 struct ofpbuf key;
2450 char *packet_str;
2451
2452 ofpbuf_init(&key, 0);
2453 odp_flow_key_from_flow(&key, flow, &wc->masks, flow->in_port.odp_port,
2454 true);
2455
2456 packet_str = ofp_packet_to_string(ofpbuf_data(packet),
2457 ofpbuf_size(packet));
2458
2459 odp_flow_key_format(ofpbuf_data(&key), ofpbuf_size(&key), &ds);
2460
2461 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
2462 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
2463
2464 ofpbuf_uninit(&key);
2465 free(packet_str);
2466 ds_destroy(&ds);
2467 }
2468
2469 return dp->upcall_cb(packet, flow, type, userdata, actions, wc,
2470 put_actions, dp->upcall_aux);
2471}
2472
9bbf1c3d
DDP
2473static inline uint32_t
2474dpif_netdev_packet_get_dp_hash(struct dpif_packet *packet,
2475 const struct miniflow *mf)
2476{
2477 uint32_t hash;
2478
2479 hash = dpif_packet_get_dp_hash(packet);
2480 if (OVS_UNLIKELY(!hash)) {
2481 hash = miniflow_hash_5tuple(mf, 0);
2482 dpif_packet_set_dp_hash(packet, hash);
2483 }
2484 return hash;
2485}
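/* Illustrative sketch, not part of the original file: the memoization above
 * treats a stored hash of 0 as "not computed yet", so a genuine hash of 0
 * would be recomputed on every call.  The OVS_ACTION_ATTR_HASH handler later
 * in this file avoids storing 0 by remapping it to 1; a generic form of that
 * guard (hypothetical helper): */
static inline uint32_t
example_nonzero_hash(uint32_t hash)
{
    return hash ? hash : 1;     /* 0 is reserved for "not computed". */
}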
2486
567bbb2e 2487struct packet_batch {
8cbf4f47
DDP
2488 unsigned int packet_count;
2489 unsigned int byte_count;
2490 uint16_t tcp_flags;
2491
2492 struct dp_netdev_flow *flow;
2493
2494 struct dpif_packet *packets[NETDEV_MAX_RX_BATCH];
2495 struct pkt_metadata md;
2496};
2497
2498static inline void
9bbf1c3d
DDP
2499packet_batch_update(struct packet_batch *batch, struct dpif_packet *packet,
2500 const struct miniflow *mf)
8cbf4f47
DDP
2501{
2502 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
2503 batch->packets[batch->packet_count++] = packet;
2504 batch->byte_count += ofpbuf_size(&packet->ofpbuf);
2505}
2506
2507static inline void
567bbb2e 2508packet_batch_init(struct packet_batch *batch, struct dp_netdev_flow *flow,
84d6d5eb 2509 struct pkt_metadata *md)
8cbf4f47
DDP
2510{
2511 batch->flow = flow;
2512 batch->md = *md;
8cbf4f47
DDP
2513
2514 batch->packet_count = 0;
2515 batch->byte_count = 0;
2516 batch->tcp_flags = 0;
8cbf4f47
DDP
2517}
2518
2519static inline void
65f13b50
AW
2520packet_batch_execute(struct packet_batch *batch,
2521 struct dp_netdev_pmd_thread *pmd)
8cbf4f47
DDP
2522{
2523 struct dp_netdev_actions *actions;
2524 struct dp_netdev_flow *flow = batch->flow;
2525
2526 dp_netdev_flow_used(batch->flow, batch->packet_count, batch->byte_count,
2527 batch->tcp_flags);
2528
2529 actions = dp_netdev_flow_get_actions(flow);
2530
65f13b50
AW
2531 dp_netdev_execute_actions(pmd, batch->packets, batch->packet_count, true,
2532 &batch->md, actions->actions, actions->size);
8cbf4f47 2533
65f13b50 2534 dp_netdev_count_packet(pmd->dp, DP_STAT_HIT, batch->packet_count);
8cbf4f47
DDP
2535}
2536
9bbf1c3d
DDP
2537static inline bool
2538dp_netdev_queue_batches(struct dpif_packet *pkt, struct pkt_metadata *md,
2539 struct dp_netdev_flow *flow, const struct miniflow *mf,
2540 struct packet_batch *batches, size_t *n_batches,
2541 size_t max_batches)
2542{
2543 struct packet_batch *batch = NULL;
2544 int j;
2545
2546 if (OVS_UNLIKELY(!flow)) {
2547 return false;
2548 }
2549 /* XXX: This O(n^2) algorithm makes sense if we're operating under the
2550 * assumption that the number of distinct flows (and therefore the
2551 * number of distinct batches) is quite small. If this turns out not
2552 * to be the case, it may make sense to pre-sort based on the
2553 * netdev_flow pointer. That done, we could get the appropriate
2554 * batching in O(n * log(n)) instead. */
2555 for (j = *n_batches - 1; j >= 0; j--) {
2556 if (batches[j].flow == flow) {
2557 batch = &batches[j];
2558 packet_batch_update(batch, pkt, mf);
2559 return true;
2560 }
2561 }
2562 if (OVS_UNLIKELY(*n_batches >= max_batches)) {
2563 return false;
2564 }
2565
2566 batch = &batches[(*n_batches)++];
2567 packet_batch_init(batch, flow, md);
2568 packet_batch_update(batch, pkt, mf);
2569 return true;
2570}
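/* Illustrative sketch, not part of the original file: the pre-sort
 * alternative that the XXX comment above alludes to.  Sorting (packet, flow)
 * pairs by flow pointer makes packets of the same flow adjacent, so batches
 * can be emitted in a single pass, O(n log n) overall.  'struct
 * pkt_flow_pair' and 'compare_by_flow' are hypothetical. */
struct pkt_flow_pair {
    struct dpif_packet *pkt;
    struct dp_netdev_flow *flow;
};

static int
compare_by_flow(const void *a_, const void *b_)
{
    const struct pkt_flow_pair *a = a_;
    const struct pkt_flow_pair *b = b_;

    return a->flow < b->flow ? -1 : a->flow > b->flow;
}

/* Usage: qsort(pairs, cnt, sizeof pairs[0], compare_by_flow), then walk the
 * array emitting one batch per run of equal 'flow' pointers. */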
2571
2572static inline void
2573dpif_packet_swap(struct dpif_packet **a, struct dpif_packet **b)
2574{
2575 struct dpif_packet *tmp = *a;
2576 *a = *b;
2577 *b = tmp;
2578}
2579
2580/* Tries to process all 'cnt' packets in 'packets' using only the exact match
2581 * cache 'flow_cache'. If a flow is not found for a packet 'packets[i]', or if
2582 * there is no matching batch for a packet's flow, the miniflow is copied into
2583 * 'keys' and the packet pointer is moved to the beginning of 'packets'.
2584 *
2585 * Returns the number of packets that still need to be processed in 'packets'
2586 * (they have been moved to the beginning of the array).
2587 */
2588static inline size_t
65f13b50
AW
2589emc_processing(struct dp_netdev_pmd_thread *pmd, struct dpif_packet **packets,
2590 size_t cnt, struct pkt_metadata *md,
2591 struct netdev_flow_key *keys)
72865317 2592{
9bbf1c3d
DDP
2593 struct netdev_flow_key key;
2594 struct packet_batch batches[4];
65f13b50 2595 struct emc_cache *flow_cache = &pmd->flow_cache;
84d6d5eb 2596 size_t n_batches, i;
9bbf1c3d 2597 size_t notfound_cnt = 0;
8cbf4f47 2598
9bbf1c3d
DDP
2599 n_batches = 0;
2600 miniflow_initialize(&key.flow, key.buf);
84d6d5eb 2601 for (i = 0; i < cnt; i++) {
9bbf1c3d
DDP
2602 struct dp_netdev_flow *flow;
2603 uint32_t hash;
2604
84d6d5eb
EJ
2605 if (OVS_UNLIKELY(ofpbuf_size(&packets[i]->ofpbuf) < ETH_HEADER_LEN)) {
2606 dpif_packet_delete(packets[i]);
84d6d5eb
EJ
2607 continue;
2608 }
8cbf4f47 2609
9bbf1c3d
DDP
2610 miniflow_extract(&packets[i]->ofpbuf, md, &key.flow);
2611
2612 hash = dpif_netdev_packet_get_dp_hash(packets[i], &key.flow);
2613
2614 flow = emc_lookup(flow_cache, &key.flow, hash);
2615 if (OVS_UNLIKELY(!dp_netdev_queue_batches(packets[i], md,
2616 flow, &key.flow,
2617 batches, &n_batches,
2618 ARRAY_SIZE(batches)))) {
2619 if (i != notfound_cnt) {
2620 dpif_packet_swap(&packets[i], &packets[notfound_cnt]);
2621 }
2622
2623 keys[notfound_cnt++] = key;
2624 }
2625 }
2626
2627 for (i = 0; i < n_batches; i++) {
65f13b50 2628 packet_batch_execute(&batches[i], pmd);
84d6d5eb 2629 }
4f150744 2630
9bbf1c3d
DDP
2631 return notfound_cnt;
2632}
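/* Illustrative sketch, not part of the original file: the in-place
 * partitioning that emc_processing() performs with dpif_packet_swap(),
 * reduced to its core.  Elements matching the predicate are swapped to the
 * front; the relative order of the survivors is not preserved.  'is_miss'
 * is a hypothetical predicate. */
static size_t
example_partition_misses(struct dpif_packet **packets, size_t cnt,
                         bool (*is_miss)(const struct dpif_packet *))
{
    size_t n_miss = 0;
    size_t i;

    for (i = 0; i < cnt; i++) {
        if (is_miss(packets[i])) {
            if (i != n_miss) {
                dpif_packet_swap(&packets[i], &packets[n_miss]);
            }
            n_miss++;
        }
    }
    return n_miss;    /* Misses now occupy packets[0 .. n_miss-1]. */
}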
2633
2634static inline void
65f13b50 2635fast_path_processing(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2636 struct dpif_packet **packets, size_t cnt,
2637 struct pkt_metadata *md, struct netdev_flow_key *keys)
2638{
1a0d5831 2639#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
2640 const size_t PKT_ARRAY_SIZE = cnt;
2641#else
1a0d5831 2642 /* Sparse or MSVC doesn't like variable length array. */
9bbf1c3d
DDP
2643 enum { PKT_ARRAY_SIZE = NETDEV_MAX_RX_BATCH };
2644#endif
2645 struct packet_batch batches[PKT_ARRAY_SIZE];
2646 const struct miniflow *mfs[PKT_ARRAY_SIZE]; /* NULL at bad packets. */
2647 struct cls_rule *rules[PKT_ARRAY_SIZE];
65f13b50
AW
2648 struct dp_netdev *dp = pmd->dp;
2649 struct emc_cache *flow_cache = &pmd->flow_cache;
9bbf1c3d
DDP
2650 size_t n_batches, i;
2651 bool any_miss;
2652
2653 for (i = 0; i < cnt; i++) {
2654 mfs[i] = &keys[i].flow;
2655 }
623540e4
EJ
2656 any_miss = !classifier_lookup_miniflow_batch(&dp->cls, mfs, rules, cnt);
2657 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
2658 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
2659 struct ofpbuf actions, put_actions;
2660 struct match match;
2661
2662 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
2663 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
2664
2665 for (i = 0; i < cnt; i++) {
2666 const struct dp_netdev_flow *netdev_flow;
2667 struct ofpbuf *add_actions;
2668 int error;
2669
2670 if (OVS_LIKELY(rules[i] || !mfs[i])) {
2671 continue;
2672 }
2673
2674 /* It's possible that an earlier slow path execution installed
2675 * the rule this flow needs. In this case, it's a lot cheaper
2676 * to catch it here than execute a miss. */
2677 netdev_flow = dp_netdev_lookup_flow(dp, mfs[i]);
2678 if (netdev_flow) {
2679 rules[i] = CONST_CAST(struct cls_rule *, &netdev_flow->cr);
2680 continue;
2681 }
2682
2683 miniflow_expand(mfs[i], &match.flow);
2684
2685 ofpbuf_clear(&actions);
2686 ofpbuf_clear(&put_actions);
2687
2688 error = dp_netdev_upcall(dp, packets[i], &match.flow, &match.wc,
2689 DPIF_UC_MISS, NULL, &actions,
2690 &put_actions);
2691 if (OVS_UNLIKELY(error && error != ENOSPC)) {
2692 continue;
2693 }
2694
2695 /* We can't allow the packet batching in the next loop to execute
2696 * the actions. Otherwise, if there are any slow path actions,
2697 * we'll send the packet up twice. */
ac8c2081 2698 dp_netdev_execute_actions(pmd, &packets[i], 1, true, md,
65f13b50 2699 ofpbuf_data(&actions),
623540e4
EJ
2700 ofpbuf_size(&actions));
2701
2702 add_actions = ofpbuf_size(&put_actions)
2703 ? &put_actions
2704 : &actions;
2705
2706 ovs_mutex_lock(&dp->flow_mutex);
2707 /* XXX: There's a brief race where this flow could have already
2708 * been installed since we last did the flow lookup. This could be
2709 * solved by moving the mutex lock outside the loop, but that's an
2710 * awful long time to be locking everyone out of making flow
2711 * installs. If we move to a per-core classifier, it would be
2712 * reasonable. */
2713 if (OVS_LIKELY(error != ENOSPC)
2714 && !dp_netdev_lookup_flow(dp, mfs[i])) {
2715 dp_netdev_flow_add(dp, &match, ofpbuf_data(add_actions),
2716 ofpbuf_size(add_actions));
2717 }
2718 ovs_mutex_unlock(&dp->flow_mutex);
2719 }
2720
2721 ofpbuf_uninit(&actions);
2722 ofpbuf_uninit(&put_actions);
2723 fat_rwlock_unlock(&dp->upcall_rwlock);
ac8c2081
DDP
2724 } else if (OVS_UNLIKELY(any_miss)) {
2725 int dropped_cnt = 0;
2726
2727 for (i = 0; i < cnt; i++) {
2728 if (OVS_UNLIKELY(!rules[i] && mfs[i])) {
2729 dpif_packet_delete(packets[i]);
2730 dropped_cnt++;
2731 }
2732 }
2733
2734 dp_netdev_count_packet(dp, DP_STAT_LOST, dropped_cnt);
623540e4 2735 }
84d6d5eb
EJ
2736
2737 n_batches = 0;
8cbf4f47 2738 for (i = 0; i < cnt; i++) {
9bbf1c3d 2739 struct dpif_packet *packet = packets[i];
84d6d5eb 2740 struct dp_netdev_flow *flow;
8cbf4f47 2741
623540e4 2742 if (OVS_UNLIKELY(!rules[i] || !mfs[i])) {
84d6d5eb
EJ
2743 continue;
2744 }
2745
84d6d5eb 2746 flow = dp_netdev_flow_cast(rules[i]);
65f13b50
AW
2747 emc_insert(flow_cache, mfs[i], dpif_packet_get_dp_hash(packet),
2748 flow);
9bbf1c3d
DDP
2749 dp_netdev_queue_batches(packet, md, flow, mfs[i], batches, &n_batches,
2750 ARRAY_SIZE(batches));
8cbf4f47
DDP
2751 }
2752
84d6d5eb 2753 for (i = 0; i < n_batches; i++) {
65f13b50 2754 packet_batch_execute(&batches[i], pmd);
72865317
BP
2755 }
2756}
2757
adcf00ba 2758static void
65f13b50 2759dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
9bbf1c3d
DDP
2760 struct dpif_packet **packets, int cnt, struct pkt_metadata *md)
2761{
1a0d5831 2762#if !defined(__CHECKER__) && !defined(_WIN32)
9bbf1c3d
DDP
2763 const size_t PKT_ARRAY_SIZE = cnt;
2764#else
1a0d5831 2765 /* Sparse or MSVC doesn't like variable length array. */
9bbf1c3d
DDP
2766 enum { PKT_ARRAY_SIZE = NETDEV_MAX_RX_BATCH };
2767#endif
2768 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
2769 size_t newcnt;
2770
65f13b50 2771 newcnt = emc_processing(pmd, packets, cnt, md, keys);
9bbf1c3d 2772 if (OVS_UNLIKELY(newcnt)) {
65f13b50 2773 fast_path_processing(pmd, packets, newcnt, md, keys);
9bbf1c3d
DDP
2774 }
2775}
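/* Summary sketch, not part of the original file: the two-tier lookup
 * pipeline that dp_netdev_input() drives, in pseudocode.
 *
 *   for each packet:
 *       key  = miniflow of the packet              (emc_processing())
 *       flow = emc_lookup(per-thread flow_cache, key)
 *       if miss:
 *           flow = classifier lookup in dp->cls    (fast_path_processing())
 *           if still miss:
 *               upcall (DPIF_UC_MISS), possibly installing a new flow
 *       group the packet into a per-flow batch; execute batches
 */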
2776
9080a111 2777struct dp_netdev_execute_aux {
65f13b50 2778 struct dp_netdev_pmd_thread *pmd;
9080a111
JR
2779};
2780
6b31e073 2781static void
623540e4
EJ
2782dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
2783 void *aux)
6b31e073
RW
2784{
2785 struct dp_netdev *dp = get_dp_netdev(dpif);
623540e4 2786 dp->upcall_aux = aux;
6b31e073
RW
2787 dp->upcall_cb = cb;
2788}
2789
ac8c2081
DDP
2790static void
2791dp_netdev_drop_packets(struct dpif_packet **packets, int cnt, bool may_steal)
2792{
2793 int i;
2794
2795 if (may_steal) {
2796 for (i = 0; i < cnt; i++) {
2797 dpif_packet_delete(packets[i]);
2798 }
2799 }
2800}
2801
9080a111 2802static void
8cbf4f47 2803dp_execute_cb(void *aux_, struct dpif_packet **packets, int cnt,
572f732a 2804 struct pkt_metadata *md,
09f9da0b 2805 const struct nlattr *a, bool may_steal)
8a4e3a85 2806 OVS_NO_THREAD_SAFETY_ANALYSIS
9080a111
JR
2807{
2808 struct dp_netdev_execute_aux *aux = aux_;
623540e4 2809 uint32_t *depth = recirc_depth_get();
65f13b50
AW
2810 struct dp_netdev_pmd_thread *pmd = aux->pmd;
2811 struct dp_netdev *dp = pmd->dp;
09f9da0b 2812 int type = nl_attr_type(a);
8a4e3a85 2813 struct dp_netdev_port *p;
8cbf4f47 2814 int i;
9080a111 2815
09f9da0b
JR
2816 switch ((enum ovs_action_attr)type) {
2817 case OVS_ACTION_ATTR_OUTPUT:
623540e4 2818 p = dp_netdev_lookup_port(dp, u32_to_odp(nl_attr_get_u32(a)));
26a5075b 2819 if (OVS_LIKELY(p)) {
65f13b50 2820 netdev_send(p->netdev, pmd->core_id, packets, cnt, may_steal);
ac8c2081 2821 return;
8a4e3a85 2822 }
09f9da0b
JR
2823 break;
2824
623540e4
EJ
2825 case OVS_ACTION_ATTR_USERSPACE:
2826 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
2827 const struct nlattr *userdata;
2828 struct ofpbuf actions;
2829 struct flow flow;
4fc65926 2830
623540e4
EJ
2831 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
2832 ofpbuf_init(&actions, 0);
8cbf4f47 2833
623540e4
EJ
2834 for (i = 0; i < cnt; i++) {
2835 int error;
2836
2837 ofpbuf_clear(&actions);
2838
2839 flow_extract(&packets[i]->ofpbuf, md, &flow);
2840 error = dp_netdev_upcall(dp, packets[i], &flow, NULL,
2841 DPIF_UC_ACTION, userdata, &actions,
2842 NULL);
2843 if (!error || error == ENOSPC) {
ac8c2081
DDP
2844 dp_netdev_execute_actions(pmd, &packets[i], 1, may_steal,
2845 md, ofpbuf_data(&actions),
623540e4 2846 ofpbuf_size(&actions));
ac8c2081 2847 } else if (may_steal) {
623540e4
EJ
2848 dpif_packet_delete(packets[i]);
2849 }
db73f716 2850 }
623540e4
EJ
2851 ofpbuf_uninit(&actions);
2852 fat_rwlock_unlock(&dp->upcall_rwlock);
6b31e073 2853
ac8c2081
DDP
2854 return;
2855 }
09f9da0b 2856 break;
572f732a 2857
347bf289
AZ
2858 case OVS_ACTION_ATTR_HASH: {
2859 const struct ovs_action_hash *hash_act;
2860 uint32_t hash;
2861
2862 hash_act = nl_attr_get(a);
8cbf4f47 2863
8cbf4f47
DDP
2864 for (i = 0; i < cnt; i++) {
2865
8cbf4f47
DDP
2866 if (hash_act->hash_alg == OVS_HASH_ALG_L4) {
2867 /* Hash need not be symmetric, nor does it need to include
2868 * L2 fields. */
9bbf1c3d
DDP
2869 hash = hash_2words(dpif_packet_get_dp_hash(packets[i]),
2870 hash_act->hash_basis);
8cbf4f47
DDP
2871 } else {
2872 VLOG_WARN("Unknown hash algorithm specified "
2873 "for the hash action.");
2874 hash = 2;
2875 }
2876
347bf289
AZ
2877 if (!hash) {
2878 hash = 1; /* 0 is not valid */
2879 }
2880
8cbf4f47
DDP
2881 if (i == 0) {
2882 md->dp_hash = hash;
2883 }
9bbf1c3d 2884 dpif_packet_set_dp_hash(packets[i], hash);
347bf289 2885 }
ac8c2081 2886 return;
347bf289
AZ
2887 }
2888
adcf00ba
AZ
2889 case OVS_ACTION_ATTR_RECIRC:
2890 if (*depth < MAX_RECIRC_DEPTH) {
572f732a 2891
adcf00ba 2892 (*depth)++;
8cbf4f47
DDP
2893 for (i = 0; i < cnt; i++) {
2894 struct dpif_packet *recirc_pkt;
2895 struct pkt_metadata recirc_md = *md;
2896
2897 recirc_pkt = (may_steal) ? packets[i]
2898 : dpif_packet_clone(packets[i]);
2899
2900 recirc_md.recirc_id = nl_attr_get_u32(a);
2901
2902 /* Hash is private to each packet */
61a2647e 2903 recirc_md.dp_hash = dpif_packet_get_dp_hash(packets[i]);
8cbf4f47 2904
65f13b50 2905 dp_netdev_input(pmd, &recirc_pkt, 1,
9bbf1c3d 2906 &recirc_md);
8cbf4f47 2907 }
adcf00ba
AZ
2908 (*depth)--;
2909
ac8c2081 2910 return;
adcf00ba 2911 }
ac8c2081
DDP
2912
2913 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
572f732a 2914 break;
572f732a 2915
09f9da0b
JR
2916 case OVS_ACTION_ATTR_PUSH_VLAN:
2917 case OVS_ACTION_ATTR_POP_VLAN:
2918 case OVS_ACTION_ATTR_PUSH_MPLS:
2919 case OVS_ACTION_ATTR_POP_MPLS:
2920 case OVS_ACTION_ATTR_SET:
6d670e7f 2921 case OVS_ACTION_ATTR_SET_MASKED:
09f9da0b
JR
2922 case OVS_ACTION_ATTR_SAMPLE:
2923 case OVS_ACTION_ATTR_UNSPEC:
2924 case __OVS_ACTION_ATTR_MAX:
2925 OVS_NOT_REACHED();
da546e07 2926 }
ac8c2081
DDP
2927
2928 dp_netdev_drop_packets(packets, cnt, may_steal);
98403001
BP
2929}
2930
4edb9ae9 2931static void
65f13b50 2932dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
8cbf4f47
DDP
2933 struct dpif_packet **packets, int cnt,
2934 bool may_steal, struct pkt_metadata *md,
9080a111 2935 const struct nlattr *actions, size_t actions_len)
72865317 2936{
65f13b50 2937 struct dp_netdev_execute_aux aux = {pmd};
9080a111 2938
8cbf4f47
DDP
2939 odp_execute_actions(&aux, packets, cnt, may_steal, md, actions,
2940 actions_len, dp_execute_cb);
72865317
BP
2941}
2942
2943const struct dpif_class dpif_netdev_class = {
72865317 2944 "netdev",
2197d7ab 2945 dpif_netdev_enumerate,
0aeaabc8 2946 dpif_netdev_port_open_type,
72865317
BP
2947 dpif_netdev_open,
2948 dpif_netdev_close,
7dab847a 2949 dpif_netdev_destroy,
e4cfed38
PS
2950 dpif_netdev_run,
2951 dpif_netdev_wait,
72865317 2952 dpif_netdev_get_stats,
72865317
BP
2953 dpif_netdev_port_add,
2954 dpif_netdev_port_del,
2955 dpif_netdev_port_query_by_number,
2956 dpif_netdev_port_query_by_name,
98403001 2957 NULL, /* port_get_pid */
b0ec0f27
BP
2958 dpif_netdev_port_dump_start,
2959 dpif_netdev_port_dump_next,
2960 dpif_netdev_port_dump_done,
72865317
BP
2961 dpif_netdev_port_poll,
2962 dpif_netdev_port_poll_wait,
72865317 2963 dpif_netdev_flow_flush,
ac64794a
BP
2964 dpif_netdev_flow_dump_create,
2965 dpif_netdev_flow_dump_destroy,
2966 dpif_netdev_flow_dump_thread_create,
2967 dpif_netdev_flow_dump_thread_destroy,
704a1e09 2968 dpif_netdev_flow_dump_next,
1a0c894a 2969 dpif_netdev_operate,
6b31e073
RW
2970 NULL, /* recv_set */
2971 NULL, /* handlers_set */
f2eee189 2972 dpif_netdev_pmd_set,
5bf93d67 2973 dpif_netdev_queue_to_priority,
6b31e073
RW
2974 NULL, /* recv */
2975 NULL, /* recv_wait */
2976 NULL, /* recv_purge */
2977 dpif_netdev_register_upcall_cb,
2978 dpif_netdev_enable_upcall,
2979 dpif_netdev_disable_upcall,
72865317 2980};
614c4892 2981
74cc3969
BP
2982static void
2983dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
2984 const char *argv[], void *aux OVS_UNUSED)
2985{
59e6d833
BP
2986 struct dp_netdev_port *old_port;
2987 struct dp_netdev_port *new_port;
74cc3969 2988 struct dp_netdev *dp;
ff073a71 2989 odp_port_t port_no;
74cc3969 2990
8a4e3a85 2991 ovs_mutex_lock(&dp_netdev_mutex);
74cc3969
BP
2992 dp = shash_find_data(&dp_netdevs, argv[1]);
2993 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8a4e3a85 2994 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969
BP
2995 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
2996 return;
2997 }
8a4e3a85
BP
2998 ovs_refcount_ref(&dp->ref_cnt);
2999 ovs_mutex_unlock(&dp_netdev_mutex);
74cc3969 3000
59e6d833
BP
3001 ovs_mutex_lock(&dp->port_mutex);
3002 if (get_port_by_name(dp, argv[2], &old_port)) {
74cc3969 3003 unixctl_command_reply_error(conn, "unknown port");
8a4e3a85 3004 goto exit;
74cc3969
BP
3005 }
3006
ff073a71
BP
3007 port_no = u32_to_odp(atoi(argv[3]));
3008 if (!port_no || port_no == ODPP_NONE) {
74cc3969 3009 unixctl_command_reply_error(conn, "bad port number");
8a4e3a85 3010 goto exit;
74cc3969 3011 }
ff073a71 3012 if (dp_netdev_lookup_port(dp, port_no)) {
74cc3969 3013 unixctl_command_reply_error(conn, "port number already in use");
8a4e3a85 3014 goto exit;
74cc3969 3015 }
59e6d833
BP
3016
3017 /* Remove old port. */
3018 cmap_remove(&dp->ports, &old_port->node, hash_port_no(old_port->port_no));
3019 ovsrcu_postpone(free, old_port);
3020
3021 /* Insert new port (cmap semantics mean we cannot re-insert 'old_port'). */
3022 new_port = xmemdup(old_port, sizeof *old_port);
3023 new_port->port_no = port_no;
3024 cmap_insert(&dp->ports, &new_port->node, hash_port_no(port_no));
3025
d33ed218 3026 seq_change(dp->port_seq);
74cc3969 3027 unixctl_command_reply(conn, NULL);
8a4e3a85
BP
3028
3029exit:
59e6d833 3030 ovs_mutex_unlock(&dp->port_mutex);
8a4e3a85 3031 dp_netdev_unref(dp);
74cc3969
BP
3032}
3033
c40b890f
BP
3034static void
3035dpif_dummy_delete_port(struct unixctl_conn *conn, int argc OVS_UNUSED,
3036 const char *argv[], void *aux OVS_UNUSED)
3037{
3038 struct dp_netdev_port *port;
3039 struct dp_netdev *dp;
3040
3041 ovs_mutex_lock(&dp_netdev_mutex);
3042 dp = shash_find_data(&dp_netdevs, argv[1]);
3043 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
3044 ovs_mutex_unlock(&dp_netdev_mutex);
3045 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
3046 return;
3047 }
3048 ovs_refcount_ref(&dp->ref_cnt);
3049 ovs_mutex_unlock(&dp_netdev_mutex);
3050
3051 ovs_mutex_lock(&dp->port_mutex);
3052 if (get_port_by_name(dp, argv[2], &port)) {
3053 unixctl_command_reply_error(conn, "unknown port");
3054 } else if (port->port_no == ODPP_LOCAL) {
3055 unixctl_command_reply_error(conn, "can't delete local port");
3056 } else {
3057 do_del_port(dp, port);
3058 unixctl_command_reply(conn, NULL);
3059 }
3060 ovs_mutex_unlock(&dp->port_mutex);
3061
3062 dp_netdev_unref(dp);
3063}
3064
0cbfe35d
BP
3065static void
3066dpif_dummy_register__(const char *type)
3067{
3068 struct dpif_class *class;
3069
3070 class = xmalloc(sizeof *class);
3071 *class = dpif_netdev_class;
3072 class->type = xstrdup(type);
3073 dp_register_provider(class);
3074}
3075
614c4892 3076void
0cbfe35d 3077dpif_dummy_register(bool override)
614c4892 3078{
0cbfe35d
BP
3079 if (override) {
3080 struct sset types;
3081 const char *type;
3082
3083 sset_init(&types);
3084 dp_enumerate_types(&types);
3085 SSET_FOR_EACH (type, &types) {
3086 if (!dp_unregister_provider(type)) {
3087 dpif_dummy_register__(type);
3088 }
3089 }
3090 sset_destroy(&types);
614c4892 3091 }
0cbfe35d
BP
3092
3093 dpif_dummy_register__("dummy");
74cc3969
BP
3094
3095 unixctl_command_register("dpif-dummy/change-port-number",
74467d5c 3096 "dp port new-number",
74cc3969 3097 3, 3, dpif_dummy_change_port_number, NULL);
74467d5c 3098 unixctl_command_register("dpif-dummy/delete-port", "dp port",
c40b890f 3099 2, 2, dpif_dummy_delete_port, NULL);
614c4892 3100}
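/* Illustrative note, not part of the original file: the unixctl commands
 * registered above are meant to be driven through ovs-appctl, e.g.:
 *
 *   ovs-appctl dpif-dummy/change-port-number <dp> <port> <new-number>
 *   ovs-appctl dpif-dummy/delete-port <dp> <port>
 */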