dpif-netdev: Use hmap for ports.
lib/dpif-netdev.c
1 /*
2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <netinet/in.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/socket.h>
31 #include <sys/stat.h>
32 #include <unistd.h>
33
34 #include "bitmap.h"
35 #include "cmap.h"
36 #include "coverage.h"
37 #include "csum.h"
38 #include "dp-packet.h"
39 #include "dpif.h"
40 #include "dpif-provider.h"
41 #include "dummy.h"
42 #include "fat-rwlock.h"
43 #include "flow.h"
44 #include "hmapx.h"
45 #include "latch.h"
46 #include "netdev.h"
47 #include "netdev-dpdk.h"
48 #include "netdev-vport.h"
49 #include "netlink.h"
50 #include "odp-execute.h"
51 #include "odp-util.h"
52 #include "openvswitch/dynamic-string.h"
53 #include "openvswitch/list.h"
54 #include "openvswitch/match.h"
55 #include "openvswitch/ofp-print.h"
56 #include "openvswitch/ofpbuf.h"
57 #include "openvswitch/vlog.h"
58 #include "ovs-numa.h"
59 #include "ovs-rcu.h"
60 #include "packets.h"
61 #include "poll-loop.h"
62 #include "pvector.h"
63 #include "random.h"
64 #include "seq.h"
65 #include "shash.h"
66 #include "sset.h"
67 #include "timeval.h"
68 #include "tnl-neigh-cache.h"
69 #include "tnl-ports.h"
70 #include "unixctl.h"
71 #include "util.h"
72
73 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
74
75 #define FLOW_DUMP_MAX_BATCH 50
76 /* Use per thread recirc_depth to prevent recirculation loop. */
77 #define MAX_RECIRC_DEPTH 5
78 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
79
80 /* Configuration parameters. */
81 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
82
83 /* Protects against changes to 'dp_netdevs'. */
84 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
85
86 /* Contains all 'struct dp_netdev's. */
87 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
88 = SHASH_INITIALIZER(&dp_netdevs);
89
90 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
91
92 static struct odp_support dp_netdev_support = {
93 .max_mpls_depth = SIZE_MAX,
94 .recirc = true,
95 };
96
97 /* Stores a miniflow with inline values */
98
99 struct netdev_flow_key {
100 uint32_t hash; /* Hash function differs for different users. */
101 uint32_t len; /* Length of the following miniflow (incl. map). */
102 struct miniflow mf;
103 uint64_t buf[FLOW_MAX_PACKET_U64S];
104 };
105
106 /* Exact match cache for frequently used flows
107 *
108 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
109 * search its entries for a miniflow that matches exactly the miniflow of the
110 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
111 *
112 * A cache entry holds a reference to its 'dp_netdev_flow'.
113 *
114 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
115 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
116 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
117 * value is the index of a cache entry where the miniflow could be.
118 *
119 *
120 * Thread-safety
121 * =============
122 *
123 * Each pmd_thread has its own private exact match cache.
124 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
125 */
126
127 #define EM_FLOW_HASH_SHIFT 13
128 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
129 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
130 #define EM_FLOW_HASH_SEGS 2
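/* Illustrative sketch (not part of the original file): how the 32-bit packet
 * hash described above is split into EM_FLOW_HASH_SEGS candidate indices.
 * The helper name emc_example_positions() is hypothetical. */
#if 0   /* Example only, kept out of the build. */
static inline void
emc_example_positions(uint32_t hash, uint32_t pos[EM_FLOW_HASH_SEGS])
{
    /* With EM_FLOW_HASH_SHIFT == 13 and EM_FLOW_HASH_SEGS == 2, the low
     * 13 bits select the first candidate entry, the next 13 bits select
     * the second, and the top 6 bits are thrown away. */
    for (int i = 0; i < EM_FLOW_HASH_SEGS; i++) {
        pos[i] = hash & EM_FLOW_HASH_MASK;
        hash >>= EM_FLOW_HASH_SHIFT;
    }
}
#endif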
131
132 struct emc_entry {
133 struct dp_netdev_flow *flow;
134 struct netdev_flow_key key; /* key.hash used for emc hash value. */
135 };
136
137 struct emc_cache {
138 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
139 int sweep_idx; /* For emc_cache_slow_sweep(). */
140 };
141
142 /* Iterate through every exact match cache entry that might contain a
143 * miniflow with hash 'HASH'. */
144 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
145 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
146 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
147 i__ < EM_FLOW_HASH_SEGS; \
148 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
149 \f
150 /* Simple non-wildcarding single-priority classifier. */
151
152 struct dpcls {
153 struct cmap subtables_map;
154 struct pvector subtables;
155 };
156
157 /* A rule to be inserted to the classifier. */
158 struct dpcls_rule {
159 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
160 struct netdev_flow_key *mask; /* Subtable's mask. */
161 struct netdev_flow_key flow; /* Matching key. */
162 /* 'flow' must be the last field, additional space is allocated here. */
163 };
164
165 static void dpcls_init(struct dpcls *);
166 static void dpcls_destroy(struct dpcls *);
167 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
168 const struct netdev_flow_key *mask);
169 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
170 static bool dpcls_lookup(const struct dpcls *cls,
171 const struct netdev_flow_key keys[],
172 struct dpcls_rule **rules, size_t cnt);
173 \f
174 /* Datapath based on the network device interface from netdev.h.
175 *
176 *
177 * Thread-safety
178 * =============
179 *
180 * Some members, marked 'const', are immutable. Accessing other members
181 * requires synchronization, as noted in more detail below.
182 *
183 * Acquisition order is, from outermost to innermost:
184 *
185 * dp_netdev_mutex (global)
186 * port_mutex
187 * non_pmd_mutex
188 */
189 struct dp_netdev {
190 const struct dpif_class *const class;
191 const char *const name;
192 struct dpif *dpif;
193 struct ovs_refcount ref_cnt;
194 atomic_flag destroyed;
195
196 /* Ports.
197 *
198 * Any lookup into 'ports' or any access to the dp_netdev_ports found
199 * through 'ports' requires taking 'port_mutex'. */
200 struct ovs_mutex port_mutex;
201 struct hmap ports;
202 struct seq *port_seq; /* Incremented whenever a port changes. */
203
204 /* Protects access to ofproto-dpif-upcall interface during revalidator
205 * thread synchronization. */
206 struct fat_rwlock upcall_rwlock;
207 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
208 void *upcall_aux;
209
210 /* Callback function for notifying the purging of dp flows (during
211 * pmd thread reset or deletion). */
212 dp_purge_callback *dp_purge_cb;
213 void *dp_purge_aux;
214
215 /* Stores all 'struct dp_netdev_pmd_thread's. */
216 struct cmap poll_threads;
217
218 /* Protects the access of the 'struct dp_netdev_pmd_thread'
219 * instance for non-pmd thread. */
220 struct ovs_mutex non_pmd_mutex;
221
222 /* Each pmd thread will store its pointer to
223 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
224 ovsthread_key_t per_pmd_key;
225
226 /* CPU mask for pinning pmd threads. */
227 char *pmd_cmask;
228 uint64_t last_tnl_conf_seq;
229 };
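/* Illustrative sketch (not part of the original file): taking locks in the
 * acquisition order documented above, i.e. the global 'dp_netdev_mutex'
 * before a datapath's 'port_mutex'.  The function is hypothetical. */
#if 0   /* Example only, kept out of the build. */
static void
example_walk_all_ports(void)
{
    struct shash_node *node;

    ovs_mutex_lock(&dp_netdev_mutex);       /* Outermost lock first. */
    SHASH_FOR_EACH (node, &dp_netdevs) {
        struct dp_netdev *dp = node->data;

        ovs_mutex_lock(&dp->port_mutex);    /* Then the per-datapath lock. */
        /* ... inspect 'dp->ports' here ... */
        ovs_mutex_unlock(&dp->port_mutex);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);
}
#endif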
230
231 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
232 odp_port_t)
233 OVS_REQUIRES(dp->port_mutex);
234
235 enum dp_stat_type {
236 DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
237 DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
238 DP_STAT_MISS, /* Packets that did not match. */
239 DP_STAT_LOST, /* Packets not passed up to the client. */
240 DP_N_STATS
241 };
242
243 enum pmd_cycles_counter_type {
244 PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */
245 PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */
246 PMD_N_CYCLES
247 };
248
249 /* A port in a netdev-based datapath. */
250 struct dp_netdev_port {
251 odp_port_t port_no;
252 struct netdev *netdev;
253 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
254 struct netdev_saved_flags *sf;
255 unsigned n_rxq; /* Number of elements in 'rxq' */
256 struct netdev_rxq **rxq;
257 char *type; /* Port type as requested by user. */
258 int latest_requested_n_rxq; /* Latest number of rx queues requested
259 from the netdev. */
260 };
261
262 /* Contained by struct dp_netdev_flow's 'stats' member. */
263 struct dp_netdev_flow_stats {
264 atomic_llong used; /* Last used time, in monotonic msecs. */
265 atomic_ullong packet_count; /* Number of packets matched. */
266 atomic_ullong byte_count; /* Number of bytes matched. */
267 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
268 };
269
270 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
271 *
272 *
273 * Thread-safety
274 * =============
275 *
276 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
277 * its pmd thread's classifier. The text below calls this classifier 'cls'.
278 *
279 * Motivation
280 * ----------
281 *
282 * The thread safety rules described here for "struct dp_netdev_flow" are
283 * motivated by two goals:
284 *
285 * - Prevent threads that read members of "struct dp_netdev_flow" from
286 * reading bad data due to changes by some thread concurrently modifying
287 * those members.
288 *
289 * - Prevent two threads making changes to members of a given "struct
290 * dp_netdev_flow" from interfering with each other.
291 *
292 *
293 * Rules
294 * -----
295 *
296 * A flow 'flow' may be accessed without a risk of being freed during an RCU
297 * grace period. Code that needs to hold onto a flow for a while
298 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
299 *
300 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
301 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
302 * from modification.
303 *
304 * Some members, marked 'const', are immutable. Accessing other members
305 * requires synchronization, as noted in more detail below.
306 */
307 struct dp_netdev_flow {
308 const struct flow flow; /* Unmasked flow that created this entry. */
309 /* Hash table index by unmasked flow. */
310 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
311 /* 'flow_table'. */
312 const ovs_u128 ufid; /* Unique flow identifier. */
313 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
314 /* flow. */
315
316 /* Number of references.
317 * The classifier owns one reference.
318 * Any thread trying to keep a rule from being freed should hold its own
319 * reference. */
320 struct ovs_refcount ref_cnt;
321
322 bool dead;
323
324 /* Statistics. */
325 struct dp_netdev_flow_stats stats;
326
327 /* Actions. */
328 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
329
330 /* While processing a group of input packets, the datapath uses the next
331 * member to store a pointer to the output batch for the flow. It is
332 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
333 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
334 struct packet_batch_per_flow *batch;
335
336 /* Packet classification. */
337 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
338 /* 'cr' must be the last member. */
339 };
340
341 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
342 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
343 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
344 struct flow *);
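/* Illustrative sketch (not part of the original file): holding on to a flow
 * beyond the current RCU grace period by taking an explicit reference, as
 * described in the "Rules" section above.  The function is hypothetical. */
#if 0   /* Example only, kept out of the build. */
static void
example_keep_flow(struct dp_netdev_flow *flow)
{
    if (dp_netdev_flow_ref(flow)) {
        /* ... 'flow' cannot be freed until the reference is dropped,
         * although it may still be removed from the classifier and its
         * 'dead' flag may be set ... */
        dp_netdev_flow_unref(flow);
    }
}
#endif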
345
346 /* A set of datapath actions within a "struct dp_netdev_flow".
347 *
348 *
349 * Thread-safety
350 * =============
351 *
352 * A struct dp_netdev_actions 'actions' is protected with RCU. */
353 struct dp_netdev_actions {
354 /* These members are immutable: they do not change during the struct's
355 * lifetime. */
356 unsigned int size; /* Size of 'actions', in bytes. */
357 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
358 };
359
360 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
361 size_t);
362 struct dp_netdev_actions *dp_netdev_flow_get_actions(
363 const struct dp_netdev_flow *);
364 static void dp_netdev_actions_free(struct dp_netdev_actions *);
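/* Illustrative sketch (not part of the original file): reading a flow's
 * RCU-protected actions.  Per the note above, the returned pointer can be
 * used until this thread quiesces, since RCU defers freeing of replaced
 * action sets.  The function name is hypothetical. */
#if 0   /* Example only, kept out of the build. */
static unsigned int
example_actions_size(const struct dp_netdev_flow *flow)
{
    const struct dp_netdev_actions *actions
        = dp_netdev_flow_get_actions(flow);

    return actions->size;
}
#endif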
365
366 /* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
367 struct dp_netdev_pmd_stats {
368 /* Indexed by DP_STAT_*. */
369 atomic_ullong n[DP_N_STATS];
370 };
371
372 /* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
373 struct dp_netdev_pmd_cycles {
374 /* Indexed by PMD_CYCLES_*. */
375 atomic_ullong n[PMD_N_CYCLES];
376 };
377
378 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
379 struct rxq_poll {
380 struct dp_netdev_port *port;
381 struct netdev_rxq *rx;
382 struct ovs_list node;
383 };
384
385 /* Contained by struct dp_netdev_pmd_thread's 'port_cache' or 'tx_ports'. */
386 struct tx_port {
387 odp_port_t port_no;
388 struct netdev *netdev;
389 struct hmap_node node;
390 };
391
392 /* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
393 * the performance overhead of interrupt processing. Therefore netdev
394 * cannot implement rx-wait for these devices. dpif-netdev needs to poll
395 * these devices to check their receive buffers. A pmd thread polls the
396 * devices assigned to it.
397 *
398 * DPDK uses PMDs for accessing NICs.
399 *
400 * Note: the instance with cpu core id NON_PMD_CORE_ID is reserved for
401 * I/O of all non-pmd threads. No actual thread is created for that
402 * instance.
403 *
404 * Each struct has its own flow table and classifier. Packets received
405 * from managed ports are looked up in the corresponding pmd thread's
406 * flow table, and are executed with the found actions.
407 * */
408 struct dp_netdev_pmd_thread {
409 struct dp_netdev *dp;
410 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
411 struct cmap_node node; /* In 'dp->poll_threads'. */
412
413 pthread_cond_t cond; /* For synchronizing pmd thread reload. */
414 struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
415
416 /* Per thread exact-match cache. Note, the instance for cpu core
417 * NON_PMD_CORE_ID can be accessed by multiple threads and thus
418 * needs to be protected by 'non_pmd_mutex'. Every other instance
419 * will only be accessed by its own pmd thread. */
420 struct emc_cache flow_cache;
421
422 /* Classifier and Flow-Table.
423 *
424 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
425 * changes to 'cls' must be made while still holding the 'flow_mutex'.
426 */
427 struct ovs_mutex flow_mutex;
428 struct dpcls cls;
429 struct cmap flow_table OVS_GUARDED; /* Flow table. */
430
431 /* Statistics. */
432 struct dp_netdev_pmd_stats stats;
433
434 /* Cycles counters */
435 struct dp_netdev_pmd_cycles cycles;
436
437 /* Used to count cycles. See 'cycles_counter_end()'. */
438 unsigned long long last_cycles;
439
440 struct latch exit_latch; /* For terminating the pmd thread. */
441 atomic_uint change_seq; /* For reloading pmd ports. */
442 pthread_t thread;
443 unsigned core_id; /* CPU core id of this pmd thread. */
444 int numa_id; /* numa node id of this pmd thread. */
445 atomic_int tx_qid; /* Queue id used by this pmd thread to
446 * send packets on all netdevs */
447
448 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
449 /* List of rx queues to poll. */
450 struct ovs_list poll_list OVS_GUARDED;
451 /* Number of elements in 'poll_list' */
452 int poll_cnt;
453 /* Map of 'tx_port's used for transmission. Written by the main thread,
454 * read by the pmd thread. */
455 struct hmap tx_ports OVS_GUARDED;
456
457 /* Map of 'tx_port' used in the fast path. This is a thread-local copy of
458 * 'tx_ports'. The instance for cpu core NON_PMD_CORE_ID can be accessed
459 * by multiple threads and thus needs to be protected by 'non_pmd_mutex'.
460 * Every other instance will only be accessed by its own pmd thread. */
461 struct hmap port_cache;
462
463 /* Only a pmd thread can write on its own 'cycles' and 'stats'.
464 * The main thread keeps 'stats_zero' and 'cycles_zero' as base
465 * values and subtracts them from 'stats' and 'cycles' before
466 * reporting to the user */
467 unsigned long long stats_zero[DP_N_STATS];
468 uint64_t cycles_zero[PMD_N_CYCLES];
469 };
470
471 #define PMD_INITIAL_SEQ 1
472
473 /* Interface to netdev-based datapath. */
474 struct dpif_netdev {
475 struct dpif dpif;
476 struct dp_netdev *dp;
477 uint64_t last_port_seq;
478 };
479
480 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
481 struct dp_netdev_port **portp)
482 OVS_REQUIRES(dp->port_mutex);
483 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
484 struct dp_netdev_port **portp)
485 OVS_REQUIRES(dp->port_mutex);
486 static void dp_netdev_free(struct dp_netdev *)
487 OVS_REQUIRES(dp_netdev_mutex);
488 static int do_add_port(struct dp_netdev *dp, const char *devname,
489 const char *type, odp_port_t port_no)
490 OVS_REQUIRES(dp->port_mutex);
491 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
492 OVS_REQUIRES(dp->port_mutex);
493 static int dpif_netdev_open(const struct dpif_class *, const char *name,
494 bool create, struct dpif **);
495 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
496 struct dp_packet_batch *,
497 bool may_steal,
498 const struct nlattr *actions,
499 size_t actions_len);
500 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
501 struct dp_packet_batch *, odp_port_t port_no);
502 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
503 struct dp_packet_batch *);
504
505 static void dp_netdev_disable_upcall(struct dp_netdev *);
506 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
507 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
508 struct dp_netdev *dp, unsigned core_id,
509 int numa_id);
510 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
511 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
512 OVS_REQUIRES(dp->port_mutex);
513
514 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
515 unsigned core_id);
516 static struct dp_netdev_pmd_thread *
517 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
518 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
519 static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
520 static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
521 OVS_REQUIRES(dp->port_mutex);
522 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
523 static void dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
524 struct dp_netdev_port *port);
525 static void dp_netdev_add_port_to_pmds(struct dp_netdev *dp,
526 struct dp_netdev_port *port);
527 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
528 struct dp_netdev_port *port);
529 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
530 struct dp_netdev_port *port,
531 struct netdev_rxq *rx);
532 static struct dp_netdev_pmd_thread *
533 dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id);
534 static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
535 OVS_REQUIRES(dp->port_mutex);
536 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
537 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
538 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
539 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
540 OVS_REQUIRES(pmd->port_mutex);
541
542 static inline bool emc_entry_alive(struct emc_entry *ce);
543 static void emc_clear_entry(struct emc_entry *ce);
544
545 static void
546 emc_cache_init(struct emc_cache *flow_cache)
547 {
548 int i;
549
550 flow_cache->sweep_idx = 0;
551 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
552 flow_cache->entries[i].flow = NULL;
553 flow_cache->entries[i].key.hash = 0;
554 flow_cache->entries[i].key.len = sizeof(struct miniflow);
555 flowmap_init(&flow_cache->entries[i].key.mf.map);
556 }
557 }
558
559 static void
560 emc_cache_uninit(struct emc_cache *flow_cache)
561 {
562 int i;
563
564 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
565 emc_clear_entry(&flow_cache->entries[i]);
566 }
567 }
568
569 /* Check and clear dead flow references slowly (one entry at each
570 * invocation). */
571 static void
572 emc_cache_slow_sweep(struct emc_cache *flow_cache)
573 {
574 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
575
576 if (!emc_entry_alive(entry)) {
577 emc_clear_entry(entry);
578 }
579 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
580 }
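/* Illustrative sketch (not part of the original file): emc_cache_slow_sweep()
 * clears at most one dead entry per call, so a caller would invoke it
 * periodically, e.g. every so many iterations of a pmd's main loop.  The
 * interval, the counter handling and the function name are hypothetical. */
#if 0   /* Example only, kept out of the build. */
#define EXAMPLE_SWEEP_INTERVAL 1024     /* Hypothetical cadence. */

static void
example_pmd_idle_work(struct dp_netdev_pmd_thread *pmd, int *loop_count)
{
    if (++*loop_count >= EXAMPLE_SWEEP_INTERVAL) {
        *loop_count = 0;
        emc_cache_slow_sweep(&pmd->flow_cache);
    }
}
#endif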
581
582 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
583 bool
584 dpif_is_netdev(const struct dpif *dpif)
585 {
586 return dpif->dpif_class->open == dpif_netdev_open;
587 }
588
589 static struct dpif_netdev *
590 dpif_netdev_cast(const struct dpif *dpif)
591 {
592 ovs_assert(dpif_is_netdev(dpif));
593 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
594 }
595
596 static struct dp_netdev *
597 get_dp_netdev(const struct dpif *dpif)
598 {
599 return dpif_netdev_cast(dpif)->dp;
600 }
601 \f
602 enum pmd_info_type {
603 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
604 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
605 PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */
606 };
607
608 static void
609 pmd_info_show_stats(struct ds *reply,
610 struct dp_netdev_pmd_thread *pmd,
611 unsigned long long stats[DP_N_STATS],
612 uint64_t cycles[PMD_N_CYCLES])
613 {
614 unsigned long long total_packets = 0;
615 uint64_t total_cycles = 0;
616 int i;
617
618 /* These loops subtract the reference values ('*_zero') from the counters.
619 * Since loads and stores are relaxed, it might be possible for a '*_zero'
620 * value to be more recent than the current value we're reading from the
621 * counter. This is not a big problem, since these numbers are not
622 * supposed to be too accurate, but we should at least make sure that
623 * the result is not negative. */
624 for (i = 0; i < DP_N_STATS; i++) {
625 if (stats[i] > pmd->stats_zero[i]) {
626 stats[i] -= pmd->stats_zero[i];
627 } else {
628 stats[i] = 0;
629 }
630
631 if (i != DP_STAT_LOST) {
632 /* Lost packets are already included in DP_STAT_MISS */
633 total_packets += stats[i];
634 }
635 }
636
637 for (i = 0; i < PMD_N_CYCLES; i++) {
638 if (cycles[i] > pmd->cycles_zero[i]) {
639 cycles[i] -= pmd->cycles_zero[i];
640 } else {
641 cycles[i] = 0;
642 }
643
644 total_cycles += cycles[i];
645 }
646
647 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
648 ? "main thread" : "pmd thread");
649
650 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
651 ds_put_format(reply, " numa_id %d", pmd->numa_id);
652 }
653 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
654 ds_put_format(reply, " core_id %u", pmd->core_id);
655 }
656 ds_put_cstr(reply, ":\n");
657
658 ds_put_format(reply,
659 "\temc hits:%llu\n\tmegaflow hits:%llu\n"
660 "\tmiss:%llu\n\tlost:%llu\n",
661 stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
662 stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
663
664 if (total_cycles == 0) {
665 return;
666 }
667
668 ds_put_format(reply,
669 "\tpolling cycles:%"PRIu64" (%.02f%%)\n"
670 "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
671 cycles[PMD_CYCLES_POLLING],
672 cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100,
673 cycles[PMD_CYCLES_PROCESSING],
674 cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
675
676 if (total_packets == 0) {
677 return;
678 }
679
680 ds_put_format(reply,
681 "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
682 total_cycles / (double)total_packets,
683 total_cycles, total_packets);
684
685 ds_put_format(reply,
686 "\tavg processing cycles per packet: "
687 "%.02f (%"PRIu64"/%llu)\n",
688 cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
689 cycles[PMD_CYCLES_PROCESSING], total_packets);
690 }
691
692 static void
693 pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
694 struct dp_netdev_pmd_thread *pmd,
695 unsigned long long stats[DP_N_STATS],
696 uint64_t cycles[PMD_N_CYCLES])
697 {
698 int i;
699
700 /* We cannot write 'stats' and 'cycles' (because they're written by other
701 * threads) and we shouldn't change 'stats' (because they're used to count
702 * datapath stats, which must not be cleared here). Instead, we save the
703 * current values and subtract them from the values to be displayed in the
704 * future */
705 for (i = 0; i < DP_N_STATS; i++) {
706 pmd->stats_zero[i] = stats[i];
707 }
708 for (i = 0; i < PMD_N_CYCLES; i++) {
709 pmd->cycles_zero[i] = cycles[i];
710 }
711 }
712
713 static void
714 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
715 {
716 if (pmd->core_id != NON_PMD_CORE_ID) {
717 struct rxq_poll *poll;
718 const char *prev_name = NULL;
719
720 ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n",
721 pmd->numa_id, pmd->core_id);
722
723 ovs_mutex_lock(&pmd->port_mutex);
724 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
725 const char *name = netdev_get_name(poll->port->netdev);
726
727 if (!prev_name || strcmp(name, prev_name)) {
728 if (prev_name) {
729 ds_put_cstr(reply, "\n");
730 }
731 ds_put_format(reply, "\tport: %s\tqueue-id:",
732 netdev_get_name(poll->port->netdev));
733 }
734 ds_put_format(reply, " %d", netdev_rxq_get_queue_id(poll->rx));
735 prev_name = name;
736 }
737 ovs_mutex_unlock(&pmd->port_mutex);
738 ds_put_cstr(reply, "\n");
739 }
740 }
741
742 static void
743 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
744 void *aux)
745 {
746 struct ds reply = DS_EMPTY_INITIALIZER;
747 struct dp_netdev_pmd_thread *pmd;
748 struct dp_netdev *dp = NULL;
749 enum pmd_info_type type = *(enum pmd_info_type *) aux;
750
751 ovs_mutex_lock(&dp_netdev_mutex);
752
753 if (argc == 2) {
754 dp = shash_find_data(&dp_netdevs, argv[1]);
755 } else if (shash_count(&dp_netdevs) == 1) {
756 /* There's only one datapath */
757 dp = shash_first(&dp_netdevs)->data;
758 }
759
760 if (!dp) {
761 ovs_mutex_unlock(&dp_netdev_mutex);
762 unixctl_command_reply_error(conn,
763 "please specify an existing datapath");
764 return;
765 }
766
767 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
768 if (type == PMD_INFO_SHOW_RXQ) {
769 pmd_info_show_rxq(&reply, pmd);
770 } else {
771 unsigned long long stats[DP_N_STATS];
772 uint64_t cycles[PMD_N_CYCLES];
773 int i;
774
775 /* Read current stats and cycle counters */
776 for (i = 0; i < ARRAY_SIZE(stats); i++) {
777 atomic_read_relaxed(&pmd->stats.n[i], &stats[i]);
778 }
779 for (i = 0; i < ARRAY_SIZE(cycles); i++) {
780 atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]);
781 }
782
783 if (type == PMD_INFO_CLEAR_STATS) {
784 pmd_info_clear_stats(&reply, pmd, stats, cycles);
785 } else if (type == PMD_INFO_SHOW_STATS) {
786 pmd_info_show_stats(&reply, pmd, stats, cycles);
787 }
788 }
789 }
790
791 ovs_mutex_unlock(&dp_netdev_mutex);
792
793 unixctl_command_reply(conn, ds_cstr(&reply));
794 ds_destroy(&reply);
795 }
796 \f
797 static int
798 dpif_netdev_init(void)
799 {
800 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
801 clear_aux = PMD_INFO_CLEAR_STATS,
802 poll_aux = PMD_INFO_SHOW_RXQ;
803
804 unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
805 0, 1, dpif_netdev_pmd_info,
806 (void *)&show_aux);
807 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
808 0, 1, dpif_netdev_pmd_info,
809 (void *)&clear_aux);
810 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
811 0, 1, dpif_netdev_pmd_info,
812 (void *)&poll_aux);
813 return 0;
814 }
815
816 static int
817 dpif_netdev_enumerate(struct sset *all_dps,
818 const struct dpif_class *dpif_class)
819 {
820 struct shash_node *node;
821
822 ovs_mutex_lock(&dp_netdev_mutex);
823 SHASH_FOR_EACH(node, &dp_netdevs) {
824 struct dp_netdev *dp = node->data;
825 if (dpif_class != dp->class) {
826 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
827 * If the class doesn't match, skip this dpif. */
828 continue;
829 }
830 sset_add(all_dps, node->name);
831 }
832 ovs_mutex_unlock(&dp_netdev_mutex);
833
834 return 0;
835 }
836
837 static bool
838 dpif_netdev_class_is_dummy(const struct dpif_class *class)
839 {
840 return class != &dpif_netdev_class;
841 }
842
843 static const char *
844 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
845 {
846 return strcmp(type, "internal") ? type
847 : dpif_netdev_class_is_dummy(class) ? "dummy"
848 : "tap";
849 }
850
851 static struct dpif *
852 create_dpif_netdev(struct dp_netdev *dp)
853 {
854 uint16_t netflow_id = hash_string(dp->name, 0);
855 struct dpif_netdev *dpif;
856
857 ovs_refcount_ref(&dp->ref_cnt);
858
859 dpif = xmalloc(sizeof *dpif);
860 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
861 dpif->dp = dp;
862 dpif->last_port_seq = seq_read(dp->port_seq);
863
864 return &dpif->dpif;
865 }
866
867 /* Choose an unused, non-zero port number and return it on success.
868 * Return ODPP_NONE on failure. */
869 static odp_port_t
870 choose_port(struct dp_netdev *dp, const char *name)
871 OVS_REQUIRES(dp->port_mutex)
872 {
873 uint32_t port_no;
874
875 if (dp->class != &dpif_netdev_class) {
876 const char *p;
877 int start_no = 0;
878
879 /* If the port name begins with "br", start the number search at
880 * 100 to make writing tests easier. */
881 if (!strncmp(name, "br", 2)) {
882 start_no = 100;
883 }
884
885 /* If the port name contains a number, try to assign that port number.
886 * This can make writing unit tests easier because port numbers are
887 * predictable. */
888 for (p = name; *p != '\0'; p++) {
889 if (isdigit((unsigned char) *p)) {
890 port_no = start_no + strtol(p, NULL, 10);
891 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
892 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
893 return u32_to_odp(port_no);
894 }
895 break;
896 }
897 }
898 }
899
900 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
901 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
902 return u32_to_odp(port_no);
903 }
904 }
905
906 return ODPP_NONE;
907 }
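/* Worked examples for choose_port() above (illustrative, not part of the
 * original file), assuming the numbers involved are still free:
 *
 *   - "eth2"  -> the embedded digit is used directly, giving port 2.
 *   - "br0"   -> names starting with "br" search from 100, giving port 100.
 *   - "vport" -> no digit, so the lowest free number starting from 1.
 *
 * For the regular "netdev" class the name-based heuristic is skipped
 * entirely and the lowest free port number is always chosen. */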
908
909 static int
910 create_dp_netdev(const char *name, const struct dpif_class *class,
911 struct dp_netdev **dpp)
912 OVS_REQUIRES(dp_netdev_mutex)
913 {
914 struct dp_netdev *dp;
915 int error;
916
917 dp = xzalloc(sizeof *dp);
918 shash_add(&dp_netdevs, name, dp);
919
920 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
921 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
922 ovs_refcount_init(&dp->ref_cnt);
923 atomic_flag_clear(&dp->destroyed);
924
925 ovs_mutex_init(&dp->port_mutex);
926 hmap_init(&dp->ports);
927 dp->port_seq = seq_create();
928 fat_rwlock_init(&dp->upcall_rwlock);
929
930 /* Disable upcalls by default. */
931 dp_netdev_disable_upcall(dp);
932 dp->upcall_aux = NULL;
933 dp->upcall_cb = NULL;
934
935 cmap_init(&dp->poll_threads);
936 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
937 ovsthread_key_create(&dp->per_pmd_key, NULL);
938
939 ovs_mutex_lock(&dp->port_mutex);
940 dp_netdev_set_nonpmd(dp);
941
942 error = do_add_port(dp, name, "internal", ODPP_LOCAL);
943 ovs_mutex_unlock(&dp->port_mutex);
944 if (error) {
945 dp_netdev_free(dp);
946 return error;
947 }
948
949 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
950 *dpp = dp;
951 return 0;
952 }
953
954 static int
955 dpif_netdev_open(const struct dpif_class *class, const char *name,
956 bool create, struct dpif **dpifp)
957 {
958 struct dp_netdev *dp;
959 int error;
960
961 ovs_mutex_lock(&dp_netdev_mutex);
962 dp = shash_find_data(&dp_netdevs, name);
963 if (!dp) {
964 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
965 } else {
966 error = (dp->class != class ? EINVAL
967 : create ? EEXIST
968 : 0);
969 }
970 if (!error) {
971 *dpifp = create_dpif_netdev(dp);
972 dp->dpif = *dpifp;
973 }
974 ovs_mutex_unlock(&dp_netdev_mutex);
975
976 return error;
977 }
978
979 static void
980 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
981 OVS_NO_THREAD_SAFETY_ANALYSIS
982 {
983 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
984 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
985
986 /* Before freeing a lock we should release it */
987 fat_rwlock_unlock(&dp->upcall_rwlock);
988 fat_rwlock_destroy(&dp->upcall_rwlock);
989 }
990
991 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
992 * through the 'dp_netdevs' shash while freeing 'dp'. */
993 static void
994 dp_netdev_free(struct dp_netdev *dp)
995 OVS_REQUIRES(dp_netdev_mutex)
996 {
997 struct dp_netdev_port *port, *next;
998
999 shash_find_and_delete(&dp_netdevs, dp->name);
1000
1001 dp_netdev_destroy_all_pmds(dp);
1002 ovs_mutex_destroy(&dp->non_pmd_mutex);
1003 ovsthread_key_delete(dp->per_pmd_key);
1004
1005 ovs_mutex_lock(&dp->port_mutex);
1006 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1007 do_del_port(dp, port);
1008 }
1009 ovs_mutex_unlock(&dp->port_mutex);
1010 cmap_destroy(&dp->poll_threads);
1011
1012 seq_destroy(dp->port_seq);
1013 hmap_destroy(&dp->ports);
1014 ovs_mutex_destroy(&dp->port_mutex);
1015
1016 /* Upcalls must be disabled at this point */
1017 dp_netdev_destroy_upcall_lock(dp);
1018
1019 free(dp->pmd_cmask);
1020 free(CONST_CAST(char *, dp->name));
1021 free(dp);
1022 }
1023
1024 static void
1025 dp_netdev_unref(struct dp_netdev *dp)
1026 {
1027 if (dp) {
1028 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1029 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1030 ovs_mutex_lock(&dp_netdev_mutex);
1031 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1032 dp_netdev_free(dp);
1033 }
1034 ovs_mutex_unlock(&dp_netdev_mutex);
1035 }
1036 }
1037
1038 static void
1039 dpif_netdev_close(struct dpif *dpif)
1040 {
1041 struct dp_netdev *dp = get_dp_netdev(dpif);
1042
1043 dp_netdev_unref(dp);
1044 free(dpif);
1045 }
1046
1047 static int
1048 dpif_netdev_destroy(struct dpif *dpif)
1049 {
1050 struct dp_netdev *dp = get_dp_netdev(dpif);
1051
1052 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1053 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1054 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1055 OVS_NOT_REACHED();
1056 }
1057 }
1058
1059 return 0;
1060 }
1061
1062 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1063 * load/store semantics. While the increment is not atomic, the load and
1064 * store operations are, making it impossible to read inconsistent values.
1065 *
1066 * This is used to update thread local stats counters. */
1067 static void
1068 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1069 {
1070 unsigned long long tmp;
1071
1072 atomic_read_relaxed(var, &tmp);
1073 tmp += n;
1074 atomic_store_relaxed(var, tmp);
1075 }
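/* Illustrative sketch (not part of the original file): using the helper above
 * to bump one of a pmd thread's own statistics counters.  Only the owning
 * thread writes these counters, so the non-atomic read-modify-write is safe;
 * readers use atomic_read_relaxed() and tolerate slightly stale values.
 * The function name is hypothetical. */
#if 0   /* Example only, kept out of the build. */
static void
example_count_masked_hits(struct dp_netdev_pmd_thread *pmd,
                          unsigned long long cnt)
{
    non_atomic_ullong_add(&pmd->stats.n[DP_STAT_MASKED_HIT], cnt);
}
#endif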
1076
1077 static int
1078 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1079 {
1080 struct dp_netdev *dp = get_dp_netdev(dpif);
1081 struct dp_netdev_pmd_thread *pmd;
1082
1083 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1084 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1085 unsigned long long n;
1086 stats->n_flows += cmap_count(&pmd->flow_table);
1087
1088 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1089 stats->n_hit += n;
1090 atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
1091 stats->n_hit += n;
1092 atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1093 stats->n_missed += n;
1094 atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1095 stats->n_lost += n;
1096 }
1097 stats->n_masks = UINT32_MAX;
1098 stats->n_mask_hit = UINT64_MAX;
1099
1100 return 0;
1101 }
1102
1103 static void
1104 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1105 {
1106 int old_seq;
1107
1108 if (pmd->core_id == NON_PMD_CORE_ID) {
1109 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1110 ovs_mutex_lock(&pmd->port_mutex);
1111 pmd_load_cached_ports(pmd);
1112 ovs_mutex_unlock(&pmd->port_mutex);
1113 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1114 return;
1115 }
1116
1117 ovs_mutex_lock(&pmd->cond_mutex);
1118 atomic_add_relaxed(&pmd->change_seq, 1, &old_seq);
1119 ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
1120 ovs_mutex_unlock(&pmd->cond_mutex);
1121 }
1122
1123 static uint32_t
1124 hash_port_no(odp_port_t port_no)
1125 {
1126 return hash_int(odp_to_u32(port_no), 0);
1127 }
1128
1129 static int
1130 port_create(const char *devname, const char *open_type, const char *type,
1131 odp_port_t port_no, struct dp_netdev_port **portp)
1132 {
1133 struct netdev_saved_flags *sf;
1134 struct dp_netdev_port *port;
1135 enum netdev_flags flags;
1136 struct netdev *netdev;
1137 int n_open_rxqs = 0;
1138 int i, error;
1139
1140 *portp = NULL;
1141
1142 /* Open and validate network device. */
1143 error = netdev_open(devname, open_type, &netdev);
1144 if (error) {
1145 return error;
1146 }
1147 /* XXX reject non-Ethernet devices */
1148
1149 netdev_get_flags(netdev, &flags);
1150 if (flags & NETDEV_LOOPBACK) {
1151 VLOG_ERR("%s: cannot add a loopback device", devname);
1152 error = EINVAL;
1153 goto out;
1154 }
1155
1156 if (netdev_is_pmd(netdev)) {
1157 int n_cores = ovs_numa_get_n_cores();
1158
1159 if (n_cores == OVS_CORE_UNSPEC) {
1160 VLOG_ERR("%s, cannot get cpu core info", devname);
1161 error = ENOENT;
1162 goto out;
1163 }
1164 /* There can only be ovs_numa_get_n_cores() pmd threads,
1165 * so create a txq for each of them, plus one extra for the
1166 * non-pmd threads. */
1167 error = netdev_set_multiq(netdev, n_cores + 1,
1168 netdev_requested_n_rxq(netdev));
1169 if (error && (error != EOPNOTSUPP)) {
1170 VLOG_ERR("%s, cannot set multiq", devname);
1171 goto out;
1172 }
1173 }
1174 port = xzalloc(sizeof *port);
1175 port->port_no = port_no;
1176 port->netdev = netdev;
1177 port->n_rxq = netdev_n_rxq(netdev);
1178 port->rxq = xcalloc(port->n_rxq, sizeof *port->rxq);
1179 port->type = xstrdup(type);
1180 port->latest_requested_n_rxq = netdev_requested_n_rxq(netdev);
1181
1182 for (i = 0; i < port->n_rxq; i++) {
1183 error = netdev_rxq_open(netdev, &port->rxq[i], i);
1184 if (error) {
1185 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
1186 devname, ovs_strerror(error));
1187 goto out_rxq_close;
1188 }
1189 n_open_rxqs++;
1190 }
1191
1192 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1193 if (error) {
1194 goto out_rxq_close;
1195 }
1196 port->sf = sf;
1197
1198 *portp = port;
1199
1200 return 0;
1201
1202 out_rxq_close:
1203 for (i = 0; i < n_open_rxqs; i++) {
1204 netdev_rxq_close(port->rxq[i]);
1205 }
1206 free(port->type);
1207 free(port->rxq);
1208 free(port);
1209
1210 out:
1211 netdev_close(netdev);
1212 return error;
1213 }
1214
1215 static int
1216 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1217 odp_port_t port_no)
1218 OVS_REQUIRES(dp->port_mutex)
1219 {
1220 struct dp_netdev_port *port;
1221 int error;
1222
1223 /* Reject devices already in 'dp'. */
1224 if (!get_port_by_name(dp, devname, &port)) {
1225 return EEXIST;
1226 }
1227
1228 error = port_create(devname, dpif_netdev_port_open_type(dp->class, type),
1229 type, port_no, &port);
1230 if (error) {
1231 return error;
1232 }
1233
1234 if (netdev_is_pmd(port->netdev)) {
1235 int numa_id = netdev_get_numa_id(port->netdev);
1236
1237 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
1238 dp_netdev_set_pmds_on_numa(dp, numa_id);
1239 }
1240
1241 dp_netdev_add_port_to_pmds(dp, port);
1242
1243 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1244 seq_change(dp->port_seq);
1245
1246 return 0;
1247 }
1248
1249 static int
1250 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1251 odp_port_t *port_nop)
1252 {
1253 struct dp_netdev *dp = get_dp_netdev(dpif);
1254 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1255 const char *dpif_port;
1256 odp_port_t port_no;
1257 int error;
1258
1259 ovs_mutex_lock(&dp->port_mutex);
1260 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1261 if (*port_nop != ODPP_NONE) {
1262 port_no = *port_nop;
1263 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1264 } else {
1265 port_no = choose_port(dp, dpif_port);
1266 error = port_no == ODPP_NONE ? EFBIG : 0;
1267 }
1268 if (!error) {
1269 *port_nop = port_no;
1270 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1271 }
1272 ovs_mutex_unlock(&dp->port_mutex);
1273
1274 return error;
1275 }
1276
1277 static int
1278 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1279 {
1280 struct dp_netdev *dp = get_dp_netdev(dpif);
1281 int error;
1282
1283 ovs_mutex_lock(&dp->port_mutex);
1284 if (port_no == ODPP_LOCAL) {
1285 error = EINVAL;
1286 } else {
1287 struct dp_netdev_port *port;
1288
1289 error = get_port_by_number(dp, port_no, &port);
1290 if (!error) {
1291 do_del_port(dp, port);
1292 }
1293 }
1294 ovs_mutex_unlock(&dp->port_mutex);
1295
1296 return error;
1297 }
1298
1299 static bool
1300 is_valid_port_number(odp_port_t port_no)
1301 {
1302 return port_no != ODPP_NONE;
1303 }
1304
1305 static struct dp_netdev_port *
1306 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1307 OVS_REQUIRES(dp->port_mutex)
1308 {
1309 struct dp_netdev_port *port;
1310
1311 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1312 if (port->port_no == port_no) {
1313 return port;
1314 }
1315 }
1316 return NULL;
1317 }
1318
1319 static int
1320 get_port_by_number(struct dp_netdev *dp,
1321 odp_port_t port_no, struct dp_netdev_port **portp)
1322 OVS_REQUIRES(dp->port_mutex)
1323 {
1324 if (!is_valid_port_number(port_no)) {
1325 *portp = NULL;
1326 return EINVAL;
1327 } else {
1328 *portp = dp_netdev_lookup_port(dp, port_no);
1329 return *portp ? 0 : ENOENT;
1330 }
1331 }
1332
1333 static void
1334 port_destroy(struct dp_netdev_port *port)
1335 {
1336 if (!port) {
1337 return;
1338 }
1339
1340 netdev_close(port->netdev);
1341 netdev_restore_flags(port->sf);
1342
1343 for (unsigned i = 0; i < port->n_rxq; i++) {
1344 netdev_rxq_close(port->rxq[i]);
1345 }
1346
1347 free(port->rxq);
1348 free(port->type);
1349 free(port);
1350 }
1351
1352 static int
1353 get_port_by_name(struct dp_netdev *dp,
1354 const char *devname, struct dp_netdev_port **portp)
1355 OVS_REQUIRES(dp->port_mutex)
1356 {
1357 struct dp_netdev_port *port;
1358
1359 HMAP_FOR_EACH (port, node, &dp->ports) {
1360 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1361 *portp = port;
1362 return 0;
1363 }
1364 }
1365 return ENOENT;
1366 }
1367
1368 static int
1369 get_n_pmd_threads(struct dp_netdev *dp)
1370 {
1371 /* There is one non pmd thread in dp->poll_threads */
1372 return cmap_count(&dp->poll_threads) - 1;
1373 }
1374
1375 static int
1376 get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id)
1377 {
1378 struct dp_netdev_pmd_thread *pmd;
1379 int n_pmds = 0;
1380
1381 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1382 if (pmd->numa_id == numa_id) {
1383 n_pmds++;
1384 }
1385 }
1386
1387 return n_pmds;
1388 }
1389
1390 /* Returns 'true' if there is a port with pmd netdev and the netdev
1391 * is on numa node 'numa_id'. */
1392 static bool
1393 has_pmd_port_for_numa(struct dp_netdev *dp, int numa_id)
1394 OVS_REQUIRES(dp->port_mutex)
1395 {
1396 struct dp_netdev_port *port;
1397
1398 HMAP_FOR_EACH (port, node, &dp->ports) {
1399 if (netdev_is_pmd(port->netdev)
1400 && netdev_get_numa_id(port->netdev) == numa_id) {
1401 return true;
1402 }
1403 }
1404
1405 return false;
1406 }
1407
1408
1409 static void
1410 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1411 OVS_REQUIRES(dp->port_mutex)
1412 {
1413 hmap_remove(&dp->ports, &port->node);
1414 seq_change(dp->port_seq);
1415
1416 dp_netdev_del_port_from_all_pmds(dp, port);
1417
1418 if (netdev_is_pmd(port->netdev)) {
1419 int numa_id = netdev_get_numa_id(port->netdev);
1420
1421 /* PMD threads can not be on invalid numa node. */
1422 ovs_assert(ovs_numa_numa_id_is_valid(numa_id));
1423 /* If there is no netdev on the numa node, delete the pmd threads
1424 * for that numa node. */
1425 if (!has_pmd_port_for_numa(dp, numa_id)) {
1426 dp_netdev_del_pmds_on_numa(dp, numa_id);
1427 }
1428 }
1429
1430 port_destroy(port);
1431 }
1432
1433 static void
1434 answer_port_query(const struct dp_netdev_port *port,
1435 struct dpif_port *dpif_port)
1436 {
1437 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1438 dpif_port->type = xstrdup(port->type);
1439 dpif_port->port_no = port->port_no;
1440 }
1441
1442 static int
1443 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1444 struct dpif_port *dpif_port)
1445 {
1446 struct dp_netdev *dp = get_dp_netdev(dpif);
1447 struct dp_netdev_port *port;
1448 int error;
1449
1450 ovs_mutex_lock(&dp->port_mutex);
1451 error = get_port_by_number(dp, port_no, &port);
1452 if (!error && dpif_port) {
1453 answer_port_query(port, dpif_port);
1454 }
1455 ovs_mutex_unlock(&dp->port_mutex);
1456
1457 return error;
1458 }
1459
1460 static int
1461 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1462 struct dpif_port *dpif_port)
1463 {
1464 struct dp_netdev *dp = get_dp_netdev(dpif);
1465 struct dp_netdev_port *port;
1466 int error;
1467
1468 ovs_mutex_lock(&dp->port_mutex);
1469 error = get_port_by_name(dp, devname, &port);
1470 if (!error && dpif_port) {
1471 answer_port_query(port, dpif_port);
1472 }
1473 ovs_mutex_unlock(&dp->port_mutex);
1474
1475 return error;
1476 }
1477
1478 static void
1479 dp_netdev_flow_free(struct dp_netdev_flow *flow)
1480 {
1481 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1482 free(flow);
1483 }
1484
1485 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1486 {
1487 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1488 ovsrcu_postpone(dp_netdev_flow_free, flow);
1489 }
1490 }
1491
1492 static uint32_t
1493 dp_netdev_flow_hash(const ovs_u128 *ufid)
1494 {
1495 return ufid->u32[0];
1496 }
1497
1498 static void
1499 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1500 struct dp_netdev_flow *flow)
1501 OVS_REQUIRES(pmd->flow_mutex)
1502 {
1503 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
1504
1505 dpcls_remove(&pmd->cls, &flow->cr);
1506 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
1507 flow->dead = true;
1508
1509 dp_netdev_flow_unref(flow);
1510 }
1511
1512 static void
1513 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
1514 {
1515 struct dp_netdev_flow *netdev_flow;
1516
1517 ovs_mutex_lock(&pmd->flow_mutex);
1518 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1519 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
1520 }
1521 ovs_mutex_unlock(&pmd->flow_mutex);
1522 }
1523
1524 static int
1525 dpif_netdev_flow_flush(struct dpif *dpif)
1526 {
1527 struct dp_netdev *dp = get_dp_netdev(dpif);
1528 struct dp_netdev_pmd_thread *pmd;
1529
1530 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1531 dp_netdev_pmd_flow_flush(pmd);
1532 }
1533
1534 return 0;
1535 }
1536
1537 struct dp_netdev_port_state {
1538 struct hmap_position position;
1539 char *name;
1540 };
1541
1542 static int
1543 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1544 {
1545 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1546 return 0;
1547 }
1548
1549 static int
1550 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
1551 struct dpif_port *dpif_port)
1552 {
1553 struct dp_netdev_port_state *state = state_;
1554 struct dp_netdev *dp = get_dp_netdev(dpif);
1555 struct hmap_node *node;
1556 int retval;
1557
1558 ovs_mutex_lock(&dp->port_mutex);
1559 node = hmap_at_position(&dp->ports, &state->position);
1560 if (node) {
1561 struct dp_netdev_port *port;
1562
1563 port = CONTAINER_OF(node, struct dp_netdev_port, node);
1564
1565 free(state->name);
1566 state->name = xstrdup(netdev_get_name(port->netdev));
1567 dpif_port->name = state->name;
1568 dpif_port->type = port->type;
1569 dpif_port->port_no = port->port_no;
1570
1571 retval = 0;
1572 } else {
1573 retval = EOF;
1574 }
1575 ovs_mutex_unlock(&dp->port_mutex);
1576
1577 return retval;
1578 }
1579
1580 static int
1581 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
1582 {
1583 struct dp_netdev_port_state *state = state_;
1584 free(state->name);
1585 free(state);
1586 return 0;
1587 }
1588
1589 static int
1590 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
1591 {
1592 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1593 uint64_t new_port_seq;
1594 int error;
1595
1596 new_port_seq = seq_read(dpif->dp->port_seq);
1597 if (dpif->last_port_seq != new_port_seq) {
1598 dpif->last_port_seq = new_port_seq;
1599 error = ENOBUFS;
1600 } else {
1601 error = EAGAIN;
1602 }
1603
1604 return error;
1605 }
1606
1607 static void
1608 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1609 {
1610 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1611
1612 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
1613 }
1614
1615 static struct dp_netdev_flow *
1616 dp_netdev_flow_cast(const struct dpcls_rule *cr)
1617 {
1618 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
1619 }
1620
1621 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1622 {
1623 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1624 }
1625
1626 /* netdev_flow_key utilities.
1627 *
1628 * netdev_flow_key is basically a miniflow. We use these functions
1629 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1630 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1631 *
1632 * - Since we are dealing exclusively with miniflows created by
1633 * miniflow_extract(), if the map is different the miniflow is different.
1634 * Therefore we can be faster by comparing the map and the miniflow in a
1635 * single memcmp().
1636 * - These functions can be inlined by the compiler. */
1637
1638 /* Given the number of bits set in miniflow's maps, returns the size of the
1639 * 'netdev_flow_key.mf' */
1640 static inline size_t
1641 netdev_flow_key_size(size_t flow_u64s)
1642 {
1643 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
1644 }
1645
1646 static inline bool
1647 netdev_flow_key_equal(const struct netdev_flow_key *a,
1648 const struct netdev_flow_key *b)
1649 {
1650 /* 'b->len' may not be set yet. */
1651 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
1652 }
1653
1654 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
1655 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
1656 * generated by miniflow_extract. */
1657 static inline bool
1658 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1659 const struct miniflow *mf)
1660 {
1661 return !memcmp(&key->mf, mf, key->len);
1662 }
1663
1664 static inline void
1665 netdev_flow_key_clone(struct netdev_flow_key *dst,
1666 const struct netdev_flow_key *src)
1667 {
1668 memcpy(dst, src,
1669 offsetof(struct netdev_flow_key, mf) + src->len);
1670 }
1671
1672 /* Slow. */
1673 static void
1674 netdev_flow_key_from_flow(struct netdev_flow_key *dst,
1675 const struct flow *src)
1676 {
1677 struct dp_packet packet;
1678 uint64_t buf_stub[512 / 8];
1679
1680 dp_packet_use_stub(&packet, buf_stub, sizeof buf_stub);
1681 pkt_metadata_from_flow(&packet.md, src);
1682 flow_compose(&packet, src);
1683 miniflow_extract(&packet, &dst->mf);
1684 dp_packet_uninit(&packet);
1685
1686 dst->len = netdev_flow_key_size(miniflow_n_values(&dst->mf));
1687 dst->hash = 0; /* Not computed yet. */
1688 }
1689
1690 /* Initialize a netdev_flow_key 'mask' from 'match'. */
1691 static inline void
1692 netdev_flow_mask_init(struct netdev_flow_key *mask,
1693 const struct match *match)
1694 {
1695 uint64_t *dst = miniflow_values(&mask->mf);
1696 struct flowmap fmap;
1697 uint32_t hash = 0;
1698 size_t idx;
1699
1700 /* Only check masks that make sense for the flow. */
1701 flow_wc_map(&match->flow, &fmap);
1702 flowmap_init(&mask->mf.map);
1703
1704 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1705 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
1706
1707 if (mask_u64) {
1708 flowmap_set(&mask->mf.map, idx, 1);
1709 *dst++ = mask_u64;
1710 hash = hash_add64(hash, mask_u64);
1711 }
1712 }
1713
1714 map_t map;
1715
1716 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1717 hash = hash_add64(hash, map);
1718 }
1719
1720 size_t n = dst - miniflow_get_values(&mask->mf);
1721
1722 mask->hash = hash_finish(hash, n * 8);
1723 mask->len = netdev_flow_key_size(n);
1724 }
1725
1726 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
1727 static inline void
1728 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1729 const struct flow *flow,
1730 const struct netdev_flow_key *mask)
1731 {
1732 uint64_t *dst_u64 = miniflow_values(&dst->mf);
1733 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
1734 uint32_t hash = 0;
1735 uint64_t value;
1736
1737 dst->len = mask->len;
1738 dst->mf = mask->mf; /* Copy maps. */
1739
1740 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
1741 *dst_u64 = value & *mask_u64++;
1742 hash = hash_add64(hash, *dst_u64++);
1743 }
1744 dst->hash = hash_finish(hash,
1745 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
1746 }
1747
1748 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
1749 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
1750 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
1751
1752 /* Returns a hash value for the bits of 'key' where there are 1-bits in
1753 * 'mask'. */
1754 static inline uint32_t
1755 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
1756 const struct netdev_flow_key *mask)
1757 {
1758 const uint64_t *p = miniflow_get_values(&mask->mf);
1759 uint32_t hash = 0;
1760 uint64_t value;
1761
1762 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
1763 hash = hash_add64(hash, value & *p++);
1764 }
1765
1766 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
1767 }
1768
1769 static inline bool
1770 emc_entry_alive(struct emc_entry *ce)
1771 {
1772 return ce->flow && !ce->flow->dead;
1773 }
1774
1775 static void
1776 emc_clear_entry(struct emc_entry *ce)
1777 {
1778 if (ce->flow) {
1779 dp_netdev_flow_unref(ce->flow);
1780 ce->flow = NULL;
1781 }
1782 }
1783
1784 static inline void
1785 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
1786 const struct netdev_flow_key *key)
1787 {
1788 if (ce->flow != flow) {
1789 if (ce->flow) {
1790 dp_netdev_flow_unref(ce->flow);
1791 }
1792
1793 if (dp_netdev_flow_ref(flow)) {
1794 ce->flow = flow;
1795 } else {
1796 ce->flow = NULL;
1797 }
1798 }
1799 if (key) {
1800 netdev_flow_key_clone(&ce->key, key);
1801 }
1802 }
1803
1804 static inline void
1805 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
1806 struct dp_netdev_flow *flow)
1807 {
1808 struct emc_entry *to_be_replaced = NULL;
1809 struct emc_entry *current_entry;
1810
1811 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1812 if (netdev_flow_key_equal(&current_entry->key, key)) {
1813 /* We found the entry with the 'mf' miniflow */
1814 emc_change_entry(current_entry, flow, NULL);
1815 return;
1816 }
1817
1818 /* Replacement policy: put the flow in an empty (not alive) entry, or
1819 * in the first entry where it can be */
1820 if (!to_be_replaced
1821 || (emc_entry_alive(to_be_replaced)
1822 && !emc_entry_alive(current_entry))
1823 || current_entry->key.hash < to_be_replaced->key.hash) {
1824 to_be_replaced = current_entry;
1825 }
1826 }
1827 /* We didn't find the miniflow in the cache.
1828 * The 'to_be_replaced' entry is where the new flow will be stored */
1829
1830 emc_change_entry(to_be_replaced, flow, key);
1831 }
1832
1833 static inline struct dp_netdev_flow *
1834 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
1835 {
1836 struct emc_entry *current_entry;
1837
1838 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
1839 if (current_entry->key.hash == key->hash
1840 && emc_entry_alive(current_entry)
1841 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
1842
1843 /* We found the entry with the 'key->mf' miniflow */
1844 return current_entry->flow;
1845 }
1846 }
1847
1848 return NULL;
1849 }
1850
1851 static struct dp_netdev_flow *
1852 dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
1853 const struct netdev_flow_key *key)
1854 {
1855 struct dp_netdev_flow *netdev_flow;
1856 struct dpcls_rule *rule;
1857
1858 dpcls_lookup(&pmd->cls, key, &rule, 1);
1859 netdev_flow = dp_netdev_flow_cast(rule);
1860
1861 return netdev_flow;
1862 }
1863
1864 static struct dp_netdev_flow *
1865 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
1866 const ovs_u128 *ufidp, const struct nlattr *key,
1867 size_t key_len)
1868 {
1869 struct dp_netdev_flow *netdev_flow;
1870 struct flow flow;
1871 ovs_u128 ufid;
1872
1873 /* If a UFID is not provided, determine one based on the key. */
1874 if (!ufidp && key && key_len
1875 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow)) {
1876 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
1877 ufidp = &ufid;
1878 }
1879
1880 if (ufidp) {
1881 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
1882 &pmd->flow_table) {
1883 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
1884 return netdev_flow;
1885 }
1886 }
1887 }
1888
1889 return NULL;
1890 }
1891
1892 static void
1893 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
1894 struct dpif_flow_stats *stats)
1895 {
1896 struct dp_netdev_flow *netdev_flow;
1897 unsigned long long n;
1898 long long used;
1899 uint16_t flags;
1900
1901 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
1902
1903 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
1904 stats->n_packets = n;
1905 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
1906 stats->n_bytes = n;
1907 atomic_read_relaxed(&netdev_flow->stats.used, &used);
1908 stats->used = used;
1909 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
1910 stats->tcp_flags = flags;
1911 }
1912
1913 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
1914 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
1915 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
1916 * protect them. */
1917 static void
1918 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
1919 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
1920 struct dpif_flow *flow, bool terse)
1921 {
1922 if (terse) {
1923 memset(flow, 0, sizeof *flow);
1924 } else {
1925 struct flow_wildcards wc;
1926 struct dp_netdev_actions *actions;
1927 size_t offset;
1928 struct odp_flow_key_parms odp_parms = {
1929 .flow = &netdev_flow->flow,
1930 .mask = &wc.masks,
1931 .support = dp_netdev_support,
1932 };
1933
1934 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
1935
1936 /* Key */
1937 offset = key_buf->size;
1938 flow->key = ofpbuf_tail(key_buf);
1939 odp_parms.odp_in_port = netdev_flow->flow.in_port.odp_port;
1940 odp_flow_key_from_flow(&odp_parms, key_buf);
1941 flow->key_len = key_buf->size - offset;
1942
1943 /* Mask */
1944 offset = mask_buf->size;
1945 flow->mask = ofpbuf_tail(mask_buf);
1946 odp_parms.odp_in_port = wc.masks.in_port.odp_port;
1947 odp_parms.key_buf = key_buf;
1948 odp_flow_key_from_mask(&odp_parms, mask_buf);
1949 flow->mask_len = mask_buf->size - offset;
1950
1951 /* Actions */
1952 actions = dp_netdev_flow_get_actions(netdev_flow);
1953 flow->actions = actions->actions;
1954 flow->actions_len = actions->size;
1955 }
1956
1957 flow->ufid = netdev_flow->ufid;
1958 flow->ufid_present = true;
1959 flow->pmd_id = netdev_flow->pmd_id;
1960 get_dpif_flow_stats(netdev_flow, &flow->stats);
1961 }
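/* Editor's note (illustrative sketch, not part of the original file): callers
 * such as dpif_netdev_flow_dump_next() below typically back 'key_buf' and
 * 'mask_buf' with on-stack odputil_keybufs:
 *
 *     struct odputil_keybuf keybuf, maskbuf;
 *     struct ofpbuf key, mask;
 *     struct dpif_flow f;
 *
 *     ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
 *     ofpbuf_use_stack(&mask, &maskbuf, sizeof maskbuf);
 *     dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, &f, false);
 */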
1962
1963 static int
1964 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1965 const struct nlattr *mask_key,
1966 uint32_t mask_key_len, const struct flow *flow,
1967 struct flow_wildcards *wc)
1968 {
1969 enum odp_key_fitness fitness;
1970
1971 fitness = odp_flow_key_to_mask_udpif(mask_key, mask_key_len, key,
1972 key_len, wc, flow);
1973 if (fitness) {
1974 /* This should not happen: it indicates that
1975 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
1976 * disagree on the acceptable form of a mask. Log the problem
1977 * as an error, with enough details to enable debugging. */
1978 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1979
1980 if (!VLOG_DROP_ERR(&rl)) {
1981 struct ds s;
1982
1983 ds_init(&s);
1984 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
1985 true);
1986 VLOG_ERR("internal error parsing flow mask %s (%s)",
1987 ds_cstr(&s), odp_key_fitness_to_string(fitness));
1988 ds_destroy(&s);
1989 }
1990
1991 return EINVAL;
1992 }
1993
1994 return 0;
1995 }
1996
1997 static int
1998 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
1999 struct flow *flow)
2000 {
2001 odp_port_t in_port;
2002
2003 if (odp_flow_key_to_flow_udpif(key, key_len, flow)) {
2004 /* This should not happen: it indicates that odp_flow_key_from_flow()
2005 * and odp_flow_key_to_flow() disagree on the acceptable form of a
2006 * flow. Log the problem as an error, with enough details to enable
2007 * debugging. */
2008 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2009
2010 if (!VLOG_DROP_ERR(&rl)) {
2011 struct ds s;
2012
2013 ds_init(&s);
2014 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2015 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2016 ds_destroy(&s);
2017 }
2018
2019 return EINVAL;
2020 }
2021
2022 in_port = flow->in_port.odp_port;
2023 if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
2024 return EINVAL;
2025 }
2026
2027 /* Userspace datapath doesn't support conntrack. */
2028 if (flow->ct_state || flow->ct_zone || flow->ct_mark
2029 || !ovs_u128_is_zero(flow->ct_label)) {
2030 return EINVAL;
2031 }
2032
2033 return 0;
2034 }
2035
2036 static int
2037 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2038 {
2039 struct dp_netdev *dp = get_dp_netdev(dpif);
2040 struct dp_netdev_flow *netdev_flow;
2041 struct dp_netdev_pmd_thread *pmd;
2042 unsigned pmd_id = get->pmd_id == PMD_ID_NULL
2043 ? NON_PMD_CORE_ID : get->pmd_id;
2044 int error = 0;
2045
2046 pmd = dp_netdev_get_pmd(dp, pmd_id);
2047 if (!pmd) {
2048 return EINVAL;
2049 }
2050
2051 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2052 get->key_len);
2053 if (netdev_flow) {
2054 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2055 get->flow, false);
2056 } else {
2057 error = ENOENT;
2058 }
2059 dp_netdev_pmd_unref(pmd);
2060
2061
2062 return error;
2063 }
2064
2065 static struct dp_netdev_flow *
2066 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2067 struct match *match, const ovs_u128 *ufid,
2068 const struct nlattr *actions, size_t actions_len)
2069 OVS_REQUIRES(pmd->flow_mutex)
2070 {
2071 struct dp_netdev_flow *flow;
2072 struct netdev_flow_key mask;
2073
2074 netdev_flow_mask_init(&mask, match);
2075 /* Make sure wc does not have metadata. */
2076 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2077 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2078
2079 /* Do not allocate extra space. */
2080 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2081 memset(&flow->stats, 0, sizeof flow->stats);
2082 flow->dead = false;
2083 flow->batch = NULL;
2084 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2085 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2086 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2087 ovs_refcount_init(&flow->ref_cnt);
2088 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2089
2090 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2091 dpcls_insert(&pmd->cls, &flow->cr, &mask);
2092
2093 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2094 dp_netdev_flow_hash(&flow->ufid));
2095
2096 if (OVS_UNLIKELY(VLOG_IS_DBG_ENABLED())) {
2097 struct match match;
2098 struct ds ds = DS_EMPTY_INITIALIZER;
2099
2100 match.tun_md.valid = false;
2101 match.flow = flow->flow;
2102 miniflow_expand(&flow->cr.mask->mf, &match.wc.masks);
2103
2104 ds_put_cstr(&ds, "flow_add: ");
2105 odp_format_ufid(ufid, &ds);
2106 ds_put_cstr(&ds, " ");
2107 match_format(&match, &ds, OFP_DEFAULT_PRIORITY);
2108 ds_put_cstr(&ds, ", actions:");
2109 format_odp_actions(&ds, actions, actions_len);
2110
2111 VLOG_DBG_RL(&upcall_rl, "%s", ds_cstr(&ds));
2112
2113 ds_destroy(&ds);
2114 }
2115
2116 return flow;
2117 }
2118
2119 static int
2120 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2121 {
2122 struct dp_netdev *dp = get_dp_netdev(dpif);
2123 struct dp_netdev_flow *netdev_flow;
2124 struct netdev_flow_key key;
2125 struct dp_netdev_pmd_thread *pmd;
2126 struct match match;
2127 ovs_u128 ufid;
2128 unsigned pmd_id = put->pmd_id == PMD_ID_NULL
2129 ? NON_PMD_CORE_ID : put->pmd_id;
2130 int error;
2131
2132 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow);
2133 if (error) {
2134 return error;
2135 }
2136 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2137 put->mask, put->mask_len,
2138 &match.flow, &match.wc);
2139 if (error) {
2140 return error;
2141 }
2142
2143 pmd = dp_netdev_get_pmd(dp, pmd_id);
2144 if (!pmd) {
2145 return EINVAL;
2146 }
2147
2148 /* Must produce a netdev_flow_key for lookup.
2149 * This interface is no longer performance critical, since it is not used
2150 * for upcall processing any more. */
2151 netdev_flow_key_from_flow(&key, &match.flow);
2152
2153 if (put->ufid) {
2154 ufid = *put->ufid;
2155 } else {
2156 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2157 }
2158
2159 ovs_mutex_lock(&pmd->flow_mutex);
2160 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &key);
2161 if (!netdev_flow) {
2162 if (put->flags & DPIF_FP_CREATE) {
2163 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2164 if (put->stats) {
2165 memset(put->stats, 0, sizeof *put->stats);
2166 }
2167 dp_netdev_flow_add(pmd, &match, &ufid, put->actions,
2168 put->actions_len);
2169 error = 0;
2170 } else {
2171 error = EFBIG;
2172 }
2173 } else {
2174 error = ENOENT;
2175 }
2176 } else {
2177 if (put->flags & DPIF_FP_MODIFY
2178 && flow_equal(&match.flow, &netdev_flow->flow)) {
2179 struct dp_netdev_actions *new_actions;
2180 struct dp_netdev_actions *old_actions;
2181
2182 new_actions = dp_netdev_actions_create(put->actions,
2183 put->actions_len);
2184
2185 old_actions = dp_netdev_flow_get_actions(netdev_flow);
2186 ovsrcu_set(&netdev_flow->actions, new_actions);
2187
2188 if (put->stats) {
2189 get_dpif_flow_stats(netdev_flow, put->stats);
2190 }
2191 if (put->flags & DPIF_FP_ZERO_STATS) {
2192 /* XXX: The userspace datapath uses thread local statistics
2193 * (for flows), which should be updated only by the owning
2194 * thread. Since we cannot write on stats memory here,
2195 * we choose not to support this flag. Please note:
2196 * - This feature is currently used only by dpctl commands with
2197 * option --clear.
2198 * - Should the need arise, this operation can be implemented
2199                  * by keeping a base value (to be updated here) for each
2200 * counter, and subtracting it before outputting the stats */
2201 error = EOPNOTSUPP;
2202 }
2203
2204 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2205 } else if (put->flags & DPIF_FP_CREATE) {
2206 error = EEXIST;
2207 } else {
2208 /* Overlapping flow. */
2209 error = EINVAL;
2210 }
2211 }
2212 ovs_mutex_unlock(&pmd->flow_mutex);
2213 dp_netdev_pmd_unref(pmd);
2214
2215 return error;
2216 }
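/* Editor's note, summarizing the error mapping implemented above: a lookup
 * miss with DPIF_FP_CREATE adds the flow (EFBIG once MAX_FLOWS is reached)
 * and a miss without it returns ENOENT; a hit with DPIF_FP_MODIFY and an
 * identical unmasked flow swaps in the new actions (EOPNOTSUPP if
 * DPIF_FP_ZERO_STATS is also requested), a hit with DPIF_FP_CREATE returns
 * EEXIST, and any other hit is treated as an overlapping flow (EINVAL). */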
2217
2218 static int
2219 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
2220 {
2221 struct dp_netdev *dp = get_dp_netdev(dpif);
2222 struct dp_netdev_flow *netdev_flow;
2223 struct dp_netdev_pmd_thread *pmd;
2224 unsigned pmd_id = del->pmd_id == PMD_ID_NULL
2225 ? NON_PMD_CORE_ID : del->pmd_id;
2226 int error = 0;
2227
2228 pmd = dp_netdev_get_pmd(dp, pmd_id);
2229 if (!pmd) {
2230 return EINVAL;
2231 }
2232
2233 ovs_mutex_lock(&pmd->flow_mutex);
2234 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2235 del->key_len);
2236 if (netdev_flow) {
2237 if (del->stats) {
2238 get_dpif_flow_stats(netdev_flow, del->stats);
2239 }
2240 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2241 } else {
2242 error = ENOENT;
2243 }
2244 ovs_mutex_unlock(&pmd->flow_mutex);
2245 dp_netdev_pmd_unref(pmd);
2246
2247 return error;
2248 }
2249
2250 struct dpif_netdev_flow_dump {
2251 struct dpif_flow_dump up;
2252 struct cmap_position poll_thread_pos;
2253 struct cmap_position flow_pos;
2254 struct dp_netdev_pmd_thread *cur_pmd;
2255 int status;
2256 struct ovs_mutex mutex;
2257 };
2258
2259 static struct dpif_netdev_flow_dump *
2260 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
2261 {
2262 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
2263 }
2264
2265 static struct dpif_flow_dump *
2266 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse)
2267 {
2268 struct dpif_netdev_flow_dump *dump;
2269
2270 dump = xzalloc(sizeof *dump);
2271 dpif_flow_dump_init(&dump->up, dpif_);
2272 dump->up.terse = terse;
2273 ovs_mutex_init(&dump->mutex);
2274
2275 return &dump->up;
2276 }
2277
2278 static int
2279 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
2280 {
2281 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2282
2283 ovs_mutex_destroy(&dump->mutex);
2284 free(dump);
2285 return 0;
2286 }
2287
2288 struct dpif_netdev_flow_dump_thread {
2289 struct dpif_flow_dump_thread up;
2290 struct dpif_netdev_flow_dump *dump;
2291 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2292 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
2293 };
2294
2295 static struct dpif_netdev_flow_dump_thread *
2296 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2297 {
2298 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2299 }
2300
2301 static struct dpif_flow_dump_thread *
2302 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2303 {
2304 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2305 struct dpif_netdev_flow_dump_thread *thread;
2306
2307 thread = xmalloc(sizeof *thread);
2308 dpif_flow_dump_thread_init(&thread->up, &dump->up);
2309 thread->dump = dump;
2310 return &thread->up;
2311 }
2312
2313 static void
2314 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
2315 {
2316 struct dpif_netdev_flow_dump_thread *thread
2317 = dpif_netdev_flow_dump_thread_cast(thread_);
2318
2319 free(thread);
2320 }
2321
2322 static int
2323 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
2324 struct dpif_flow *flows, int max_flows)
2325 {
2326 struct dpif_netdev_flow_dump_thread *thread
2327 = dpif_netdev_flow_dump_thread_cast(thread_);
2328 struct dpif_netdev_flow_dump *dump = thread->dump;
2329 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
2330 int n_flows = 0;
2331 int i;
2332
2333 ovs_mutex_lock(&dump->mutex);
2334 if (!dump->status) {
2335 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2336 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2337 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2338 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2339
2340         /* On the first call to dump_next(), extract the first pmd thread.
2341          * If there is no pmd thread, return immediately. */
2342 if (!pmd) {
2343 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2344 if (!pmd) {
2345 ovs_mutex_unlock(&dump->mutex);
2346 return n_flows;
2347
2348 }
2349 }
2350
2351 do {
2352 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2353 struct cmap_node *node;
2354
2355 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2356 if (!node) {
2357 break;
2358 }
2359 netdev_flows[n_flows] = CONTAINER_OF(node,
2360 struct dp_netdev_flow,
2361 node);
2362 }
2363             /* When we finish dumping the current pmd thread, move on to
2364              * the next one. */
2365 if (n_flows < flow_limit) {
2366 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2367 dp_netdev_pmd_unref(pmd);
2368 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2369 if (!pmd) {
2370 dump->status = EOF;
2371 break;
2372 }
2373 }
2374             /* Keep the reference for the next caller. */
2375 dump->cur_pmd = pmd;
2376
2377             /* If the current dump is empty, do not exit the loop, since the
2378              * remaining pmds could still have flows to dump.  Just dump again
2379              * from the new 'pmd'. */
2380 } while (!n_flows);
2381 }
2382 ovs_mutex_unlock(&dump->mutex);
2383
2384 for (i = 0; i < n_flows; i++) {
2385 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2386 struct odputil_keybuf *keybuf = &thread->keybuf[i];
2387 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2388 struct dpif_flow *f = &flows[i];
2389 struct ofpbuf key, mask;
2390
2391 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2392 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
2393 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2394 dump->up.terse);
2395 }
2396
2397 return n_flows;
2398 }
2399
2400 static int
2401 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
2402 OVS_NO_THREAD_SAFETY_ANALYSIS
2403 {
2404 struct dp_netdev *dp = get_dp_netdev(dpif);
2405 struct dp_netdev_pmd_thread *pmd;
2406 struct dp_packet_batch pp;
2407
2408 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2409 dp_packet_size(execute->packet) > UINT16_MAX) {
2410 return EINVAL;
2411 }
2412
2413     /* Try to find the 'pmd'.  If NULL is returned, the current
2414      * thread is a non-pmd thread and should use
2415      * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
2416 pmd = ovsthread_getspecific(dp->per_pmd_key);
2417 if (!pmd) {
2418 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
2419 }
2420
2421     /* If the current thread is a non-pmd thread, acquire
2422      * the 'non_pmd_mutex'. */
2423 if (pmd->core_id == NON_PMD_CORE_ID) {
2424 ovs_mutex_lock(&dp->non_pmd_mutex);
2425 }
2426
2427 /* The action processing expects the RSS hash to be valid, because
2428 * it's always initialized at the beginning of datapath processing.
2429 * In this case, though, 'execute->packet' may not have gone through
2430 * the datapath at all, it may have been generated by the upper layer
2431 * (OpenFlow packet-out, BFD frame, ...). */
2432 if (!dp_packet_rss_valid(execute->packet)) {
2433 dp_packet_set_rss_hash(execute->packet,
2434 flow_hash_5tuple(execute->flow, 0));
2435 }
2436
2437 packet_batch_init_packet(&pp, execute->packet);
2438 dp_netdev_execute_actions(pmd, &pp, false, execute->actions,
2439 execute->actions_len);
2440
2441 if (pmd->core_id == NON_PMD_CORE_ID) {
2442 ovs_mutex_unlock(&dp->non_pmd_mutex);
2443 dp_netdev_pmd_unref(pmd);
2444 }
2445
2446 return 0;
2447 }
2448
2449 static void
2450 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2451 {
2452 size_t i;
2453
2454 for (i = 0; i < n_ops; i++) {
2455 struct dpif_op *op = ops[i];
2456
2457 switch (op->type) {
2458 case DPIF_OP_FLOW_PUT:
2459 op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2460 break;
2461
2462 case DPIF_OP_FLOW_DEL:
2463 op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2464 break;
2465
2466 case DPIF_OP_EXECUTE:
2467 op->error = dpif_netdev_execute(dpif, &op->u.execute);
2468 break;
2469
2470 case DPIF_OP_FLOW_GET:
2471 op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2472 break;
2473 }
2474 }
2475 }
2476
2477 /* Returns true if the configuration for rx queues or the cpu mask
2478  * has changed. */
2479 static bool
2480 pmd_config_changed(const struct dp_netdev *dp, const char *cmask)
2481 {
2482 struct dp_netdev_port *port;
2483
2484 ovs_mutex_lock(&dp->port_mutex);
2485 HMAP_FOR_EACH (port, node, &dp->ports) {
2486 struct netdev *netdev = port->netdev;
2487 int requested_n_rxq = netdev_requested_n_rxq(netdev);
2488 if (netdev_is_pmd(netdev)
2489 && port->latest_requested_n_rxq != requested_n_rxq) {
2490 ovs_mutex_unlock(&dp->port_mutex);
2491 return true;
2492 }
2493 }
2494 ovs_mutex_unlock(&dp->port_mutex);
2495
2496 if (dp->pmd_cmask != NULL && cmask != NULL) {
2497 return strcmp(dp->pmd_cmask, cmask);
2498 } else {
2499 return (dp->pmd_cmask != NULL || cmask != NULL);
2500 }
2501 }
2502
2503 /* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
2504 static int
2505 dpif_netdev_pmd_set(struct dpif *dpif, const char *cmask)
2506 {
2507 struct dp_netdev *dp = get_dp_netdev(dpif);
2508
2509 if (pmd_config_changed(dp, cmask)) {
2510 struct dp_netdev_port *port;
2511
2512 dp_netdev_destroy_all_pmds(dp);
2513
2514 ovs_mutex_lock(&dp->port_mutex);
2515 HMAP_FOR_EACH (port, node, &dp->ports) {
2516 struct netdev *netdev = port->netdev;
2517 int requested_n_rxq = netdev_requested_n_rxq(netdev);
2518 if (netdev_is_pmd(port->netdev)
2519 && port->latest_requested_n_rxq != requested_n_rxq) {
2520 int i, err;
2521
2522 /* Closes the existing 'rxq's. */
2523 for (i = 0; i < netdev_n_rxq(port->netdev); i++) {
2524 netdev_rxq_close(port->rxq[i]);
2525 port->rxq[i] = NULL;
2526 }
2527 port->n_rxq = 0;
2528
2529 /* Sets the new rx queue config. */
2530 err = netdev_set_multiq(port->netdev,
2531 ovs_numa_get_n_cores() + 1,
2532 requested_n_rxq);
2533 if (err && (err != EOPNOTSUPP)) {
2534 VLOG_ERR("Failed to set dpdk interface %s rx_queue to:"
2535 " %u", netdev_get_name(port->netdev),
2536 requested_n_rxq);
2537 ovs_mutex_unlock(&dp->port_mutex);
2538 return err;
2539 }
2540 port->latest_requested_n_rxq = requested_n_rxq;
2541                 /* If the set_multiq() above succeeded, reopen the 'rxq's. */
2542 port->n_rxq = netdev_n_rxq(port->netdev);
2543 port->rxq = xrealloc(port->rxq, sizeof *port->rxq * port->n_rxq);
2544 for (i = 0; i < port->n_rxq; i++) {
2545 netdev_rxq_open(port->netdev, &port->rxq[i], i);
2546 }
2547 }
2548 }
2549 /* Reconfigures the cpu mask. */
2550 ovs_numa_set_cpu_mask(cmask);
2551 free(dp->pmd_cmask);
2552 dp->pmd_cmask = cmask ? xstrdup(cmask) : NULL;
2553
2554 /* Restores the non-pmd. */
2555 dp_netdev_set_nonpmd(dp);
2556 /* Restores all pmd threads. */
2557 dp_netdev_reset_pmd_threads(dp);
2558 ovs_mutex_unlock(&dp->port_mutex);
2559 }
2560
2561 return 0;
2562 }
2563
2564 static int
2565 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
2566 uint32_t queue_id, uint32_t *priority)
2567 {
2568 *priority = queue_id;
2569 return 0;
2570 }
2571
2572 \f
2573 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
2574  * a copy of the 'size' bytes of 'actions'. */
2575 struct dp_netdev_actions *
2576 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
2577 {
2578 struct dp_netdev_actions *netdev_actions;
2579
2580 netdev_actions = xmalloc(sizeof *netdev_actions + size);
2581 memcpy(netdev_actions->actions, actions, size);
2582 netdev_actions->size = size;
2583
2584 return netdev_actions;
2585 }
2586
2587 struct dp_netdev_actions *
2588 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
2589 {
2590 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
2591 }
2592
2593 static void
2594 dp_netdev_actions_free(struct dp_netdev_actions *actions)
2595 {
2596 free(actions);
2597 }
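/* Editor's note (illustrative sketch, not part of the original file): the
 * RCU-protected lifecycle of a flow's actions, as used by
 * dpif_netdev_flow_put() above:
 *
 *     struct dp_netdev_actions *new = dp_netdev_actions_create(acts, len);
 *     struct dp_netdev_actions *old = dp_netdev_flow_get_actions(flow);
 *
 *     ovsrcu_set(&flow->actions, new);
 *     ovsrcu_postpone(dp_netdev_actions_free, old);
 *
 * Datapath threads that are still using 'old' keep a valid pointer until
 * they quiesce, at which point the postponed free runs. */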
2598 \f
2599 static inline unsigned long long
2600 cycles_counter(void)
2601 {
2602 #ifdef DPDK_NETDEV
2603 return rte_get_tsc_cycles();
2604 #else
2605 return 0;
2606 #endif
2607 }
2608
2609 /* Fake mutex to make sure that the calls to cycles_count_* are balanced */
2610 extern struct ovs_mutex cycles_counter_fake_mutex;
2611
2612 /* Start counting cycles. Must be followed by 'cycles_count_end()' */
2613 static inline void
2614 cycles_count_start(struct dp_netdev_pmd_thread *pmd)
2615 OVS_ACQUIRES(&cycles_counter_fake_mutex)
2616 OVS_NO_THREAD_SAFETY_ANALYSIS
2617 {
2618 pmd->last_cycles = cycles_counter();
2619 }
2620
2621 /* Stop counting cycles and add them to the counter 'type' */
2622 static inline void
2623 cycles_count_end(struct dp_netdev_pmd_thread *pmd,
2624 enum pmd_cycles_counter_type type)
2625 OVS_RELEASES(&cycles_counter_fake_mutex)
2626 OVS_NO_THREAD_SAFETY_ANALYSIS
2627 {
2628 unsigned long long interval = cycles_counter() - pmd->last_cycles;
2629
2630 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
2631 }
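/* Editor's note (not part of the original file): the OVS_ACQUIRES and
 * OVS_RELEASES annotations on 'cycles_counter_fake_mutex' let the compiler's
 * thread-safety analysis check that every start is paired with an end, as in
 * dp_netdev_process_rxq_port() below:
 *
 *     cycles_count_start(pmd);
 *     error = netdev_rxq_recv(rxq, &batch);
 *     cycles_count_end(pmd, PMD_CYCLES_POLLING);
 */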
2632
2633 static void
2634 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
2635 struct dp_netdev_port *port,
2636 struct netdev_rxq *rxq)
2637 {
2638 struct dp_packet_batch batch;
2639 int error;
2640
2641 dp_packet_batch_init(&batch);
2642 cycles_count_start(pmd);
2643 error = netdev_rxq_recv(rxq, &batch);
2644 cycles_count_end(pmd, PMD_CYCLES_POLLING);
2645 if (!error) {
2646 *recirc_depth_get() = 0;
2647
2648 cycles_count_start(pmd);
2649 dp_netdev_input(pmd, &batch, port->port_no);
2650 cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
2651 } else if (error != EAGAIN && error != EOPNOTSUPP) {
2652 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2653
2654 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
2655 netdev_get_name(port->netdev), ovs_strerror(error));
2656 }
2657 }
2658
2659 /* Return true if needs to revalidate datapath flows. */
2660 static bool
2661 dpif_netdev_run(struct dpif *dpif)
2662 {
2663 struct dp_netdev_port *port;
2664 struct dp_netdev *dp = get_dp_netdev(dpif);
2665 struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_pmd(dp,
2666 NON_PMD_CORE_ID);
2667 uint64_t new_tnl_seq;
2668
2669 ovs_mutex_lock(&dp->port_mutex);
2670 ovs_mutex_lock(&dp->non_pmd_mutex);
2671 HMAP_FOR_EACH (port, node, &dp->ports) {
2672 if (!netdev_is_pmd(port->netdev)) {
2673 int i;
2674
2675 for (i = 0; i < port->n_rxq; i++) {
2676 dp_netdev_process_rxq_port(non_pmd, port, port->rxq[i]);
2677 }
2678 }
2679 }
2680 ovs_mutex_unlock(&dp->non_pmd_mutex);
2681 ovs_mutex_unlock(&dp->port_mutex);
2682 dp_netdev_pmd_unref(non_pmd);
2683
2684 tnl_neigh_cache_run();
2685 tnl_port_map_run();
2686 new_tnl_seq = seq_read(tnl_conf_seq);
2687
2688 if (dp->last_tnl_conf_seq != new_tnl_seq) {
2689 dp->last_tnl_conf_seq = new_tnl_seq;
2690 return true;
2691 }
2692 return false;
2693 }
2694
2695 static void
2696 dpif_netdev_wait(struct dpif *dpif)
2697 {
2698 struct dp_netdev_port *port;
2699 struct dp_netdev *dp = get_dp_netdev(dpif);
2700
2701 ovs_mutex_lock(&dp_netdev_mutex);
2702 ovs_mutex_lock(&dp->port_mutex);
2703 HMAP_FOR_EACH (port, node, &dp->ports) {
2704 if (!netdev_is_pmd(port->netdev)) {
2705 int i;
2706
2707 for (i = 0; i < port->n_rxq; i++) {
2708 netdev_rxq_wait(port->rxq[i]);
2709 }
2710 }
2711 }
2712 ovs_mutex_unlock(&dp->port_mutex);
2713 ovs_mutex_unlock(&dp_netdev_mutex);
2714 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
2715 }
2716
2717 static void
2718 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
2719 {
2720 struct tx_port *tx_port_cached;
2721
2722 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->port_cache) {
2723 free(tx_port_cached);
2724 }
2725 }
2726
2727 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
2728  * 'pmd->port_cache' (thread local). */
2729 static void
2730 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
2731 OVS_REQUIRES(pmd->port_mutex)
2732 {
2733 struct tx_port *tx_port, *tx_port_cached;
2734
2735 pmd_free_cached_ports(pmd);
2736 hmap_shrink(&pmd->port_cache);
2737
2738 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
2739 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
2740 hmap_insert(&pmd->port_cache, &tx_port_cached->node,
2741 hash_port_no(tx_port_cached->port_no));
2742 }
2743 }
2744
2745 static int
2746 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
2747 struct rxq_poll **ppoll_list)
2748 {
2749 struct rxq_poll *poll_list = *ppoll_list;
2750 struct rxq_poll *poll;
2751 int i;
2752
2753 ovs_mutex_lock(&pmd->port_mutex);
2754 poll_list = xrealloc(poll_list, pmd->poll_cnt * sizeof *poll_list);
2755
2756 i = 0;
2757 LIST_FOR_EACH (poll, node, &pmd->poll_list) {
2758 poll_list[i++] = *poll;
2759 }
2760
2761 pmd_load_cached_ports(pmd);
2762
2763 ovs_mutex_unlock(&pmd->port_mutex);
2764
2765 *ppoll_list = poll_list;
2766 return i;
2767 }
2768
2769 static void *
2770 pmd_thread_main(void *f_)
2771 {
2772 struct dp_netdev_pmd_thread *pmd = f_;
2773 unsigned int lc = 0;
2774 struct rxq_poll *poll_list;
2775 unsigned int port_seq = PMD_INITIAL_SEQ;
2776 bool exiting;
2777 int poll_cnt;
2778 int i;
2779
2780 poll_cnt = 0;
2781 poll_list = NULL;
2782
2783 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
2784 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
2785 pmd_thread_setaffinity_cpu(pmd->core_id);
2786 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
2787 reload:
2788 emc_cache_init(&pmd->flow_cache);
2789
2790 /* List port/core affinity */
2791 for (i = 0; i < poll_cnt; i++) {
2792 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
2793 pmd->core_id, netdev_get_name(poll_list[i].port->netdev),
2794 netdev_rxq_get_queue_id(poll_list[i].rx));
2795 }
2796
2797 for (;;) {
2798 for (i = 0; i < poll_cnt; i++) {
2799 dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
2800 }
2801
2802 if (lc++ > 1024) {
2803 unsigned int seq;
2804
2805 lc = 0;
2806
2807 emc_cache_slow_sweep(&pmd->flow_cache);
2808 coverage_try_clear();
2809 ovsrcu_quiesce();
2810
2811 atomic_read_relaxed(&pmd->change_seq, &seq);
2812 if (seq != port_seq) {
2813 port_seq = seq;
2814 break;
2815 }
2816 }
2817 }
2818
2819 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
2820 exiting = latch_is_set(&pmd->exit_latch);
2821 /* Signal here to make sure the pmd finishes
2822 * reloading the updated configuration. */
2823 dp_netdev_pmd_reload_done(pmd);
2824
2825 emc_cache_uninit(&pmd->flow_cache);
2826
2827 if (!exiting) {
2828 goto reload;
2829 }
2830
2831 free(poll_list);
2832 pmd_free_cached_ports(pmd);
2833 return NULL;
2834 }
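/* Editor's note, summarizing the reload protocol visible above: the pmd loop
 * re-checks 'pmd->change_seq' roughly every 1024 iterations; when it changes
 * (or 'pmd->exit_latch' is set), the loop breaks, reloads its queue and tx
 * port lists under 'pmd->port_mutex', and calls dp_netdev_pmd_reload_done()
 * to signal the thread that requested the reload. */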
2835
2836 static void
2837 dp_netdev_disable_upcall(struct dp_netdev *dp)
2838 OVS_ACQUIRES(dp->upcall_rwlock)
2839 {
2840 fat_rwlock_wrlock(&dp->upcall_rwlock);
2841 }
2842
2843 static void
2844 dpif_netdev_disable_upcall(struct dpif *dpif)
2845 OVS_NO_THREAD_SAFETY_ANALYSIS
2846 {
2847 struct dp_netdev *dp = get_dp_netdev(dpif);
2848 dp_netdev_disable_upcall(dp);
2849 }
2850
2851 static void
2852 dp_netdev_enable_upcall(struct dp_netdev *dp)
2853 OVS_RELEASES(dp->upcall_rwlock)
2854 {
2855 fat_rwlock_unlock(&dp->upcall_rwlock);
2856 }
2857
2858 static void
2859 dpif_netdev_enable_upcall(struct dpif *dpif)
2860 OVS_NO_THREAD_SAFETY_ANALYSIS
2861 {
2862 struct dp_netdev *dp = get_dp_netdev(dpif);
2863 dp_netdev_enable_upcall(dp);
2864 }
2865
2866 static void
2867 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
2868 {
2869 ovs_mutex_lock(&pmd->cond_mutex);
2870 xpthread_cond_signal(&pmd->cond);
2871 ovs_mutex_unlock(&pmd->cond_mutex);
2872 }
2873
2874 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
2875  * the pointer if it succeeds, otherwise NULL.
2876  *
2877  * The caller must unref the returned reference. */
2878 static struct dp_netdev_pmd_thread *
2879 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
2880 {
2881 struct dp_netdev_pmd_thread *pmd;
2882 const struct cmap_node *pnode;
2883
2884 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
2885 if (!pnode) {
2886 return NULL;
2887 }
2888 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
2889
2890 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
2891 }
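/* Editor's note (illustrative sketch, not part of the original file): the
 * dpif operations above (e.g. dpif_netdev_flow_get()) follow a simple
 * get/use/unref discipline:
 *
 *     struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);
 *     if (pmd) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */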
2892
2893 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
2894 static void
2895 dp_netdev_set_nonpmd(struct dp_netdev *dp)
2896 OVS_REQUIRES(dp->port_mutex)
2897 {
2898 struct dp_netdev_pmd_thread *non_pmd;
2899 struct dp_netdev_port *port;
2900
2901 non_pmd = xzalloc(sizeof *non_pmd);
2902 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
2903
2904 HMAP_FOR_EACH (port, node, &dp->ports) {
2905 dp_netdev_add_port_tx_to_pmd(non_pmd, port);
2906 }
2907
2908 dp_netdev_reload_pmd__(non_pmd);
2909 }
2910
2911 /* The caller must have a valid pointer to 'pmd'. */
2912 static bool
2913 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
2914 {
2915 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
2916 }
2917
2918 static void
2919 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
2920 {
2921 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
2922 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
2923 }
2924 }
2925
2926 /* Given cmap position 'pos', tries to ref the next node.  If try_ref()
2927  * fails, keeps checking the following nodes until reaching the end of the cmap.
2928  *
2929  * The caller must unref the returned reference. */
2930 static struct dp_netdev_pmd_thread *
2931 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
2932 {
2933 struct dp_netdev_pmd_thread *next;
2934
2935 do {
2936 struct cmap_node *node;
2937
2938 node = cmap_next_position(&dp->poll_threads, pos);
2939 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
2940 : NULL;
2941 } while (next && !dp_netdev_pmd_try_ref(next));
2942
2943 return next;
2944 }
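/* Editor's note (illustrative sketch, not part of the original file): the
 * flow dumper above iterates over every pmd thread with this helper:
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos))) {
 *         ... use 'pmd' ...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */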
2945
2946 /* Configures the 'pmd' based on the input argument. */
2947 static void
2948 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
2949 unsigned core_id, int numa_id)
2950 {
2951 pmd->dp = dp;
2952 pmd->core_id = core_id;
2953 pmd->numa_id = numa_id;
2954 pmd->poll_cnt = 0;
2955
2956 atomic_init(&pmd->tx_qid,
2957 (core_id == NON_PMD_CORE_ID)
2958 ? ovs_numa_get_n_cores()
2959 : get_n_pmd_threads(dp));
2960
2961 ovs_refcount_init(&pmd->ref_cnt);
2962 latch_init(&pmd->exit_latch);
2963 atomic_init(&pmd->change_seq, PMD_INITIAL_SEQ);
2964 xpthread_cond_init(&pmd->cond, NULL);
2965 ovs_mutex_init(&pmd->cond_mutex);
2966 ovs_mutex_init(&pmd->flow_mutex);
2967 ovs_mutex_init(&pmd->port_mutex);
2968 dpcls_init(&pmd->cls);
2969 cmap_init(&pmd->flow_table);
2970 ovs_list_init(&pmd->poll_list);
2971 hmap_init(&pmd->tx_ports);
2972 hmap_init(&pmd->port_cache);
2973     /* Initialize the 'flow_cache' here, since there is no
2974      * actual thread created for NON_PMD_CORE_ID. */
2975 if (core_id == NON_PMD_CORE_ID) {
2976 emc_cache_init(&pmd->flow_cache);
2977 }
2978 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
2979 hash_int(core_id, 0));
2980 }
2981
2982 static void
2983 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
2984 {
2985 dp_netdev_pmd_flow_flush(pmd);
2986 dpcls_destroy(&pmd->cls);
2987 hmap_destroy(&pmd->port_cache);
2988 hmap_destroy(&pmd->tx_ports);
2989 cmap_destroy(&pmd->flow_table);
2990 ovs_mutex_destroy(&pmd->flow_mutex);
2991 latch_destroy(&pmd->exit_latch);
2992 xpthread_cond_destroy(&pmd->cond);
2993 ovs_mutex_destroy(&pmd->cond_mutex);
2994 ovs_mutex_destroy(&pmd->port_mutex);
2995 free(pmd);
2996 }
2997
2998 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
2999 * and unrefs the struct. */
3000 static void
3001 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
3002 {
3003 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
3004 * but extra cleanup is necessary */
3005 if (pmd->core_id == NON_PMD_CORE_ID) {
3006 emc_cache_uninit(&pmd->flow_cache);
3007 pmd_free_cached_ports(pmd);
3008 } else {
3009 latch_set(&pmd->exit_latch);
3010 dp_netdev_reload_pmd__(pmd);
3011 ovs_numa_unpin_core(pmd->core_id);
3012 xpthread_join(pmd->thread, NULL);
3013 }
3014
3015 dp_netdev_pmd_clear_ports(pmd);
3016
3017 /* Purges the 'pmd''s flows after stopping the thread, but before
3018 * destroying the flows, so that the flow stats can be collected. */
3019 if (dp->dp_purge_cb) {
3020 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
3021 }
3022 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
3023 dp_netdev_pmd_unref(pmd);
3024 }
3025
3026 /* Destroys all pmd threads. */
3027 static void
3028 dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
3029 {
3030 struct dp_netdev_pmd_thread *pmd;
3031 struct dp_netdev_pmd_thread **pmd_list;
3032 size_t k = 0, n_pmds;
3033
3034 n_pmds = cmap_count(&dp->poll_threads);
3035 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
3036
3037 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3038 /* We cannot call dp_netdev_del_pmd(), since it alters
3039 * 'dp->poll_threads' (while we're iterating it) and it
3040 * might quiesce. */
3041 ovs_assert(k < n_pmds);
3042 pmd_list[k++] = pmd;
3043 }
3044
3045 for (size_t i = 0; i < k; i++) {
3046 dp_netdev_del_pmd(dp, pmd_list[i]);
3047 }
3048 free(pmd_list);
3049 }
3050
3051 /* Deletes all pmd threads on numa node 'numa_id' and
3052 * fixes tx_qids of other threads to keep them sequential. */
3053 static void
3054 dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3055 {
3056 struct dp_netdev_pmd_thread *pmd;
3057 int n_pmds_on_numa, n_pmds;
3058 int *free_idx, k = 0;
3059 struct dp_netdev_pmd_thread **pmd_list;
3060
3061 n_pmds_on_numa = get_n_pmd_threads_on_numa(dp, numa_id);
3062 free_idx = xcalloc(n_pmds_on_numa, sizeof *free_idx);
3063 pmd_list = xcalloc(n_pmds_on_numa, sizeof *pmd_list);
3064
3065 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3066 /* We cannot call dp_netdev_del_pmd(), since it alters
3067 * 'dp->poll_threads' (while we're iterating it) and it
3068 * might quiesce. */
3069 if (pmd->numa_id == numa_id) {
3070 atomic_read_relaxed(&pmd->tx_qid, &free_idx[k]);
3071 pmd_list[k] = pmd;
3072 ovs_assert(k < n_pmds_on_numa);
3073 k++;
3074 }
3075 }
3076
3077 for (int i = 0; i < k; i++) {
3078 dp_netdev_del_pmd(dp, pmd_list[i]);
3079 }
3080
3081 n_pmds = get_n_pmd_threads(dp);
3082 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3083 int old_tx_qid;
3084
3085 atomic_read_relaxed(&pmd->tx_qid, &old_tx_qid);
3086
3087 if (old_tx_qid >= n_pmds) {
3088 int new_tx_qid = free_idx[--k];
3089
3090 atomic_store_relaxed(&pmd->tx_qid, new_tx_qid);
3091 }
3092 }
3093
3094 free(pmd_list);
3095 free(free_idx);
3096 }
3097
3098 /* Deletes all rx queues from pmd->poll_list and all the ports from
3099 * pmd->tx_ports. */
3100 static void
3101 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
3102 {
3103 struct rxq_poll *poll;
3104 struct tx_port *port;
3105
3106 ovs_mutex_lock(&pmd->port_mutex);
3107 LIST_FOR_EACH_POP (poll, node, &pmd->poll_list) {
3108 free(poll);
3109 }
3110 pmd->poll_cnt = 0;
3111 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
3112 free(port);
3113 }
3114 ovs_mutex_unlock(&pmd->port_mutex);
3115 }
3116
3117 static struct tx_port *
3118 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3119 {
3120 struct tx_port *tx;
3121
3122 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
3123 if (tx->port_no == port_no) {
3124 return tx;
3125 }
3126 }
3127
3128 return NULL;
3129 }
3130
3131 /* Deletes all rx queues of 'port' from 'poll_list', and the 'port' from
3132 * 'tx_ports' of 'pmd' thread. Returns true if 'port' was found in 'pmd'
3133 * (therefore a restart is required). */
3134 static bool
3135 dp_netdev_del_port_from_pmd__(struct dp_netdev_port *port,
3136 struct dp_netdev_pmd_thread *pmd)
3137 {
3138 struct rxq_poll *poll, *next;
3139 struct tx_port *tx;
3140 bool found = false;
3141
3142 ovs_mutex_lock(&pmd->port_mutex);
3143 LIST_FOR_EACH_SAFE (poll, next, node, &pmd->poll_list) {
3144 if (poll->port == port) {
3145 found = true;
3146 ovs_list_remove(&poll->node);
3147 pmd->poll_cnt--;
3148 free(poll);
3149 }
3150 }
3151
3152 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
3153 if (tx) {
3154 hmap_remove(&pmd->tx_ports, &tx->node);
3155 free(tx);
3156 found = true;
3157 }
3158 ovs_mutex_unlock(&pmd->port_mutex);
3159
3160 return found;
3161 }
3162
3163 /* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3164 * threads. The pmd threads that need to be restarted are inserted in
3165 * 'to_reload'. */
3166 static void
3167 dp_netdev_del_port_from_all_pmds__(struct dp_netdev *dp,
3168 struct dp_netdev_port *port,
3169 struct hmapx *to_reload)
3170 {
3171 struct dp_netdev_pmd_thread *pmd;
3172
3173 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3174 bool found;
3175
3176 found = dp_netdev_del_port_from_pmd__(port, pmd);
3177
3178 if (found) {
3179 hmapx_add(to_reload, pmd);
3180 }
3181 }
3182 }
3183
3184 /* Deletes 'port' from the 'poll_list' and from the 'tx_ports' of all the pmd
3185 * threads. Reloads the threads if needed. */
3186 static void
3187 dp_netdev_del_port_from_all_pmds(struct dp_netdev *dp,
3188 struct dp_netdev_port *port)
3189 {
3190 struct dp_netdev_pmd_thread *pmd;
3191 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3192 struct hmapx_node *node;
3193
3194 dp_netdev_del_port_from_all_pmds__(dp, port, &to_reload);
3195
3196 HMAPX_FOR_EACH (node, &to_reload) {
3197 pmd = (struct dp_netdev_pmd_thread *) node->data;
3198 dp_netdev_reload_pmd__(pmd);
3199 }
3200
3201 hmapx_destroy(&to_reload);
3202 }
3203
3204
3205 /* Returns the PMD thread on numa node 'numa_id' with the fewest rx queues
3206  * to poll.  Returns NULL if there are no PMD threads on this numa node.
3207  * Can be called safely only by the main thread. */
3208 static struct dp_netdev_pmd_thread *
3209 dp_netdev_less_loaded_pmd_on_numa(struct dp_netdev *dp, int numa_id)
3210 {
3211 int min_cnt = -1;
3212 struct dp_netdev_pmd_thread *pmd, *res = NULL;
3213
3214 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3215 if (pmd->numa_id == numa_id
3216 && (min_cnt > pmd->poll_cnt || res == NULL)) {
3217 min_cnt = pmd->poll_cnt;
3218 res = pmd;
3219 }
3220 }
3221
3222 return res;
3223 }
3224
3225 /* Adds rx queue to poll_list of PMD thread. */
3226 static void
3227 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
3228 struct dp_netdev_port *port, struct netdev_rxq *rx)
3229 OVS_REQUIRES(pmd->port_mutex)
3230 {
3231 struct rxq_poll *poll = xmalloc(sizeof *poll);
3232
3233 poll->port = port;
3234 poll->rx = rx;
3235
3236 ovs_list_push_back(&pmd->poll_list, &poll->node);
3237 pmd->poll_cnt++;
3238 }
3239
3240 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
3241 * changes to take effect. */
3242 static void
3243 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
3244 struct dp_netdev_port *port)
3245 {
3246 struct tx_port *tx = xzalloc(sizeof *tx);
3247
3248 tx->netdev = port->netdev;
3249 tx->port_no = port->port_no;
3250
3251 ovs_mutex_lock(&pmd->port_mutex);
3252 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port_no));
3253 ovs_mutex_unlock(&pmd->port_mutex);
3254 }
3255
3256 /* Distribute all rx queues of 'port' between PMD threads in 'dp'. The pmd
3257 * threads that need to be restarted are inserted in 'to_reload'. */
3258 static void
3259 dp_netdev_add_port_rx_to_pmds(struct dp_netdev *dp,
3260 struct dp_netdev_port *port,
3261 struct hmapx *to_reload)
3262 {
3263 int numa_id = netdev_get_numa_id(port->netdev);
3264 int i;
3265
3266 if (!netdev_is_pmd(port->netdev)) {
3267 return;
3268 }
3269
3270 for (i = 0; i < port->n_rxq; i++) {
3271 struct dp_netdev_pmd_thread *pmd;
3272
3273 pmd = dp_netdev_less_loaded_pmd_on_numa(dp, numa_id);
3274 if (!pmd) {
3275 VLOG_WARN("There's no pmd thread on numa node %d", numa_id);
3276 break;
3277 }
3278
3279 ovs_mutex_lock(&pmd->port_mutex);
3280 dp_netdev_add_rxq_to_pmd(pmd, port, port->rxq[i]);
3281 ovs_mutex_unlock(&pmd->port_mutex);
3282
3283 hmapx_add(to_reload, pmd);
3284 }
3285 }
3286
3287 /* Distributes all rx queues of 'port' between all PMD threads in 'dp' and
3288  * inserts 'port' in the PMD threads' 'tx_ports'.  The pmd threads that need to
3289 * be restarted are inserted in 'to_reload'. */
3290 static void
3291 dp_netdev_add_port_to_pmds__(struct dp_netdev *dp, struct dp_netdev_port *port,
3292 struct hmapx *to_reload)
3293 {
3294 struct dp_netdev_pmd_thread *pmd;
3295
3296 dp_netdev_add_port_rx_to_pmds(dp, port, to_reload);
3297
3298 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3299 dp_netdev_add_port_tx_to_pmd(pmd, port);
3300 hmapx_add(to_reload, pmd);
3301 }
3302 }
3303
3304 /* Distributes all rx queues of 'port' between all PMD threads in 'dp', inserts
3305  * 'port' in the PMD threads' 'tx_ports' and reloads them, if needed. */
3306 static void
3307 dp_netdev_add_port_to_pmds(struct dp_netdev *dp, struct dp_netdev_port *port)
3308 {
3309 struct dp_netdev_pmd_thread *pmd;
3310 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3311 struct hmapx_node *node;
3312
3313 dp_netdev_add_port_to_pmds__(dp, port, &to_reload);
3314
3315 HMAPX_FOR_EACH (node, &to_reload) {
3316 pmd = (struct dp_netdev_pmd_thread *) node->data;
3317 dp_netdev_reload_pmd__(pmd);
3318 }
3319
3320 hmapx_destroy(&to_reload);
3321 }
3322
3323 /* Starts pmd threads for the numa node 'numa_id', if not already started.
3324  * The function takes care of filling the threads' tx port caches. */
3325 static void
3326 dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
3327 OVS_REQUIRES(dp->port_mutex)
3328 {
3329 int n_pmds;
3330
3331 if (!ovs_numa_numa_id_is_valid(numa_id)) {
3332 VLOG_WARN("Cannot create pmd threads due to numa id (%d) invalid",
3333 numa_id);
3334 return;
3335 }
3336
3337 n_pmds = get_n_pmd_threads_on_numa(dp, numa_id);
3338
3339     /* If there are already pmd threads created for the numa node
3340      * that 'netdev' is on, do nothing.  Otherwise, create the
3341      * pmd threads for the numa node. */
3342 if (!n_pmds) {
3343 int can_have, n_unpinned, i;
3344
3345 n_unpinned = ovs_numa_get_n_unpinned_cores_on_numa(numa_id);
3346 if (!n_unpinned) {
3347 VLOG_WARN("Cannot create pmd threads due to out of unpinned "
3348 "cores on numa node %d", numa_id);
3349 return;
3350 }
3351
3352         /* If a cpu mask is specified, use all unpinned cores; otherwise
3353          * try to create NR_PMD_THREADS pmd threads. */
3354 can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
3355 for (i = 0; i < can_have; i++) {
3356 unsigned core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
3357 struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
3358 struct dp_netdev_port *port;
3359
3360 dp_netdev_configure_pmd(pmd, dp, core_id, numa_id);
3361
3362 HMAP_FOR_EACH (port, node, &dp->ports) {
3363 dp_netdev_add_port_tx_to_pmd(pmd, port);
3364 }
3365
3366 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
3367 }
3368 VLOG_INFO("Created %d pmd threads on numa node %d", can_have, numa_id);
3369 }
3370 }
3371
3372 \f
3373 /* Called after pmd threads config change. Restarts pmd threads with
3374 * new configuration. */
3375 static void
3376 dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
3377 OVS_REQUIRES(dp->port_mutex)
3378 {
3379 struct hmapx to_reload = HMAPX_INITIALIZER(&to_reload);
3380 struct dp_netdev_pmd_thread *pmd;
3381 struct dp_netdev_port *port;
3382 struct hmapx_node *node;
3383
3384 HMAP_FOR_EACH (port, node, &dp->ports) {
3385 if (netdev_is_pmd(port->netdev)) {
3386 int numa_id = netdev_get_numa_id(port->netdev);
3387
3388 dp_netdev_set_pmds_on_numa(dp, numa_id);
3389 }
3390 dp_netdev_add_port_rx_to_pmds(dp, port, &to_reload);
3391 }
3392
3393 HMAPX_FOR_EACH (node, &to_reload) {
3394 pmd = (struct dp_netdev_pmd_thread *) node->data;
3395 dp_netdev_reload_pmd__(pmd);
3396 }
3397
3398 hmapx_destroy(&to_reload);
3399 }
3400
3401 static char *
3402 dpif_netdev_get_datapath_version(void)
3403 {
3404 return xstrdup("<built-in>");
3405 }
3406
3407 static void
3408 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
3409 uint16_t tcp_flags, long long now)
3410 {
3411 uint16_t flags;
3412
3413 atomic_store_relaxed(&netdev_flow->stats.used, now);
3414 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
3415 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
3416 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3417 flags |= tcp_flags;
3418 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
3419 }
3420
3421 static void
3422 dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
3423 enum dp_stat_type type, int cnt)
3424 {
3425 non_atomic_ullong_add(&pmd->stats.n[type], cnt);
3426 }
3427
3428 static int
3429 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
3430 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
3431 enum dpif_upcall_type type, const struct nlattr *userdata,
3432 struct ofpbuf *actions, struct ofpbuf *put_actions)
3433 {
3434 struct dp_netdev *dp = pmd->dp;
3435 struct flow_tnl orig_tunnel;
3436 int err;
3437
3438 if (OVS_UNLIKELY(!dp->upcall_cb)) {
3439 return ENODEV;
3440 }
3441
3442 /* Upcall processing expects the Geneve options to be in the translated
3443 * format but we need to retain the raw format for datapath use. */
3444 orig_tunnel.flags = flow->tunnel.flags;
3445 if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
3446 orig_tunnel.metadata.present.len = flow->tunnel.metadata.present.len;
3447 memcpy(orig_tunnel.metadata.opts.gnv, flow->tunnel.metadata.opts.gnv,
3448 flow->tunnel.metadata.present.len);
3449 err = tun_metadata_from_geneve_udpif(&orig_tunnel, &orig_tunnel,
3450 &flow->tunnel);
3451 if (err) {
3452 return err;
3453 }
3454 }
3455
3456 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
3457 struct ds ds = DS_EMPTY_INITIALIZER;
3458 char *packet_str;
3459 struct ofpbuf key;
3460 struct odp_flow_key_parms odp_parms = {
3461 .flow = flow,
3462 .mask = &wc->masks,
3463 .odp_in_port = flow->in_port.odp_port,
3464 .support = dp_netdev_support,
3465 };
3466
3467 ofpbuf_init(&key, 0);
3468 odp_flow_key_from_flow(&odp_parms, &key);
3469 packet_str = ofp_packet_to_string(dp_packet_data(packet_),
3470 dp_packet_size(packet_));
3471
3472 odp_flow_key_format(key.data, key.size, &ds);
3473
3474 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
3475 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
3476
3477 ofpbuf_uninit(&key);
3478 free(packet_str);
3479
3480 ds_destroy(&ds);
3481 }
3482
3483 err = dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
3484 actions, wc, put_actions, dp->upcall_aux);
3485 if (err && err != ENOSPC) {
3486 return err;
3487 }
3488
3489 /* Translate tunnel metadata masks to datapath format. */
3490 if (wc) {
3491 if (wc->masks.tunnel.metadata.present.map) {
3492 struct geneve_opt opts[TLV_TOT_OPT_SIZE /
3493 sizeof(struct geneve_opt)];
3494
3495 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3496 tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
3497 &wc->masks.tunnel,
3498 orig_tunnel.metadata.opts.gnv,
3499 orig_tunnel.metadata.present.len,
3500 opts);
3501 } else {
3502 orig_tunnel.metadata.present.len = 0;
3503 }
3504
3505 memset(&wc->masks.tunnel.metadata, 0,
3506 sizeof wc->masks.tunnel.metadata);
3507 memcpy(&wc->masks.tunnel.metadata.opts.gnv, opts,
3508 orig_tunnel.metadata.present.len);
3509 }
3510 wc->masks.tunnel.metadata.present.len = 0xff;
3511 }
3512
3513 /* Restore tunnel metadata. We need to use the saved options to ensure
3514 * that any unknown options are not lost. The generated mask will have
3515 * the same structure, matching on types and lengths but wildcarding
3516 * option data we don't care about. */
3517 if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
3518 memcpy(&flow->tunnel.metadata.opts.gnv, orig_tunnel.metadata.opts.gnv,
3519 orig_tunnel.metadata.present.len);
3520 flow->tunnel.metadata.present.len = orig_tunnel.metadata.present.len;
3521 flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
3522 }
3523
3524 return err;
3525 }
3526
3527 static inline uint32_t
3528 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
3529 const struct miniflow *mf)
3530 {
3531 uint32_t hash, recirc_depth;
3532
3533 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
3534 hash = dp_packet_get_rss_hash(packet);
3535 } else {
3536 hash = miniflow_hash_5tuple(mf, 0);
3537 dp_packet_set_rss_hash(packet, hash);
3538 }
3539
3540 /* The RSS hash must account for the recirculation depth to avoid
3541 * collisions in the exact match cache */
3542 recirc_depth = *recirc_depth_get_unsafe();
3543 if (OVS_UNLIKELY(recirc_depth)) {
3544 hash = hash_finish(hash, recirc_depth);
3545 dp_packet_set_rss_hash(packet, hash);
3546 }
3547 return hash;
3548 }
3549
3550 struct packet_batch_per_flow {
3551 unsigned int byte_count;
3552 uint16_t tcp_flags;
3553 struct dp_netdev_flow *flow;
3554
3555 struct dp_packet_batch array;
3556 };
3557
3558 static inline void
3559 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
3560 struct dp_packet *packet,
3561 const struct miniflow *mf)
3562 {
3563 batch->byte_count += dp_packet_size(packet);
3564 batch->tcp_flags |= miniflow_get_tcp_flags(mf);
3565 batch->array.packets[batch->array.count++] = packet;
3566 }
3567
3568 static inline void
3569 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
3570 struct dp_netdev_flow *flow)
3571 {
3572 flow->batch = batch;
3573
3574 batch->flow = flow;
3575 dp_packet_batch_init(&batch->array);
3576 batch->byte_count = 0;
3577 batch->tcp_flags = 0;
3578 }
3579
3580 static inline void
3581 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
3582 struct dp_netdev_pmd_thread *pmd,
3583 long long now)
3584 {
3585 struct dp_netdev_actions *actions;
3586 struct dp_netdev_flow *flow = batch->flow;
3587
3588 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
3589 batch->tcp_flags, now);
3590
3591 actions = dp_netdev_flow_get_actions(flow);
3592
3593 dp_netdev_execute_actions(pmd, &batch->array, true,
3594 actions->actions, actions->size);
3595 }
3596
3597 static inline void
3598 dp_netdev_queue_batches(struct dp_packet *pkt,
3599 struct dp_netdev_flow *flow, const struct miniflow *mf,
3600 struct packet_batch_per_flow *batches, size_t *n_batches)
3601 {
3602 struct packet_batch_per_flow *batch = flow->batch;
3603
3604 if (OVS_UNLIKELY(!batch)) {
3605 batch = &batches[(*n_batches)++];
3606 packet_batch_per_flow_init(batch, flow);
3607 }
3608
3609 packet_batch_per_flow_update(batch, pkt, mf);
3610 }
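/* Editor's note (illustrative sketch, not part of the original file): the
 * input paths below (emc_processing() and fast_path_processing()) queue each
 * packet into a per-flow batch and then execute each batch once, roughly:
 *
 *     struct packet_batch_per_flow batches[NETDEV_MAX_BURST];
 *     size_t n_batches = 0;
 *
 *     dp_netdev_queue_batches(packet, flow, &key->mf, batches, &n_batches);
 *     ...
 *     for (size_t i = 0; i < n_batches; i++) {
 *         packet_batch_per_flow_execute(&batches[i], pmd, now);
 *     }
 *
 * Grouping packets by flow means the flow's actions are fetched and run once
 * per batch instead of once per packet. */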
3611
3612 /* Try to process all of the 'cnt' 'packets' using only the exact match cache
3613  * 'pmd->flow_cache'.  If a flow is not found for a packet 'packets[i]', the
3614  * miniflow is copied into 'keys' and the packet pointer is moved to the
3615  * beginning of the 'packets' array.
3616  *
3617  * The function returns the number of packets that need to be processed in the
3618  * 'packets' array (they have been moved to the beginning of the vector).
3619  *
3620  * If 'md_is_valid' is false, the metadata in 'packets' is not valid and must
3621  * be initialized by this function using 'port_no'.
3622  */
3623 static inline size_t
3624 emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
3625 struct netdev_flow_key *keys,
3626 struct packet_batch_per_flow batches[], size_t *n_batches,
3627 bool md_is_valid, odp_port_t port_no)
3628 {
3629 struct emc_cache *flow_cache = &pmd->flow_cache;
3630 struct netdev_flow_key *key = &keys[0];
3631 size_t i, n_missed = 0, n_dropped = 0;
3632 struct dp_packet **packets = packets_->packets;
3633 int cnt = packets_->count;
3634
3635 for (i = 0; i < cnt; i++) {
3636 struct dp_netdev_flow *flow;
3637 struct dp_packet *packet = packets[i];
3638
3639 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
3640 dp_packet_delete(packet);
3641 n_dropped++;
3642 continue;
3643 }
3644
3645 if (i != cnt - 1) {
3646 /* Prefetch next packet data and metadata. */
3647 OVS_PREFETCH(dp_packet_data(packets[i+1]));
3648 pkt_metadata_prefetch_init(&packets[i+1]->md);
3649 }
3650
3651 if (!md_is_valid) {
3652 pkt_metadata_init(&packet->md, port_no);
3653 }
3654 miniflow_extract(packet, &key->mf);
3655 key->len = 0; /* Not computed yet. */
3656 key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
3657
3658 flow = emc_lookup(flow_cache, key);
3659 if (OVS_LIKELY(flow)) {
3660 dp_netdev_queue_batches(packet, flow, &key->mf, batches,
3661 n_batches);
3662 } else {
3663 /* Exact match cache missed. Group missed packets together at
3664 * the beginning of the 'packets' array. */
3665 packets[n_missed] = packet;
3666             /* 'keys[n_missed]' contains the key of the current packet and it
3667 * must be returned to the caller. The next key should be extracted
3668 * to 'keys[n_missed + 1]'. */
3669 key = &keys[++n_missed];
3670 }
3671 }
3672
3673 dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT, cnt - n_dropped - n_missed);
3674
3675 return n_missed;
3676 }
3677
3678 static inline void
3679 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet,
3680 const struct netdev_flow_key *key,
3681 struct ofpbuf *actions, struct ofpbuf *put_actions,
3682 int *lost_cnt)
3683 {
3684 struct ofpbuf *add_actions;
3685 struct dp_packet_batch b;
3686 struct match match;
3687 ovs_u128 ufid;
3688 int error;
3689
3690 match.tun_md.valid = false;
3691 miniflow_expand(&key->mf, &match.flow);
3692
3693 ofpbuf_clear(actions);
3694 ofpbuf_clear(put_actions);
3695
3696 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
3697 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
3698 &ufid, DPIF_UC_MISS, NULL, actions,
3699 put_actions);
3700 if (OVS_UNLIKELY(error && error != ENOSPC)) {
3701 dp_packet_delete(packet);
3702 (*lost_cnt)++;
3703 return;
3704 }
3705
3706 /* The Netlink encoding of datapath flow keys cannot express
3707 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3708 * tag is interpreted as exact match on the fact that there is no
3709 * VLAN. Unless we refactor a lot of code that translates between
3710 * Netlink and struct flow representations, we have to do the same
3711 * here. */
3712 if (!match.wc.masks.vlan_tci) {
3713 match.wc.masks.vlan_tci = htons(0xffff);
3714 }
3715
3716 /* We can't allow the packet batching in the next loop to execute
3717 * the actions. Otherwise, if there are any slow path actions,
3718 * we'll send the packet up twice. */
3719 packet_batch_init_packet(&b, packet);
3720 dp_netdev_execute_actions(pmd, &b, true,
3721 actions->data, actions->size);
3722
3723 add_actions = put_actions->size ? put_actions : actions;
3724 if (OVS_LIKELY(error != ENOSPC)) {
3725 struct dp_netdev_flow *netdev_flow;
3726
3727 /* XXX: There's a race window where a flow covering this packet
3728 * could have already been installed since we last did the flow
3729 * lookup before upcall. This could be solved by moving the
3730 * mutex lock outside the loop, but that's an awful long time
3731 * to be locking everyone out of making flow installs. If we
3732 * move to a per-core classifier, it would be reasonable. */
3733 ovs_mutex_lock(&pmd->flow_mutex);
3734 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key);
3735 if (OVS_LIKELY(!netdev_flow)) {
3736 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
3737 add_actions->data,
3738 add_actions->size);
3739 }
3740 ovs_mutex_unlock(&pmd->flow_mutex);
3741
3742 emc_insert(&pmd->flow_cache, key, netdev_flow);
3743 }
3744 }
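
/* Added note (not part of the original source): handle_packet_upcall() can
 * finish in three ways.  A hard error (anything other than 0 or ENOSPC) drops
 * the packet and bumps '*lost_cnt'.  ENOSPC still executes the returned
 * actions on the packet but skips the flow install.  On success the actions
 * are executed and the flow is installed under 'pmd->flow_mutex', preferring
 * 'put_actions' over 'actions' when the former is non-empty, and the EMC is
 * primed with the new (or concurrently installed) flow. */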
3745
3746 static inline void
3747 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
3748 struct dp_packet_batch *packets_,
3749 struct netdev_flow_key *keys,
3750 struct packet_batch_per_flow batches[], size_t *n_batches)
3751 {
3752 int cnt = packets_->count;
3753 #if !defined(__CHECKER__) && !defined(_WIN32)
3754 const size_t PKT_ARRAY_SIZE = cnt;
3755 #else
3756 /* Neither Sparse nor MSVC likes variable length arrays. */
3757 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
3758 #endif
3759 struct dp_packet **packets = packets_->packets;
3760 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
3761 struct dp_netdev *dp = pmd->dp;
3762 struct emc_cache *flow_cache = &pmd->flow_cache;
3763 int miss_cnt = 0, lost_cnt = 0;
3764 bool any_miss;
3765 size_t i;
3766
3767 for (i = 0; i < cnt; i++) {
3768 /* Key length is needed in all cases; the hash is computed on demand. */
3769 keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
3770 }
3771 any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
3772 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
3773 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
3774 struct ofpbuf actions, put_actions;
3775
3776 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
3777 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
3778
3779 for (i = 0; i < cnt; i++) {
3780 struct dp_netdev_flow *netdev_flow;
3781
3782 if (OVS_LIKELY(rules[i])) {
3783 continue;
3784 }
3785
3786 /* It's possible that an earlier slow path execution installed
3787 * a rule covering this flow. In this case, it's a lot cheaper
3788 * to catch it here than execute a miss. */
3789 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i]);
3790 if (netdev_flow) {
3791 rules[i] = &netdev_flow->cr;
3792 continue;
3793 }
3794
3795 miss_cnt++;
3796 handle_packet_upcall(pmd, packets[i], &keys[i], &actions, &put_actions,
3797 &lost_cnt);
3798 }
3799
3800 ofpbuf_uninit(&actions);
3801 ofpbuf_uninit(&put_actions);
3802 fat_rwlock_unlock(&dp->upcall_rwlock);
3803 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
3804 } else if (OVS_UNLIKELY(any_miss)) {
3805 for (i = 0; i < cnt; i++) {
3806 if (OVS_UNLIKELY(!rules[i])) {
3807 dp_packet_delete(packets[i]);
3808 lost_cnt++;
3809 miss_cnt++;
3810 }
3811 }
3812 }
3813
3814 for (i = 0; i < cnt; i++) {
3815 struct dp_packet *packet = packets[i];
3816 struct dp_netdev_flow *flow;
3817
3818 if (OVS_UNLIKELY(!rules[i])) {
3819 continue;
3820 }
3821
3822 flow = dp_netdev_flow_cast(rules[i]);
3823
3824 emc_insert(flow_cache, &keys[i], flow);
3825 dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
3826 }
3827
3828 dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
3829 dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
3830 dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
3831 }
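
/* Worked example (added for illustration, not part of the original source),
 * assuming a hypothetical rx batch of 32 packets of which 20 hit the EMC,
 * 10 hit the dpcls and 2 require an upcall:
 *
 *     emc_processing():       DP_STAT_EXACT_HIT  += 20, returns 12
 *     fast_path_processing(): cnt = 12, miss_cnt = 2
 *                             DP_STAT_MASKED_HIT += 10   (cnt - miss_cnt)
 *                             DP_STAT_MISS       += 2
 */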
3832
3833 /* Packets enter the datapath from a port (or from recirculation) here.
3834 *
3835 * For performance reasons a caller may choose not to initialize the metadata
3836 * in 'packets': in this case 'md_is_valid' is false and this function needs to
3837 * initialize it using 'port_no'. If the metadata in 'packets' is already
3838 * valid, 'md_is_valid' must be true and 'port_no' will be ignored. */
3839 static void
3840 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
3841 struct dp_packet_batch *packets,
3842 bool md_is_valid, odp_port_t port_no)
3843 {
3844 int cnt = packets->count;
3845 #if !defined(__CHECKER__) && !defined(_WIN32)
3846 const size_t PKT_ARRAY_SIZE = cnt;
3847 #else
3848 /* Neither Sparse nor MSVC likes variable length arrays. */
3849 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
3850 #endif
3851 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
3852 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
3853 long long now = time_msec();
3854 size_t newcnt, n_batches, i;
3855
3856 n_batches = 0;
3857 newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
3858 md_is_valid, port_no);
3859 if (OVS_UNLIKELY(newcnt)) {
3860 packets->count = newcnt;
3861 fast_path_processing(pmd, packets, keys, batches, &n_batches);
3862 }
3863
3864 for (i = 0; i < n_batches; i++) {
3865 batches[i].flow->batch = NULL;
3866 }
3867
3868 for (i = 0; i < n_batches; i++) {
3869 packet_batch_per_flow_execute(&batches[i], pmd, now);
3870 }
3871 }
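
/* Added summary (not part of the original source): dp_netdev_input__() is the
 * per-batch pipeline.  emc_processing() resolves what it can from the exact
 * match cache and queues those packets into per-flow batches; the leftovers
 * go through fast_path_processing() (dpcls lookup and, if needed, an upcall);
 * finally each per-flow batch executes its actions once.  'flow->batch' is
 * cleared for every batch before any of them is executed, so a recirculated
 * packet that re-enters this function cannot be appended to one of these
 * stack-allocated batches while it is being (or after it has been)
 * processed. */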
3872
3873 static void
3874 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
3875 struct dp_packet_batch *packets,
3876 odp_port_t port_no)
3877 {
3878 dp_netdev_input__(pmd, packets, false, port_no);
3879 }
3880
3881 static void
3882 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
3883 struct dp_packet_batch *packets)
3884 {
3885 dp_netdev_input__(pmd, packets, true, 0);
3886 }
3887
3888 struct dp_netdev_execute_aux {
3889 struct dp_netdev_pmd_thread *pmd;
3890 };
3891
3892 static void
3893 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
3894 void *aux)
3895 {
3896 struct dp_netdev *dp = get_dp_netdev(dpif);
3897 dp->dp_purge_aux = aux;
3898 dp->dp_purge_cb = cb;
3899 }
3900
3901 static void
3902 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
3903 void *aux)
3904 {
3905 struct dp_netdev *dp = get_dp_netdev(dpif);
3906 dp->upcall_aux = aux;
3907 dp->upcall_cb = cb;
3908 }
3909
3910 static struct tx_port *
3911 pmd_tx_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
3912 odp_port_t port_no)
3913 {
3914 return tx_port_lookup(&pmd->port_cache, port_no);
3915 }
3916
3917 static int
3918 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
3919 const struct nlattr *attr,
3920 struct dp_packet_batch *batch)
3921 {
3922 struct tx_port *tun_port;
3923 const struct ovs_action_push_tnl *data;
3924 int err;
3925
3926 data = nl_attr_get(attr);
3927
3928 tun_port = pmd_tx_port_cache_lookup(pmd, u32_to_odp(data->tnl_port));
3929 if (!tun_port) {
3930 err = -EINVAL;
3931 goto error;
3932 }
3933 err = netdev_push_header(tun_port->netdev, batch, data);
3934 if (!err) {
3935 return 0;
3936 }
3937 error:
3938 dp_packet_delete_batch(batch, true);
3939 return err;
3940 }
3941
3942 static void
3943 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
3944 struct dp_packet *packet, bool may_steal,
3945 struct flow *flow, ovs_u128 *ufid,
3946 struct ofpbuf *actions,
3947 const struct nlattr *userdata)
3948 {
3949 struct dp_packet_batch b;
3950 int error;
3951
3952 ofpbuf_clear(actions);
3953
3954 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
3955 DPIF_UC_ACTION, userdata, actions,
3956 NULL);
3957 if (!error || error == ENOSPC) {
3958 packet_batch_init_packet(&b, packet);
3959 dp_netdev_execute_actions(pmd, &b, may_steal,
3960 actions->data, actions->size);
3961 } else if (may_steal) {
3962 dp_packet_delete(packet);
3963 }
3964 }
3965
3966 static void
3967 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
3968 const struct nlattr *a, bool may_steal)
3969 {
3970 struct dp_netdev_execute_aux *aux = aux_;
3971 uint32_t *depth = recirc_depth_get();
3972 struct dp_netdev_pmd_thread *pmd = aux->pmd;
3973 struct dp_netdev *dp = pmd->dp;
3974 int type = nl_attr_type(a);
3975 struct tx_port *p;
3976
3977 switch ((enum ovs_action_attr)type) {
3978 case OVS_ACTION_ATTR_OUTPUT:
3979 p = pmd_tx_port_cache_lookup(pmd, u32_to_odp(nl_attr_get_u32(a)));
3980 if (OVS_LIKELY(p)) {
3981 int tx_qid;
3982
3983 atomic_read_relaxed(&pmd->tx_qid, &tx_qid);
3984
3985 netdev_send(p->netdev, tx_qid, packets_, may_steal);
3986 return;
3987 }
3988 break;
3989
3990 case OVS_ACTION_ATTR_TUNNEL_PUSH:
3991 if (*depth < MAX_RECIRC_DEPTH) {
3992 struct dp_packet_batch tnl_pkt;
3993 int err;
3994
3995 if (!may_steal) {
3996 dp_packet_batch_clone(&tnl_pkt, packets_);
3997 packets_ = &tnl_pkt;
3998 }
3999
4000 err = push_tnl_action(pmd, a, packets_);
4001 if (!err) {
4002 (*depth)++;
4003 dp_netdev_recirculate(pmd, packets_);
4004 (*depth)--;
4005 }
4006 return;
4007 }
4008 break;
4009
4010 case OVS_ACTION_ATTR_TUNNEL_POP:
4011 if (*depth < MAX_RECIRC_DEPTH) {
4012 odp_port_t portno = u32_to_odp(nl_attr_get_u32(a));
4013
4014 p = pmd_tx_port_cache_lookup(pmd, portno);
4015 if (p) {
4016 struct dp_packet_batch tnl_pkt;
4017 int i;
4018
4019 if (!may_steal) {
4020 dp_packet_batch_clone(&tnl_pkt, packets_);
4021 packets_ = &tnl_pkt;
4022 }
4023
4024 netdev_pop_header(p->netdev, packets_);
4025 if (!packets_->count) {
4026 return;
4027 }
4028
4029 for (i = 0; i < packets_->count; i++) {
4030 packets_->packets[i]->md.in_port.odp_port = portno;
4031 }
4032
4033 (*depth)++;
4034 dp_netdev_recirculate(pmd, packets_);
4035 (*depth)--;
4036 return;
4037 }
4038 }
4039 break;
4040
4041 case OVS_ACTION_ATTR_USERSPACE:
4042 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
4043 struct dp_packet **packets = packets_->packets;
4044 const struct nlattr *userdata;
4045 struct ofpbuf actions;
4046 struct flow flow;
4047 ovs_u128 ufid;
4048 int i;
4049
4050 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
4051 ofpbuf_init(&actions, 0);
4052
4053 for (i = 0; i < packets_->count; i++) {
4054 flow_extract(packets[i], &flow);
4055 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
4056 dp_execute_userspace_action(pmd, packets[i], may_steal, &flow,
4057 &ufid, &actions, userdata);
4058 }
4059 ofpbuf_uninit(&actions);
4060 fat_rwlock_unlock(&dp->upcall_rwlock);
4061
4062 return;
4063 }
4064 break;
4065
4066 case OVS_ACTION_ATTR_RECIRC:
4067 if (*depth < MAX_RECIRC_DEPTH) {
4068 struct dp_packet_batch recirc_pkts;
4069 int i;
4070
4071 if (!may_steal) {
4072 dp_packet_batch_clone(&recirc_pkts, packets_);
4073 packets_ = &recirc_pkts;
4074 }
4075
4076 for (i = 0; i < packets_->count; i++) {
4077 packets_->packets[i]->md.recirc_id = nl_attr_get_u32(a);
4078 }
4079
4080 (*depth)++;
4081 dp_netdev_recirculate(pmd, packets_);
4082 (*depth)--;
4083
4084 return;
4085 }
4086
4087 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
4088 break;
4089
4090 case OVS_ACTION_ATTR_CT:
4091 /* If a flow with this action is slow-pathed, datapath assistance is
4092 * required to implement it. However, we don't support this action
4093 * in the userspace datapath. */
4094 VLOG_WARN("Cannot execute conntrack action in userspace.");
4095 break;
4096
4097 case OVS_ACTION_ATTR_PUSH_VLAN:
4098 case OVS_ACTION_ATTR_POP_VLAN:
4099 case OVS_ACTION_ATTR_PUSH_MPLS:
4100 case OVS_ACTION_ATTR_POP_MPLS:
4101 case OVS_ACTION_ATTR_SET:
4102 case OVS_ACTION_ATTR_SET_MASKED:
4103 case OVS_ACTION_ATTR_SAMPLE:
4104 case OVS_ACTION_ATTR_HASH:
4105 case OVS_ACTION_ATTR_UNSPEC:
4106 case __OVS_ACTION_ATTR_MAX:
4107 OVS_NOT_REACHED();
4108 }
4109
4110 dp_packet_delete_batch(packets_, may_steal);
4111 }
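
/* Added note (not part of the original source): dp_execute_cb() is the
 * callback passed to odp_execute_actions(), so it only sees action types that
 * need datapath assistance (output, tunnel push/pop, userspace, recirc, ct).
 * Generic actions such as push_vlan or set_masked are executed inside
 * odp-execute itself, which is why they hit OVS_NOT_REACHED() here.  Every
 * case that fully consumes the batch returns early; falling out of the switch
 * means the action could not be applied, and the batch is deleted if
 * 'may_steal' allows it. */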
4112
4113 static void
4114 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
4115 struct dp_packet_batch *packets,
4116 bool may_steal,
4117 const struct nlattr *actions, size_t actions_len)
4118 {
4119 struct dp_netdev_execute_aux aux = { pmd };
4120
4121 odp_execute_actions(&aux, packets, may_steal, actions,
4122 actions_len, dp_execute_cb);
4123 }
4124
4125 const struct dpif_class dpif_netdev_class = {
4126 "netdev",
4127 dpif_netdev_init,
4128 dpif_netdev_enumerate,
4129 dpif_netdev_port_open_type,
4130 dpif_netdev_open,
4131 dpif_netdev_close,
4132 dpif_netdev_destroy,
4133 dpif_netdev_run,
4134 dpif_netdev_wait,
4135 dpif_netdev_get_stats,
4136 dpif_netdev_port_add,
4137 dpif_netdev_port_del,
4138 dpif_netdev_port_query_by_number,
4139 dpif_netdev_port_query_by_name,
4140 NULL, /* port_get_pid */
4141 dpif_netdev_port_dump_start,
4142 dpif_netdev_port_dump_next,
4143 dpif_netdev_port_dump_done,
4144 dpif_netdev_port_poll,
4145 dpif_netdev_port_poll_wait,
4146 dpif_netdev_flow_flush,
4147 dpif_netdev_flow_dump_create,
4148 dpif_netdev_flow_dump_destroy,
4149 dpif_netdev_flow_dump_thread_create,
4150 dpif_netdev_flow_dump_thread_destroy,
4151 dpif_netdev_flow_dump_next,
4152 dpif_netdev_operate,
4153 NULL, /* recv_set */
4154 NULL, /* handlers_set */
4155 dpif_netdev_pmd_set,
4156 dpif_netdev_queue_to_priority,
4157 NULL, /* recv */
4158 NULL, /* recv_wait */
4159 NULL, /* recv_purge */
4160 dpif_netdev_register_dp_purge_cb,
4161 dpif_netdev_register_upcall_cb,
4162 dpif_netdev_enable_upcall,
4163 dpif_netdev_disable_upcall,
4164 dpif_netdev_get_datapath_version,
4165 NULL, /* ct_dump_start */
4166 NULL, /* ct_dump_next */
4167 NULL, /* ct_dump_done */
4168 NULL, /* ct_flush */
4169 };
4170
4171 static void
4172 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
4173 const char *argv[], void *aux OVS_UNUSED)
4174 {
4175 struct dp_netdev_port *port;
4176 struct dp_netdev *dp;
4177 odp_port_t port_no;
4178
4179 ovs_mutex_lock(&dp_netdev_mutex);
4180 dp = shash_find_data(&dp_netdevs, argv[1]);
4181 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
4182 ovs_mutex_unlock(&dp_netdev_mutex);
4183 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
4184 return;
4185 }
4186 ovs_refcount_ref(&dp->ref_cnt);
4187 ovs_mutex_unlock(&dp_netdev_mutex);
4188
4189 ovs_mutex_lock(&dp->port_mutex);
4190 if (get_port_by_name(dp, argv[2], &port)) {
4191 unixctl_command_reply_error(conn, "unknown port");
4192 goto exit;
4193 }
4194
4195 port_no = u32_to_odp(atoi(argv[3]));
4196 if (!port_no || port_no == ODPP_NONE) {
4197 unixctl_command_reply_error(conn, "bad port number");
4198 goto exit;
4199 }
4200 if (dp_netdev_lookup_port(dp, port_no)) {
4201 unixctl_command_reply_error(conn, "port number already in use");
4202 goto exit;
4203 }
4204
4205 /* Remove port. */
4206 hmap_remove(&dp->ports, &port->node);
4207 dp_netdev_del_port_from_all_pmds(dp, port);
4208
4209 /* Reinsert with new port number. */
4210 port->port_no = port_no;
4211 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
4212 dp_netdev_add_port_to_pmds(dp, port);
4213
4214 seq_change(dp->port_seq);
4215 unixctl_command_reply(conn, NULL);
4216
4217 exit:
4218 ovs_mutex_unlock(&dp->port_mutex);
4219 dp_netdev_unref(dp);
4220 }
4221
4222 static void
4223 dpif_dummy_register__(const char *type)
4224 {
4225 struct dpif_class *class;
4226
4227 class = xmalloc(sizeof *class);
4228 *class = dpif_netdev_class;
4229 class->type = xstrdup(type);
4230 dp_register_provider(class);
4231 }
4232
4233 static void
4234 dpif_dummy_override(const char *type)
4235 {
4236 int error;
4237
4238 /*
4239 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
4240 * a userland-only build. It's useful for the testsuite.
4241 */
4242 error = dp_unregister_provider(type);
4243 if (error == 0 || error == EAFNOSUPPORT) {
4244 dpif_dummy_register__(type);
4245 }
4246 }
4247
4248 void
4249 dpif_dummy_register(enum dummy_level level)
4250 {
4251 if (level == DUMMY_OVERRIDE_ALL) {
4252 struct sset types;
4253 const char *type;
4254
4255 sset_init(&types);
4256 dp_enumerate_types(&types);
4257 SSET_FOR_EACH (type, &types) {
4258 dpif_dummy_override(type);
4259 }
4260 sset_destroy(&types);
4261 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
4262 dpif_dummy_override("system");
4263 }
4264
4265 dpif_dummy_register__("dummy");
4266
4267 unixctl_command_register("dpif-dummy/change-port-number",
4268 "dp port new-number",
4269 3, 3, dpif_dummy_change_port_number, NULL);
4270 }
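
/* Example use of the unixctl command registered above (illustrative only; the
 * datapath and port names are hypothetical):
 *
 *     ovs-appctl dpif-dummy/change-port-number br0 p1 100
 *
 * This re-inserts port "p1" of datapath "br0" into the port hmap under ODP
 * port number 100 and updates the PMD threads, as implemented in
 * dpif_dummy_change_port_number() above. */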
4271 \f
4272 /* Datapath Classifier. */
4273
4274 /* A set of rules that all have the same fields wildcarded. */
4275 struct dpcls_subtable {
4276 /* The fields are only used by writers. */
4277 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
4278
4279 /* These fields are accessed by readers. */
4280 struct cmap rules; /* Contains "struct dpcls_rule"s. */
4281 struct netdev_flow_key mask; /* Wildcards for fields (const). */
4282 /* 'mask' must be the last field, additional space is allocated here. */
4283 };
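
/* Added note (not part of the original source): because 'mask' is the last
 * member, dpcls_create_subtable() below allocates the struct with enough
 * extra space for the mask's inline miniflow values, and the subtable is
 * indexed in 'subtables_map' by 'mask.hash'. */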
4284
4285 /* Initializes 'cls' as a classifier that initially contains no classification
4286 * rules. */
4287 static void
4288 dpcls_init(struct dpcls *cls)
4289 {
4290 cmap_init(&cls->subtables_map);
4291 pvector_init(&cls->subtables);
4292 }
4293
4294 static void
4295 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
4296 {
4297 pvector_remove(&cls->subtables, subtable);
4298 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
4299 subtable->mask.hash);
4300 cmap_destroy(&subtable->rules);
4301 ovsrcu_postpone(free, subtable);
4302 }
4303
4304 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
4305 * caller's responsibility.
4306 * May only be called after all the readers have been terminated. */
4307 static void
4308 dpcls_destroy(struct dpcls *cls)
4309 {
4310 if (cls) {
4311 struct dpcls_subtable *subtable;
4312
4313 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
4314 ovs_assert(cmap_count(&subtable->rules) == 0);
4315 dpcls_destroy_subtable(cls, subtable);
4316 }
4317 cmap_destroy(&cls->subtables_map);
4318 pvector_destroy(&cls->subtables);
4319 }
4320 }
4321
4322 static struct dpcls_subtable *
4323 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4324 {
4325 struct dpcls_subtable *subtable;
4326
4327 /* Need to add one. */
4328 subtable = xmalloc(sizeof *subtable
4329 - sizeof subtable->mask.mf + mask->len);
4330 cmap_init(&subtable->rules);
4331 netdev_flow_key_clone(&subtable->mask, mask);
4332 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
4333 pvector_insert(&cls->subtables, subtable, 0);
4334 pvector_publish(&cls->subtables);
4335
4336 return subtable;
4337 }
4338
4339 static inline struct dpcls_subtable *
4340 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
4341 {
4342 struct dpcls_subtable *subtable;
4343
4344 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
4345 &cls->subtables_map) {
4346 if (netdev_flow_key_equal(&subtable->mask, mask)) {
4347 return subtable;
4348 }
4349 }
4350 return dpcls_create_subtable(cls, mask);
4351 }
4352
4353 /* Insert 'rule' into 'cls'. */
4354 static void
4355 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
4356 const struct netdev_flow_key *mask)
4357 {
4358 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
4359
4360 rule->mask = &subtable->mask;
4361 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
4362 }
4363
4364 /* Removes 'rule' from 'cls'. 'rule' itself is not freed. */
4365 static void
4366 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
4367 {
4368 struct dpcls_subtable *subtable;
4369
4370 ovs_assert(rule->mask);
4371
4372 INIT_CONTAINER(subtable, rule->mask, mask);
4373
4374 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
4375 == 0) {
4376 dpcls_destroy_subtable(cls, subtable);
4377 pvector_publish(&cls->subtables);
4378 }
4379 }
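
/* Sketch of the dpcls lifecycle (added for illustration, not part of the
 * original source; 'cls', 'rule', 'mask', 'keys', 'rules' and 'cnt' are
 * hypothetical variables of the types used in this file):
 *
 *     dpcls_init(&cls);
 *     dpcls_insert(&cls, &rule, &mask);       (groups 'rule' by its mask)
 *     dpcls_lookup(&cls, keys, rules, cnt);   (batched lookup, defined below)
 *     dpcls_remove(&cls, &rule);              (may free an emptied subtable)
 *     dpcls_destroy(&cls);                    (all rules already removed)
 *
 * In this file inserts and removals happen under 'pmd->flow_mutex', while
 * lookups rely on the RCU-friendly cmap and pvector and may run
 * concurrently. */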
4380
4381 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit in
4382 * 'rule->mask' the corresponding bits in 'rule->flow' and 'target' are equal. */
4383 static inline bool
4384 dpcls_rule_matches_key(const struct dpcls_rule *rule,
4385 const struct netdev_flow_key *target)
4386 {
4387 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
4388 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
4389 uint64_t value;
4390
4391 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
4392 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
4393 return false;
4394 }
4395 }
4396 return true;
4397 }
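
/* Worked example (added for illustration, not part of the original source),
 * using a single hypothetical 64-bit unit of the miniflows; the rule's flow
 * values are stored already masked:
 *
 *     rule->mask value:  0x000000000000ffff
 *     rule->flow value:  0x0000000000001234
 *     target value:      0xabcdef0000001234
 *
 *     0xabcdef0000001234 & 0x000000000000ffff == 0x0000000000001234  => match
 *
 * A single mismatching unit makes the loop above return false immediately. */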
4398
4399 /* For each miniflow in 'keys', performs a classifier lookup, writing the result
4400 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
4401 * NULL, it is skipped.
4402 *
4403 * This function is optimized for use in the userspace datapath and therefore
4404 * does not implement a lot of features available in the standard
4405 * classifier_lookup() function. Specifically, it does not implement
4406 * priorities, instead returning any rule which matches the flow.
4407 *
4408 * Returns true if all flows found a corresponding rule. */
4409 static bool
4410 dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
4411 struct dpcls_rule **rules, const size_t cnt)
4412 {
4413 /* The batch size 16 was experimentally found faster than 8 or 32. */
4414 typedef uint16_t map_type;
4415 #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
4416
4417 #if !defined(__CHECKER__) && !defined(_WIN32)
4418 const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
4419 #else
4420 enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
4421 #endif
4422 map_type maps[N_MAPS];
4423 struct dpcls_subtable *subtable;
4424
4425 memset(maps, 0xff, sizeof maps);
4426 if (cnt % MAP_BITS) {
4427 maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
4428 }
4429 memset(rules, 0, cnt * sizeof *rules);
4430
4431 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
4432 const struct netdev_flow_key *mkeys = keys;
4433 struct dpcls_rule **mrules = rules;
4434 map_type remains = 0;
4435 int m;
4436
4437 BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
4438
4439 for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
4440 uint32_t hashes[MAP_BITS];
4441 const struct cmap_node *nodes[MAP_BITS];
4442 unsigned long map = maps[m];
4443 int i;
4444
4445 if (!map) {
4446 continue; /* Skip empty maps. */
4447 }
4448
4449 /* Compute hashes for the remaining keys. */
4450 ULLONG_FOR_EACH_1(i, map) {
4451 hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
4452 &subtable->mask);
4453 }
4454 /* Lookup. */
4455 map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
4456 /* Check results. */
4457 ULLONG_FOR_EACH_1(i, map) {
4458 struct dpcls_rule *rule;
4459
4460 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
4461 if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
4462 mrules[i] = rule;
4463 goto next;
4464 }
4465 }
4466 ULLONG_SET0(map, i); /* Did not match. */
4467 next:
4468 ; /* Keep Sparse happy. */
4469 }
4470 maps[m] &= ~map; /* Clear the found rules. */
4471 remains |= maps[m];
4472 }
4473 if (!remains) {
4474 return true; /* All found. */
4475 }
4476 }
4477 return false; /* Some misses. */
4478 }
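
/* Worked example of the bitmap bookkeeping above (added for illustration, not
 * part of the original source), assuming a hypothetical batch of cnt = 20
 * keys with MAP_BITS = 16:
 *
 *     N_MAPS = 2, maps = { 0xffff, 0x000f }   (the 12 extra bits are cleared)
 *
 * For each subtable, cmap_find_batch() is attempted only on the keys whose
 * bit is still set, and every key that matched a rule has its bit cleared
 * from 'maps'.  Once 'remains' is 0 the function returns true without
 * visiting the remaining subtables; any bit still set after the last subtable
 * means at least one miss and the function returns false. */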