1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19 #include "dpif-netdev-private.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <net/if.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/ioctl.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35
36 #include "bitmap.h"
37 #include "cmap.h"
38 #include "conntrack.h"
39 #include "conntrack-tp.h"
40 #include "coverage.h"
41 #include "ct-dpif.h"
42 #include "csum.h"
43 #include "dp-packet.h"
44 #include "dpif.h"
45 #include "dpif-netdev-perf.h"
46 #include "dpif-provider.h"
47 #include "dummy.h"
48 #include "fat-rwlock.h"
49 #include "flow.h"
50 #include "hmapx.h"
51 #include "id-pool.h"
52 #include "ipf.h"
53 #include "netdev.h"
54 #include "netdev-offload.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
57 #include "netlink.h"
58 #include "odp-execute.h"
59 #include "odp-util.h"
60 #include "openvswitch/dynamic-string.h"
61 #include "openvswitch/list.h"
62 #include "openvswitch/match.h"
63 #include "openvswitch/ofp-parse.h"
64 #include "openvswitch/ofp-print.h"
65 #include "openvswitch/ofpbuf.h"
66 #include "openvswitch/shash.h"
67 #include "openvswitch/vlog.h"
68 #include "ovs-numa.h"
69 #include "ovs-rcu.h"
70 #include "packets.h"
71 #include "openvswitch/poll-loop.h"
72 #include "pvector.h"
73 #include "random.h"
74 #include "seq.h"
75 #include "smap.h"
76 #include "sset.h"
77 #include "timeval.h"
78 #include "tnl-neigh-cache.h"
79 #include "tnl-ports.h"
80 #include "unixctl.h"
81 #include "util.h"
82 #include "uuid.h"
83
84 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
85
86 /* Auto Load Balancing Defaults */
87 #define ALB_ACCEPTABLE_IMPROVEMENT 25
88 #define ALB_PMD_LOAD_THRESHOLD 95
89 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
90 #define MIN_TO_MSEC 60000
91
92 #define FLOW_DUMP_MAX_BATCH 50
93 /* Use per thread recirc_depth to prevent recirculation loop. */
94 #define MAX_RECIRC_DEPTH 6
95 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
96
97 /* Use instant packet send by default. */
98 #define DEFAULT_TX_FLUSH_INTERVAL 0
99
100 /* Configuration parameters. */
101 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
102 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
103 enum { N_METER_LOCKS = 64 }; /* Number of locks guarding meters. */
104
105 COVERAGE_DEFINE(datapath_drop_meter);
106 COVERAGE_DEFINE(datapath_drop_upcall_error);
107 COVERAGE_DEFINE(datapath_drop_lock_error);
108 COVERAGE_DEFINE(datapath_drop_userspace_action_error);
109 COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
110 COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
111 COVERAGE_DEFINE(datapath_drop_recirc_error);
112 COVERAGE_DEFINE(datapath_drop_invalid_port);
113 COVERAGE_DEFINE(datapath_drop_invalid_bond);
114 COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
115 COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
116
117 /* Protects against changes to 'dp_netdevs'. */
118 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
119
120 /* Contains all 'struct dp_netdev's. */
121 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
122 = SHASH_INITIALIZER(&dp_netdevs);
123
124 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
125
126 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
127 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
128 | CS_SRC_NAT | CS_DST_NAT)
129 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
130
131 static struct odp_support dp_netdev_support = {
132 .max_vlan_headers = SIZE_MAX,
133 .max_mpls_depth = SIZE_MAX,
134 .recirc = true,
135 .ct_state = true,
136 .ct_zone = true,
137 .ct_mark = true,
138 .ct_label = true,
139 .ct_state_nat = true,
140 .ct_orig_tuple = true,
141 .ct_orig_tuple6 = true,
142 };
143
144 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
145 *
146 * Exact match cache for frequently used flows
147 *
148 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
149 * search its entries for a miniflow that matches exactly the miniflow of the
150 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
151 *
152 * A cache entry holds a reference to its 'dp_netdev_flow'.
153 *
154 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
155 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
156 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
157 * value is the index of a cache entry where the miniflow could be.
158 *
159 *
160 * Signature match cache (SMC)
161 *
162 * This cache stores a 16-bit signature for each flow without storing keys, and
163 * stores the corresponding 16-bit flow_table index to the 'dp_netdev_flow'.
164 * Each flow thus occupies only 32 bits, which is much more memory-efficient
165 * than the EMC. SMC uses a set-associative design in which each bucket
166 * contains SMC_ENTRY_PER_BUCKET entries.
167 * Since a 16-bit flow_table index is used, if there are more than 2^16
168 * dp_netdev_flows, SMC cannot index the extra flows and will miss them.
169 *
170 *
171 * Thread-safety
172 * =============
173 *
174 * Each pmd_thread has its own private exact match cache.
175 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
176 */
177
178 #define EM_FLOW_HASH_SHIFT 13
179 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
180 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
181 #define EM_FLOW_HASH_SEGS 2
182
183 /* SMC uses a set-associative design. A bucket contains a set of entries that
184 * a flow item can occupy. For now, it uses one hash function rather than two
185 * as for the EMC design. */
186 #define SMC_ENTRY_PER_BUCKET 4
187 #define SMC_ENTRIES (1u << 20)
188 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
189 #define SMC_MASK (SMC_BUCKET_CNT - 1)
190
191 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
192 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
193 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
194 DEFAULT_EM_FLOW_INSERT_INV_PROB)
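/* Worked example: with the default inverse probability of 100,
 * DEFAULT_EM_FLOW_INSERT_MIN is UINT32_MAX / 100, roughly 42.9 million, which
 * is about 1% of the 32-bit range.  Comparing a uniformly distributed 32-bit
 * random value against this threshold therefore admits roughly one in every
 * hundred candidate flows into the EMC (the comparison itself is done on the
 * EMC insertion path). */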
195
196 struct emc_entry {
197 struct dp_netdev_flow *flow;
198 struct netdev_flow_key key; /* key.hash used for emc hash value. */
199 };
200
201 struct emc_cache {
202 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
203 int sweep_idx; /* For emc_cache_slow_sweep(). */
204 };
205
206 struct smc_bucket {
207 uint16_t sig[SMC_ENTRY_PER_BUCKET];
208 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
209 };
210
211 /* Signature match cache, as distinct from the EMC cache. */
212 struct smc_cache {
213 struct smc_bucket buckets[SMC_BUCKET_CNT];
214 };
215
216 struct dfc_cache {
217 struct emc_cache emc_cache;
218 struct smc_cache smc_cache;
219 };
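/* Size sketch derived from the definitions above: each smc_bucket holds
 * SMC_ENTRY_PER_BUCKET (4) sig/flow_idx pairs of 2 bytes each, i.e. 16 bytes,
 * and SMC_BUCKET_CNT is (1 << 20) / 4 = 262144 buckets, so one smc_cache
 * occupies 4 MiB per PMD thread.  The emc_cache adds EM_FLOW_HASH_ENTRIES
 * (8192) emc_entry slots on top of that. */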
220
221 /* Iterate in the exact match cache through every entry that might contain a
222 * miniflow with hash 'HASH'. */
223 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
224 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
225 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
226 i__ < EM_FLOW_HASH_SEGS; \
227 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
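/* Worked example of the hash segmentation used above, with EM_FLOW_HASH_SHIFT
 * of 13 and EM_FLOW_HASH_SEGS of 2: a packet hash of 0x0035a9c2 yields the
 * candidate entry indexes 0x09c2 (bits 0..12) and 0x01ad (bits 13..25).  A
 * minimal lookup sketch using the macro, assuming 'cache' is a
 * 'struct emc_cache *' and 'key' a 'struct netdev_flow_key *'
 * (keys_are_equal() is a hypothetical helper, not part of this file):
 *
 *     struct emc_entry *entry;
 *
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, entry, key->hash) {
 *         if (entry->key.hash == key->hash && emc_entry_alive(entry)
 *             && keys_are_equal(&entry->key, key)) {
 *             return entry->flow;
 *         }
 *     }
 *     return NULL;
 */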
228 \f
229 /* Simple non-wildcarding single-priority classifier. */
230
231 /* Time in microseconds between successive optimizations of the dpcls
232 * subtable vector */
233 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
234
235 /* Time in microseconds of the interval in which rxq processing cycles used
236 * in rxq to pmd assignments are measured and stored. */
237 #define PMD_RXQ_INTERVAL_LEN 10000000LL
238
239 /* Number of intervals for which cycles are stored
240 * and used during rxq to pmd assignment. */
241 #define PMD_RXQ_INTERVAL_MAX 6
242
243 /* Time in microseconds to try RCU quiescing. */
244 #define PMD_RCU_QUIESCE_INTERVAL 10000LL
245
246 struct dpcls {
247 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
248 odp_port_t in_port;
249 struct cmap subtables_map;
250 struct pvector subtables;
251 };
252
253 /* Data structure to keep packet order till fastpath processing. */
254 struct dp_packet_flow_map {
255 struct dp_packet *packet;
256 struct dp_netdev_flow *flow;
257 uint16_t tcp_flags;
258 };
259
260 static void dpcls_init(struct dpcls *);
261 static void dpcls_destroy(struct dpcls *);
262 static void dpcls_sort_subtable_vector(struct dpcls *);
263 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
264 const struct netdev_flow_key *mask);
265 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
266 static bool dpcls_lookup(struct dpcls *cls,
267 const struct netdev_flow_key *keys[],
268 struct dpcls_rule **rules, size_t cnt,
269 int *num_lookups_p);
270
271 /* Set of supported meter flags */
272 #define DP_SUPPORTED_METER_FLAGS_MASK \
273 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
274
275 /* Set of supported meter band types */
276 #define DP_SUPPORTED_METER_BAND_TYPES \
277 ( 1 << OFPMBT13_DROP )
278
279 struct dp_meter_band {
280 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
281 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
282 uint64_t packet_count;
283 uint64_t byte_count;
284 };
285
286 struct dp_meter {
287 uint16_t flags;
288 uint16_t n_bands;
289 uint32_t max_delta_t;
290 uint64_t used;
291 uint64_t packet_count;
292 uint64_t byte_count;
293 struct dp_meter_band bands[];
294 };
295
296 struct pmd_auto_lb {
297 bool auto_lb_requested; /* Auto load balancing requested by user. */
298 bool is_enabled; /* Current status of Auto load balancing. */
299 uint64_t rebalance_intvl;
300 uint64_t rebalance_poll_timer;
301 };
302
303 /* Datapath based on the network device interface from netdev.h.
304 *
305 *
306 * Thread-safety
307 * =============
308 *
309 * Some members, marked 'const', are immutable. Accessing other members
310 * requires synchronization, as noted in more detail below.
311 *
312 * Acquisition order is, from outermost to innermost:
313 *
314 * dp_netdev_mutex (global)
315 * port_mutex
316 * bond_mutex
317 * non_pmd_mutex
318 */
319 struct dp_netdev {
320 const struct dpif_class *const class;
321 const char *const name;
322 struct ovs_refcount ref_cnt;
323 atomic_flag destroyed;
324
325 /* Ports.
326 *
327 * Any lookup into 'ports' or any access to the dp_netdev_ports found
328 * through 'ports' requires taking 'port_mutex'. */
329 struct ovs_mutex port_mutex;
330 struct hmap ports;
331 struct seq *port_seq; /* Incremented whenever a port changes. */
332
333 /* The time that a packet can wait in output batch for sending. */
334 atomic_uint32_t tx_flush_interval;
335
336 /* Meters. */
337 struct ovs_mutex meter_locks[N_METER_LOCKS];
338 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
339
340 /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
341 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
342 /* Enable collection of PMD performance metrics. */
343 atomic_bool pmd_perf_metrics;
344 /* Enable the SMC cache from ovsdb config */
345 atomic_bool smc_enable_db;
346
347 /* Protects access to ofproto-dpif-upcall interface during revalidator
348 * thread synchronization. */
349 struct fat_rwlock upcall_rwlock;
350 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
351 void *upcall_aux;
352
353 /* Callback function for notifying the purging of dp flows (during
354 * pmd reset or deletion). */
355 dp_purge_callback *dp_purge_cb;
356 void *dp_purge_aux;
357
358 /* Stores all 'struct dp_netdev_pmd_thread's. */
359 struct cmap poll_threads;
360 /* id pool for per thread static_tx_qid. */
361 struct id_pool *tx_qid_pool;
362 struct ovs_mutex tx_qid_pool_mutex;
363 /* Use measured cycles for rxq to pmd assignment. */
364 bool pmd_rxq_assign_cyc;
365
366 /* Protects the access of the 'struct dp_netdev_pmd_thread'
367 * instance for non-pmd thread. */
368 struct ovs_mutex non_pmd_mutex;
369
370 /* Each pmd thread will store its pointer to
371 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
372 ovsthread_key_t per_pmd_key;
373
374 struct seq *reconfigure_seq;
375 uint64_t last_reconfigure_seq;
376
377 /* Cpu mask for pin of pmd threads. */
378 char *pmd_cmask;
379
380 uint64_t last_tnl_conf_seq;
381
382 struct conntrack *conntrack;
383 struct pmd_auto_lb pmd_alb;
384
385 /* Bonds. */
386 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
387 struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
388 };
389
390 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
391 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
392 {
393 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
394 }
395
396 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
397 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
398 {
399 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
400 }
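/* A minimal usage sketch for the two helpers above (the body is illustrative;
 * the real callers appear further down in this file):
 *
 *     meter_lock(dp, meter_id);
 *     if (dp->meters[meter_id]) {
 *         ...read or update the meter's counters...
 *     }
 *     meter_unlock(dp, meter_id);
 *
 * Note that meter ids share locks (meter_id % N_METER_LOCKS), so holding the
 * lock for one meter id may also serialize access to other ids that hash to
 * the same lock. */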
401
402
403 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
404 odp_port_t)
405 OVS_REQUIRES(dp->port_mutex);
406
407 enum rxq_cycles_counter_type {
408 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
409 processing packets during the current
410 interval. */
411 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
412 during rxq to pmd assignment. */
413 RXQ_N_CYCLES
414 };
415
416 enum {
417 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
418 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
419 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
420 };
421
422 struct dp_flow_offload_item {
423 struct dp_netdev_pmd_thread *pmd;
424 struct dp_netdev_flow *flow;
425 int op;
426 struct match match;
427 struct nlattr *actions;
428 size_t actions_len;
429
430 struct ovs_list node;
431 };
432
433 struct dp_flow_offload {
434 struct ovs_mutex mutex;
435 struct ovs_list list;
436 pthread_cond_t cond;
437 };
438
439 static struct dp_flow_offload dp_flow_offload = {
440 .mutex = OVS_MUTEX_INITIALIZER,
441 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
442 };
443
444 static struct ovsthread_once offload_thread_once
445 = OVSTHREAD_ONCE_INITIALIZER;
446
447 #define XPS_TIMEOUT 500000LL /* In microseconds. */
448
449 /* Contained by struct dp_netdev_port's 'rxqs' member. */
450 struct dp_netdev_rxq {
451 struct dp_netdev_port *port;
452 struct netdev_rxq *rx;
453 unsigned core_id; /* Core to which this queue should be
454 pinned. OVS_CORE_UNSPEC if the
455 queue doesn't need to be pinned to a
456 particular core. */
457 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
458 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
459 bool is_vhost; /* Is rxq of a vhost port. */
460
461 /* Counters of cycles spent successfully polling and processing pkts. */
462 atomic_ullong cycles[RXQ_N_CYCLES];
463 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
464 sum them to yield the cycles used for an rxq. */
465 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
466 };
467
468 /* A port in a netdev-based datapath. */
469 struct dp_netdev_port {
470 odp_port_t port_no;
471 bool dynamic_txqs; /* If true XPS will be used. */
472 bool need_reconfigure; /* True if we should reconfigure netdev. */
473 struct netdev *netdev;
474 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
475 struct netdev_saved_flags *sf;
476 struct dp_netdev_rxq *rxqs;
477 unsigned n_rxq; /* Number of elements in 'rxqs' */
478 unsigned *txq_used; /* Number of threads that use each tx queue. */
479 struct ovs_mutex txq_used_mutex;
480 bool emc_enabled; /* If true EMC will be used. */
481 char *type; /* Port type as requested by user. */
482 char *rxq_affinity_list; /* Requested affinity of rx queues. */
483 };
484
485 /* Contained by struct dp_netdev_flow's 'stats' member. */
486 struct dp_netdev_flow_stats {
487 atomic_llong used; /* Last used time, in monotonic msecs. */
488 atomic_ullong packet_count; /* Number of packets matched. */
489 atomic_ullong byte_count; /* Number of bytes matched. */
490 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
491 };
492
493 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
494 *
495 *
496 * Thread-safety
497 * =============
498 *
499 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
500 * its pmd thread's classifier. The text below calls this classifier 'cls'.
501 *
502 * Motivation
503 * ----------
504 *
505 * The thread safety rules described here for "struct dp_netdev_flow" are
506 * motivated by two goals:
507 *
508 * - Prevent threads that read members of "struct dp_netdev_flow" from
509 * reading bad data due to changes by some thread concurrently modifying
510 * those members.
511 *
512 * - Prevent two threads making changes to members of a given "struct
513 * dp_netdev_flow" from interfering with each other.
514 *
515 *
516 * Rules
517 * -----
518 *
519 * A flow 'flow' may be accessed without a risk of being freed during an RCU
520 * grace period. Code that needs to hold onto a flow for a while
521 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
522 *
523 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
524 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
525 * from modification.
526 *
527 * Some members, marked 'const', are immutable. Accessing other members
528 * requires synchronization, as noted in more detail below.
529 */
530 struct dp_netdev_flow {
531 const struct flow flow; /* Unmasked flow that created this entry. */
532 /* Hash table index by unmasked flow. */
533 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
534 /* 'flow_table'. */
535 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
536 const ovs_u128 ufid; /* Unique flow identifier. */
537 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
538 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
539 /* flow. */
540
541 /* Number of references.
542 * The classifier owns one reference.
543 * Any thread trying to keep a rule from being freed should hold its own
544 * reference. */
545 struct ovs_refcount ref_cnt;
546
547 bool dead;
548 uint32_t mark; /* Unique flow mark assigned to a flow */
549
550 /* Statistics. */
551 struct dp_netdev_flow_stats stats;
552
553 /* Actions. */
554 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
555
556 /* While processing a group of input packets, the datapath uses the next
557 * member to store a pointer to the output batch for the flow. It is
558 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
559 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
560 struct packet_batch_per_flow *batch;
561
562 /* Packet classification. */
563 char *dp_extra_info; /* String to return in a flow dump/get. */
564 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
565 /* 'cr' must be the last member. */
566 };
567
568 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
569 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
570 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
571 struct flow *, bool);
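/* A minimal sketch of the reference-counting rules documented above
 * (hypothetical caller; the RCU-protected lookup itself is elided):
 *
 *     struct dp_netdev_flow *flow = ...looked up under RCU...;
 *
 *     if (flow && dp_netdev_flow_ref(flow)) {
 *         ...'flow' may now be used past the current RCU grace period...
 *         dp_netdev_flow_unref(flow);
 *     }
 */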
572
573 /* A set of datapath actions within a "struct dp_netdev_flow".
574 *
575 *
576 * Thread-safety
577 * =============
578 *
579 * A struct dp_netdev_actions 'actions' is protected with RCU. */
580 struct dp_netdev_actions {
581 /* These members are immutable: they do not change during the struct's
582 * lifetime. */
583 unsigned int size; /* Size of 'actions', in bytes. */
584 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
585 };
586
587 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
588 size_t);
589 struct dp_netdev_actions *dp_netdev_flow_get_actions(
590 const struct dp_netdev_flow *);
591 static void dp_netdev_actions_free(struct dp_netdev_actions *);
592
593 struct polled_queue {
594 struct dp_netdev_rxq *rxq;
595 odp_port_t port_no;
596 bool emc_enabled;
597 bool rxq_enabled;
598 uint64_t change_seq;
599 };
600
601 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
602 struct rxq_poll {
603 struct dp_netdev_rxq *rxq;
604 struct hmap_node node;
605 };
606
607 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
608 * 'tnl_port_cache' or 'tx_ports'. */
609 struct tx_port {
610 struct dp_netdev_port *port;
611 int qid;
612 long long last_used;
613 struct hmap_node node;
614 long long flush_time;
615 struct dp_packet_batch output_pkts;
616 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
617 };
618
619 /* Contained by struct tx_bond 'slave_buckets'. */
620 struct slave_entry {
621 odp_port_t slave_id;
622 atomic_ullong n_packets;
623 atomic_ullong n_bytes;
624 };
625
626 /* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
627 struct tx_bond {
628 struct cmap_node node;
629 uint32_t bond_id;
630 struct slave_entry slave_buckets[BOND_BUCKETS];
631 };
632
633 /* A set of properties for the current processing loop that is not directly
634 * associated with the pmd thread itself, but with the packets being
635 * processed or the short-term system configuration (for example, time).
636 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
637 struct dp_netdev_pmd_thread_ctx {
638 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
639 long long now;
640 /* RX queue from which last packet was received. */
641 struct dp_netdev_rxq *last_rxq;
642 /* EMC insertion probability context for the current processing cycle. */
643 uint32_t emc_insert_min;
644 };
645
646 /* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
647  * the performance overhead of interrupt processing. Therefore netdev cannot
648  * implement rx-wait for these devices. dpif-netdev needs to poll
649  * these devices to check their receive buffers, and each pmd thread polls
650  * the devices assigned to it.
651  *
652  * DPDK uses PMDs for accessing NICs.
653  *
654  * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
655  * I/O of all non-pmd threads. No actual thread is created
656  * for that instance.
657  *
658  * Each struct has its own flow cache and one classifier per managed ingress
659  * port. For packets received on an ingress port, a lookup is first done in
660  * the corresponding PMD thread's flow cache and, in case of a miss, then in
661  * the port's classifier. Packets are executed with the found
662  * actions in either case.
663  */
664 struct dp_netdev_pmd_thread {
665 struct dp_netdev *dp;
666 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
667 struct cmap_node node; /* In 'dp->poll_threads'. */
668
669 /* Per thread exact-match cache. Note, the instance for cpu core
670  * NON_PMD_CORE_ID can be accessed by multiple threads, and thus needs
671  * to be protected by 'non_pmd_mutex'. Every other instance
672 * will only be accessed by its own pmd thread. */
673 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
674
675 /* Flow-Table and classifiers
676 *
677 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
678 * changes to 'classifiers' must be made while still holding the
679 * 'flow_mutex'.
680 */
681 struct ovs_mutex flow_mutex;
682 struct cmap flow_table OVS_GUARDED; /* Flow table. */
683
684 /* One classifier per in_port polled by the pmd */
685 struct cmap classifiers;
686 /* Periodically sort subtable vectors according to hit frequencies */
687 long long int next_optimization;
688 /* End of the next time interval for which processing cycles
689 are stored for each polled rxq. */
690 long long int rxq_next_cycle_store;
691
692 /* Last interval timestamp. */
693 uint64_t intrvl_tsc_prev;
694 /* Last interval cycles. */
695 atomic_ullong intrvl_cycles;
696
697 /* Current context of the PMD thread. */
698 struct dp_netdev_pmd_thread_ctx ctx;
699
700 struct seq *reload_seq;
701 uint64_t last_reload_seq;
702
703 /* These are atomic variables used as a synchronization and configuration
704 * points for thread reload/exit.
705 *
706 * 'reload' atomic is the main one and it's used as a memory
707 * synchronization point for all other knobs and data.
708 *
709 * For a thread that requests PMD reload:
710 *
711 * * All changes that should be visible to the PMD thread must be made
712 * before setting the 'reload'. These changes could use any memory
713 * ordering model including 'relaxed'.
714 * * Setting the 'reload' atomic should occur in the same thread where
715 * all other PMD configuration options are updated.
716 * * Setting the 'reload' atomic should be done with 'release' memory
717 * ordering model or stricter. This will guarantee that all previous
718 * changes (including non-atomic and 'relaxed') will be visible to
719 * the PMD thread.
720 * To check that the reload is done, the thread should poll the 'reload'
721 * atomic until it becomes 'false'. Polling should use the 'acquire' memory
722 * ordering model or stricter. This ensures that the PMD thread has completed
723 * the reload process.
724 *
725 * For the PMD thread:
726 *
727 * * PMD thread should read 'reload' atomic with 'acquire' memory
728 * ordering model or stricter. This will guarantee that all changes
729 * made before setting the 'reload' in the requesting thread will be
730 * visible to the PMD thread.
731 * * All other configuration data could be read with any memory
732 * ordering model (including non-atomic and 'relaxed') but *only after*
733 * reading the 'reload' atomic set to 'true'.
734 * When the PMD reload is done, the PMD should (optionally) set all the below
735 * knobs except the 'reload' to their default ('false') values and,
736 * as the last mandatory step, set the 'reload' to 'false' using the
737 * 'release' memory ordering model or stricter. This will inform the
738 * requesting thread that the PMD has completed a reload cycle.
739 * (A short usage sketch of this protocol follows this structure.) */
740 atomic_bool reload; /* Do we need to reload ports? */
741 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
742 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
743 atomic_bool exit; /* For terminating the pmd thread. */
744
745 pthread_t thread;
746 unsigned core_id; /* CPU core id of this pmd thread. */
747 int numa_id; /* numa node id of this pmd thread. */
748 bool isolated;
749
750 /* Queue id used by this pmd thread to send packets on all netdevs if
751 * XPS is disabled for this netdev. All static_tx_qid's are unique and less
752 * than 'cmap_count(dp->poll_threads)'. */
753 uint32_t static_tx_qid;
754
755 /* Number of filled output batches. */
756 int n_output_batches;
757
758 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
759 /* List of rx queues to poll. */
760 struct hmap poll_list OVS_GUARDED;
761 /* Map of 'tx_port's used for transmission. Written by the main thread,
762 * read by the pmd thread. */
763 struct hmap tx_ports OVS_GUARDED;
764
765 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
766 /* Map of 'tx_bond's used for transmission. Written by the main thread
767 * and read by the pmd thread. */
768 struct cmap tx_bonds;
769
770 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
771 * ports (that support push_tunnel/pop_tunnel), the other contains ports
772 * with at least one txq (that support send). A port can be in both.
773 *
774 * There are two separate maps to make sure that we don't try to execute
775 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
776 *
777 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
778 * threads, and thus need to be protected by 'non_pmd_mutex'. Every
779 * other instance will only be accessed by its own pmd thread. */
780 struct hmap tnl_port_cache;
781 struct hmap send_port_cache;
782
783 /* Keep track of detailed PMD performance statistics. */
784 struct pmd_perf_stats perf_stats;
785
786 /* Stats from previous iteration used by automatic pmd
787 * load balance logic. */
788 uint64_t prev_stats[PMD_N_STATS];
789 atomic_count pmd_overloaded;
790
791 /* Set to true if the pmd thread needs to be reloaded. */
792 bool need_reload;
793
794 /* Next time when PMD should try RCU quiescing. */
795 long long next_rcu_quiesce;
796 };
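/* A condensed sketch of the reload protocol documented inside
 * 'struct dp_netdev_pmd_thread' above.  This is a simplification; the real
 * reload paths in this file carry more state ('wait_for_reload',
 * 'reload_tx_qid', etc.):
 *
 *     Requesting thread:
 *
 *         ...update the PMD configuration (any memory ordering)...
 *         atomic_store_explicit(&pmd->reload, true, memory_order_release);
 *         bool reloading;
 *         do {
 *             atomic_read_explicit(&pmd->reload, &reloading,
 *                                  memory_order_acquire);
 *         } while (reloading);
 *
 *     PMD thread:
 *
 *         bool reload;
 *         atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
 *         if (reload) {
 *             ...re-read configuration, reload ports...
 *             atomic_store_explicit(&pmd->reload, false,
 *                                   memory_order_release);
 *         }
 */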
797
798 /* Interface to netdev-based datapath. */
799 struct dpif_netdev {
800 struct dpif dpif;
801 struct dp_netdev *dp;
802 uint64_t last_port_seq;
803 };
804
805 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
806 struct dp_netdev_port **portp)
807 OVS_REQUIRES(dp->port_mutex);
808 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
809 struct dp_netdev_port **portp)
810 OVS_REQUIRES(dp->port_mutex);
811 static void dp_netdev_free(struct dp_netdev *)
812 OVS_REQUIRES(dp_netdev_mutex);
813 static int do_add_port(struct dp_netdev *dp, const char *devname,
814 const char *type, odp_port_t port_no)
815 OVS_REQUIRES(dp->port_mutex);
816 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
817 OVS_REQUIRES(dp->port_mutex);
818 static int dpif_netdev_open(const struct dpif_class *, const char *name,
819 bool create, struct dpif **);
820 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
821 struct dp_packet_batch *,
822 bool should_steal,
823 const struct flow *flow,
824 const struct nlattr *actions,
825 size_t actions_len);
826 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
827 struct dp_packet_batch *, odp_port_t port_no);
828 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
829 struct dp_packet_batch *);
830
831 static void dp_netdev_disable_upcall(struct dp_netdev *);
832 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
833 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
834 struct dp_netdev *dp, unsigned core_id,
835 int numa_id);
836 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
837 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
838 OVS_REQUIRES(dp->port_mutex);
839
840 static void *pmd_thread_main(void *);
841 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
842 unsigned core_id);
843 static struct dp_netdev_pmd_thread *
844 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
845 static void dp_netdev_del_pmd(struct dp_netdev *dp,
846 struct dp_netdev_pmd_thread *pmd);
847 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
848 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
849 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
850 struct dp_netdev_port *port)
851 OVS_REQUIRES(pmd->port_mutex);
852 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
853 struct tx_port *tx)
854 OVS_REQUIRES(pmd->port_mutex);
855 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
856 struct dp_netdev_rxq *rxq)
857 OVS_REQUIRES(pmd->port_mutex);
858 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
859 struct rxq_poll *poll)
860 OVS_REQUIRES(pmd->port_mutex);
861 static int
862 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
863 bool force);
864 static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
865 struct tx_bond *bond, bool update)
866 OVS_EXCLUDED(pmd->bond_mutex);
867 static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
868 uint32_t bond_id)
869 OVS_EXCLUDED(pmd->bond_mutex);
870
871 static void reconfigure_datapath(struct dp_netdev *dp)
872 OVS_REQUIRES(dp->port_mutex);
873 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
874 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
875 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
876 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
877 OVS_REQUIRES(pmd->port_mutex);
878 static inline void
879 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
880 struct polled_queue *poll_list, int poll_cnt);
881 static void
882 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
883 enum rxq_cycles_counter_type type,
884 unsigned long long cycles);
885 static uint64_t
886 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
887 enum rxq_cycles_counter_type type);
888 static void
889 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
890 unsigned long long cycles);
891 static uint64_t
892 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
893 static void
894 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
895 bool purge);
896 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
897 struct tx_port *tx);
898
899 static inline bool emc_entry_alive(struct emc_entry *ce);
900 static void emc_clear_entry(struct emc_entry *ce);
901 static void smc_clear_entry(struct smc_bucket *b, int idx);
902
903 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
904 static inline bool
905 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
906 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
907 struct dp_netdev_flow *flow);
908
909 static void
910 emc_cache_init(struct emc_cache *flow_cache)
911 {
912 int i;
913
914 flow_cache->sweep_idx = 0;
915 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
916 flow_cache->entries[i].flow = NULL;
917 flow_cache->entries[i].key.hash = 0;
918 flow_cache->entries[i].key.len = sizeof(struct miniflow);
919 flowmap_init(&flow_cache->entries[i].key.mf.map);
920 }
921 }
922
923 static void
924 smc_cache_init(struct smc_cache *smc_cache)
925 {
926 int i, j;
927 for (i = 0; i < SMC_BUCKET_CNT; i++) {
928 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
929 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
930 }
931 }
932 }
933
934 static void
935 dfc_cache_init(struct dfc_cache *flow_cache)
936 {
937 emc_cache_init(&flow_cache->emc_cache);
938 smc_cache_init(&flow_cache->smc_cache);
939 }
940
941 static void
942 emc_cache_uninit(struct emc_cache *flow_cache)
943 {
944 int i;
945
946 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
947 emc_clear_entry(&flow_cache->entries[i]);
948 }
949 }
950
951 static void
952 smc_cache_uninit(struct smc_cache *smc)
953 {
954 int i, j;
955
956 for (i = 0; i < SMC_BUCKET_CNT; i++) {
957 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
958 smc_clear_entry(&(smc->buckets[i]), j);
959 }
960 }
961 }
962
963 static void
964 dfc_cache_uninit(struct dfc_cache *flow_cache)
965 {
966 smc_cache_uninit(&flow_cache->smc_cache);
967 emc_cache_uninit(&flow_cache->emc_cache);
968 }
969
970 /* Check and clear dead flow references slowly (one entry at each
971 * invocation). */
972 static void
973 emc_cache_slow_sweep(struct emc_cache *flow_cache)
974 {
975 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
976
977 if (!emc_entry_alive(entry)) {
978 emc_clear_entry(entry);
979 }
980 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
981 }
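/* With EM_FLOW_HASH_ENTRIES equal to 8192, a full sweep of the EMC therefore
 * takes 8192 calls to emc_cache_slow_sweep(). */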
982
983 /* Updates the time in the PMD thread's context and should be called in three cases:
984 *
985 * 1. PMD structure initialization:
986 * - dp_netdev_configure_pmd()
987 *
988 * 2. Before processing of the new packet batch:
989 * - dpif_netdev_execute()
990 * - dp_netdev_process_rxq_port()
991 *
992 * 3. At least once per polling iteration in main polling threads if no
993 * packets received on current iteration:
994 * - dpif_netdev_run()
995 * - pmd_thread_main()
996 *
997 * 'pmd->ctx.now' should be used without update in all other cases if possible.
998 */
999 static inline void
1000 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
1001 {
1002 pmd->ctx.now = time_usec();
1003 }
1004
1005 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
1006 bool
1007 dpif_is_netdev(const struct dpif *dpif)
1008 {
1009 return dpif->dpif_class->open == dpif_netdev_open;
1010 }
1011
1012 static struct dpif_netdev *
1013 dpif_netdev_cast(const struct dpif *dpif)
1014 {
1015 ovs_assert(dpif_is_netdev(dpif));
1016 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
1017 }
1018
1019 static struct dp_netdev *
1020 get_dp_netdev(const struct dpif *dpif)
1021 {
1022 return dpif_netdev_cast(dpif)->dp;
1023 }
1024 \f
1025 enum pmd_info_type {
1026 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
1027 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
1028 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
1029 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
1030 };
1031
1032 static void
1033 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1034 {
1035 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
1036 ? "main thread" : "pmd thread");
1037 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1038 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1039 }
1040 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1041 ds_put_format(reply, " core_id %u", pmd->core_id);
1042 }
1043 ds_put_cstr(reply, ":\n");
1044 }
1045
1046 static void
1047 pmd_info_show_stats(struct ds *reply,
1048 struct dp_netdev_pmd_thread *pmd)
1049 {
1050 uint64_t stats[PMD_N_STATS];
1051 uint64_t total_cycles, total_packets;
1052 double passes_per_pkt = 0;
1053 double lookups_per_hit = 0;
1054 double packets_per_batch = 0;
1055
1056 pmd_perf_read_counters(&pmd->perf_stats, stats);
1057 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1058 + stats[PMD_CYCLES_ITER_BUSY];
1059 total_packets = stats[PMD_STAT_RECV];
1060
1061 format_pmd_thread(reply, pmd);
1062
1063 if (total_packets > 0) {
1064 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1065 / (double) total_packets;
1066 }
1067 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1068 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1069 / (double) stats[PMD_STAT_MASKED_HIT];
1070 }
1071 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1072 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1073 / (double) stats[PMD_STAT_SENT_BATCHES];
1074 }
1075
1076 ds_put_format(reply,
1077 " packets received: %"PRIu64"\n"
1078 " packet recirculations: %"PRIu64"\n"
1079 " avg. datapath passes per packet: %.02f\n"
1080 " emc hits: %"PRIu64"\n"
1081 " smc hits: %"PRIu64"\n"
1082 " megaflow hits: %"PRIu64"\n"
1083 " avg. subtable lookups per megaflow hit: %.02f\n"
1084 " miss with success upcall: %"PRIu64"\n"
1085 " miss with failed upcall: %"PRIu64"\n"
1086 " avg. packets per output batch: %.02f\n",
1087 total_packets, stats[PMD_STAT_RECIRC],
1088 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1089 stats[PMD_STAT_SMC_HIT],
1090 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1091 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1092 packets_per_batch);
1093
1094 if (total_cycles == 0) {
1095 return;
1096 }
1097
1098 ds_put_format(reply,
1099 " idle cycles: %"PRIu64" (%.02f%%)\n"
1100 " processing cycles: %"PRIu64" (%.02f%%)\n",
1101 stats[PMD_CYCLES_ITER_IDLE],
1102 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1103 stats[PMD_CYCLES_ITER_BUSY],
1104 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1105
1106 if (total_packets == 0) {
1107 return;
1108 }
1109
1110 ds_put_format(reply,
1111 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1112 total_cycles / (double) total_packets,
1113 total_cycles, total_packets);
1114
1115 ds_put_format(reply,
1116 " avg processing cycles per packet: "
1117 "%.02f (%"PRIu64"/%"PRIu64")\n",
1118 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1119 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1120 }
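/* Example of the derived values above: 1000 received packets with 250
 * recirculations yield 1.25 datapath passes per packet, and 500 packets sent
 * in 100 batches yield 5.00 packets per output batch. */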
1121
1122 static void
1123 pmd_info_show_perf(struct ds *reply,
1124 struct dp_netdev_pmd_thread *pmd,
1125 struct pmd_perf_params *par)
1126 {
1127 if (pmd->core_id != NON_PMD_CORE_ID) {
1128 char *time_str =
1129 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1130 long long now = time_msec();
1131 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1132
1133 ds_put_cstr(reply, "\n");
1134 ds_put_format(reply, "Time: %s\n", time_str);
1135 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1136 ds_put_cstr(reply, "\n");
1137 format_pmd_thread(reply, pmd);
1138 ds_put_cstr(reply, "\n");
1139 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1140 if (pmd_perf_metrics_enabled(pmd)) {
1141 /* Prevent parallel clearing of perf metrics. */
1142 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1143 if (par->histograms) {
1144 ds_put_cstr(reply, "\n");
1145 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1146 }
1147 if (par->iter_hist_len > 0) {
1148 ds_put_cstr(reply, "\n");
1149 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1150 par->iter_hist_len);
1151 }
1152 if (par->ms_hist_len > 0) {
1153 ds_put_cstr(reply, "\n");
1154 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1155 par->ms_hist_len);
1156 }
1157 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1158 }
1159 free(time_str);
1160 }
1161 }
1162
1163 static int
1164 compare_poll_list(const void *a_, const void *b_)
1165 {
1166 const struct rxq_poll *a = a_;
1167 const struct rxq_poll *b = b_;
1168
1169 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1170 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1171
1172 int cmp = strcmp(namea, nameb);
1173 if (!cmp) {
1174 return netdev_rxq_get_queue_id(a->rxq->rx)
1175 - netdev_rxq_get_queue_id(b->rxq->rx);
1176 } else {
1177 return cmp;
1178 }
1179 }
1180
1181 static void
1182 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1183 size_t *n)
1184 OVS_REQUIRES(pmd->port_mutex)
1185 {
1186 struct rxq_poll *ret, *poll;
1187 size_t i;
1188
1189 *n = hmap_count(&pmd->poll_list);
1190 if (!*n) {
1191 ret = NULL;
1192 } else {
1193 ret = xcalloc(*n, sizeof *ret);
1194 i = 0;
1195 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1196 ret[i] = *poll;
1197 i++;
1198 }
1199 ovs_assert(i == *n);
1200 qsort(ret, *n, sizeof *ret, compare_poll_list);
1201 }
1202
1203 *list = ret;
1204 }
1205
1206 static void
1207 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1208 {
1209 if (pmd->core_id != NON_PMD_CORE_ID) {
1210 struct rxq_poll *list;
1211 size_t n_rxq;
1212 uint64_t total_cycles = 0;
1213
1214 ds_put_format(reply,
1215 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1216 pmd->numa_id, pmd->core_id, (pmd->isolated)
1217 ? "true" : "false");
1218
1219 ovs_mutex_lock(&pmd->port_mutex);
1220 sorted_poll_list(pmd, &list, &n_rxq);
1221
1222 /* Get the total pmd cycles for an interval. */
1223 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1224 /* Estimate the cycles to cover all intervals. */
1225 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1226
1227 for (int i = 0; i < n_rxq; i++) {
1228 struct dp_netdev_rxq *rxq = list[i].rxq;
1229 const char *name = netdev_rxq_get_name(rxq->rx);
1230 uint64_t proc_cycles = 0;
1231
1232 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1233 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1234 }
1235 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1236 netdev_rxq_get_queue_id(list[i].rxq->rx));
1237 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1238 ? "(enabled) " : "(disabled)");
1239 ds_put_format(reply, " pmd usage: ");
1240 if (total_cycles) {
1241 ds_put_format(reply, "%2"PRIu64"",
1242 proc_cycles * 100 / total_cycles);
1243 ds_put_cstr(reply, " %");
1244 } else {
1245 ds_put_format(reply, "%s", "NOT AVAIL");
1246 }
1247 ds_put_cstr(reply, "\n");
1248 }
1249 ovs_mutex_unlock(&pmd->port_mutex);
1250 free(list);
1251 }
1252 }
1253
1254 static int
1255 compare_poll_thread_list(const void *a_, const void *b_)
1256 {
1257 const struct dp_netdev_pmd_thread *a, *b;
1258
1259 a = *(struct dp_netdev_pmd_thread **)a_;
1260 b = *(struct dp_netdev_pmd_thread **)b_;
1261
1262 if (a->core_id < b->core_id) {
1263 return -1;
1264 }
1265 if (a->core_id > b->core_id) {
1266 return 1;
1267 }
1268 return 0;
1269 }
1270
1271 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1272 * this list, as long as we do not go to quiescent state. */
1273 static void
1274 sorted_poll_thread_list(struct dp_netdev *dp,
1275 struct dp_netdev_pmd_thread ***list,
1276 size_t *n)
1277 {
1278 struct dp_netdev_pmd_thread *pmd;
1279 struct dp_netdev_pmd_thread **pmd_list;
1280 size_t k = 0, n_pmds;
1281
1282 n_pmds = cmap_count(&dp->poll_threads);
1283 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1284
1285 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1286 if (k >= n_pmds) {
1287 break;
1288 }
1289 pmd_list[k++] = pmd;
1290 }
1291
1292 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1293
1294 *list = pmd_list;
1295 *n = k;
1296 }
1297
1298 static void
1299 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1300 const char *argv[], void *aux OVS_UNUSED)
1301 {
1302 struct ds reply = DS_EMPTY_INITIALIZER;
1303 struct dp_netdev *dp = NULL;
1304
1305 ovs_mutex_lock(&dp_netdev_mutex);
1306
1307 if (argc == 2) {
1308 dp = shash_find_data(&dp_netdevs, argv[1]);
1309 } else if (shash_count(&dp_netdevs) == 1) {
1310 /* There's only one datapath */
1311 dp = shash_first(&dp_netdevs)->data;
1312 }
1313
1314 if (!dp) {
1315 ovs_mutex_unlock(&dp_netdev_mutex);
1316 unixctl_command_reply_error(conn,
1317 "please specify an existing datapath");
1318 return;
1319 }
1320
1321 dp_netdev_request_reconfigure(dp);
1322 ovs_mutex_unlock(&dp_netdev_mutex);
1323 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1324 unixctl_command_reply(conn, ds_cstr(&reply));
1325 ds_destroy(&reply);
1326 }
1327
1328 static void
1329 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1330 void *aux)
1331 {
1332 struct ds reply = DS_EMPTY_INITIALIZER;
1333 struct dp_netdev_pmd_thread **pmd_list;
1334 struct dp_netdev *dp = NULL;
1335 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1336 unsigned int core_id;
1337 bool filter_on_pmd = false;
1338 size_t n;
1339
1340 ovs_mutex_lock(&dp_netdev_mutex);
1341
1342 while (argc > 1) {
1343 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1344 if (str_to_uint(argv[2], 10, &core_id)) {
1345 filter_on_pmd = true;
1346 }
1347 argc -= 2;
1348 argv += 2;
1349 } else {
1350 dp = shash_find_data(&dp_netdevs, argv[1]);
1351 argc -= 1;
1352 argv += 1;
1353 }
1354 }
1355
1356 if (!dp) {
1357 if (shash_count(&dp_netdevs) == 1) {
1358 /* There's only one datapath */
1359 dp = shash_first(&dp_netdevs)->data;
1360 } else {
1361 ovs_mutex_unlock(&dp_netdev_mutex);
1362 unixctl_command_reply_error(conn,
1363 "please specify an existing datapath");
1364 return;
1365 }
1366 }
1367
1368 sorted_poll_thread_list(dp, &pmd_list, &n);
1369 for (size_t i = 0; i < n; i++) {
1370 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1371 if (!pmd) {
1372 break;
1373 }
1374 if (filter_on_pmd && pmd->core_id != core_id) {
1375 continue;
1376 }
1377 if (type == PMD_INFO_SHOW_RXQ) {
1378 pmd_info_show_rxq(&reply, pmd);
1379 } else if (type == PMD_INFO_CLEAR_STATS) {
1380 pmd_perf_stats_clear(&pmd->perf_stats);
1381 } else if (type == PMD_INFO_SHOW_STATS) {
1382 pmd_info_show_stats(&reply, pmd);
1383 } else if (type == PMD_INFO_PERF_SHOW) {
1384 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1385 }
1386 }
1387 free(pmd_list);
1388
1389 ovs_mutex_unlock(&dp_netdev_mutex);
1390
1391 unixctl_command_reply(conn, ds_cstr(&reply));
1392 ds_destroy(&reply);
1393 }
1394
1395 static void
1396 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1397 const char *argv[],
1398 void *aux OVS_UNUSED)
1399 {
1400 struct pmd_perf_params par;
1401 long int it_hist = 0, ms_hist = 0;
1402 par.histograms = true;
1403
1404 while (argc > 1) {
1405 if (!strcmp(argv[1], "-nh")) {
1406 par.histograms = false;
1407 argc -= 1;
1408 argv += 1;
1409 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1410 it_hist = strtol(argv[2], NULL, 10);
1411 if (it_hist < 0) {
1412 it_hist = 0;
1413 } else if (it_hist > HISTORY_LEN) {
1414 it_hist = HISTORY_LEN;
1415 }
1416 argc -= 2;
1417 argv += 2;
1418 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1419 ms_hist = strtol(argv[2], NULL, 10);
1420 if (ms_hist < 0) {
1421 ms_hist = 0;
1422 } else if (ms_hist > HISTORY_LEN) {
1423 ms_hist = HISTORY_LEN;
1424 }
1425 argc -= 2;
1426 argv += 2;
1427 } else {
1428 break;
1429 }
1430 }
1431 par.iter_hist_len = it_hist;
1432 par.ms_hist_len = ms_hist;
1433 par.command_type = PMD_INFO_PERF_SHOW;
1434 dpif_netdev_pmd_info(conn, argc, argv, &par);
1435 }
1436
1437 static void
1438 dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1439 const char *argv[], void *aux OVS_UNUSED)
1440 {
1441 struct ds reply = DS_EMPTY_INITIALIZER;
1442 struct dp_netdev *dp = NULL;
1443
1444 ovs_mutex_lock(&dp_netdev_mutex);
1445 if (argc == 2) {
1446 dp = shash_find_data(&dp_netdevs, argv[1]);
1447 } else if (shash_count(&dp_netdevs) == 1) {
1448 /* There's only one datapath. */
1449 dp = shash_first(&dp_netdevs)->data;
1450 }
1451 if (!dp) {
1452 ovs_mutex_unlock(&dp_netdev_mutex);
1453 unixctl_command_reply_error(conn,
1454 "please specify an existing datapath");
1455 return;
1456 }
1457
1458 if (cmap_count(&dp->tx_bonds) > 0) {
1459 struct tx_bond *dp_bond_entry;
1460 uint32_t slave_id;
1461
1462 ds_put_cstr(&reply, "Bonds:\n");
1463 CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1464 ds_put_format(&reply, " bond-id %"PRIu32":\n",
1465 dp_bond_entry->bond_id);
1466 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1467 slave_id =
1468 odp_to_u32(dp_bond_entry->slave_buckets[bucket].slave_id);
1469 ds_put_format(&reply, " bucket %d - slave %"PRIu32"\n",
1470 bucket, slave_id);
1471 }
1472 }
1473 }
1474 ovs_mutex_unlock(&dp_netdev_mutex);
1475 unixctl_command_reply(conn, ds_cstr(&reply));
1476 ds_destroy(&reply);
1477 }
1478
1479 \f
1480 static int
1481 dpif_netdev_init(void)
1482 {
1483 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1484 clear_aux = PMD_INFO_CLEAR_STATS,
1485 poll_aux = PMD_INFO_SHOW_RXQ;
1486
1487 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1488 0, 3, dpif_netdev_pmd_info,
1489 (void *)&show_aux);
1490 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1491 0, 3, dpif_netdev_pmd_info,
1492 (void *)&clear_aux);
1493 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1494 0, 3, dpif_netdev_pmd_info,
1495 (void *)&poll_aux);
1496 unixctl_command_register("dpif-netdev/pmd-perf-show",
1497 "[-nh] [-it iter-history-len]"
1498 " [-ms ms-history-len]"
1499 " [-pmd core] [dp]",
1500 0, 8, pmd_perf_show_cmd,
1501 NULL);
1502 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1503 0, 1, dpif_netdev_pmd_rebalance,
1504 NULL);
1505 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1506 "on|off [-b before] [-a after] [-e|-ne] "
1507 "[-us usec] [-q qlen]",
1508 0, 10, pmd_perf_log_set_cmd,
1509 NULL);
1510 unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1511 0, 1, dpif_netdev_bond_show,
1512 NULL);
1513 return 0;
1514 }
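/* The commands registered above are invoked through ovs-appctl, for example
 * (the core id and history length below are arbitrary illustrative values):
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 4
 *     ovs-appctl dpif-netdev/pmd-perf-show -nh -it 10
 *     ovs-appctl dpif-netdev/bond-show
 */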
1515
1516 static int
1517 dpif_netdev_enumerate(struct sset *all_dps,
1518 const struct dpif_class *dpif_class)
1519 {
1520 struct shash_node *node;
1521
1522 ovs_mutex_lock(&dp_netdev_mutex);
1523 SHASH_FOR_EACH(node, &dp_netdevs) {
1524 struct dp_netdev *dp = node->data;
1525 if (dpif_class != dp->class) {
1526 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1527 * If the class doesn't match, skip this dpif. */
1528 continue;
1529 }
1530 sset_add(all_dps, node->name);
1531 }
1532 ovs_mutex_unlock(&dp_netdev_mutex);
1533
1534 return 0;
1535 }
1536
1537 static bool
1538 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1539 {
1540 return class != &dpif_netdev_class;
1541 }
1542
1543 static const char *
1544 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1545 {
1546 return strcmp(type, "internal") ? type
1547 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1548 : "tap";
1549 }
1550
1551 static struct dpif *
1552 create_dpif_netdev(struct dp_netdev *dp)
1553 {
1554 uint16_t netflow_id = hash_string(dp->name, 0);
1555 struct dpif_netdev *dpif;
1556
1557 ovs_refcount_ref(&dp->ref_cnt);
1558
1559 dpif = xmalloc(sizeof *dpif);
1560 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1561 dpif->dp = dp;
1562 dpif->last_port_seq = seq_read(dp->port_seq);
1563
1564 return &dpif->dpif;
1565 }
1566
1567 /* Choose an unused, non-zero port number and return it on success.
1568 * Return ODPP_NONE on failure. */
1569 static odp_port_t
1570 choose_port(struct dp_netdev *dp, const char *name)
1571 OVS_REQUIRES(dp->port_mutex)
1572 {
1573 uint32_t port_no;
1574
1575 if (dp->class != &dpif_netdev_class) {
1576 const char *p;
1577 int start_no = 0;
1578
1579 /* If the port name begins with "br", start the number search at
1580 * 100 to make writing tests easier. */
1581 if (!strncmp(name, "br", 2)) {
1582 start_no = 100;
1583 }
1584
1585 /* If the port name contains a number, try to assign that port number.
1586 * This can make writing unit tests easier because port numbers are
1587 * predictable. */
1588 for (p = name; *p != '\0'; p++) {
1589 if (isdigit((unsigned char) *p)) {
1590 port_no = start_no + strtol(p, NULL, 10);
1591 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1592 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1593 return u32_to_odp(port_no);
1594 }
1595 break;
1596 }
1597 }
1598 }
1599
1600 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1601 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1602 return u32_to_odp(port_no);
1603 }
1604 }
1605
1606 return ODPP_NONE;
1607 }
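/* Examples of the heuristic above for a dummy datapath (dp->class !=
 * &dpif_netdev_class): a port named "eth3" first tries port number 3, and a
 * port named "br0" first tries 100 + 0 = 100.  If the candidate is already in
 * use, or the name contains no digits, the final loop falls back to the
 * lowest free port number.  For the regular netdev class only that fallback
 * loop runs. */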
1608
1609 static int
1610 create_dp_netdev(const char *name, const struct dpif_class *class,
1611 struct dp_netdev **dpp)
1612 OVS_REQUIRES(dp_netdev_mutex)
1613 {
1614 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1615 struct dp_netdev *dp;
1616 int error;
1617
1618 /* Avoid estimating TSC frequency for dummy datapath to not slow down
1619 * unit tests. */
1620 if (!dpif_netdev_class_is_dummy(class)
1621 && ovsthread_once_start(&tsc_freq_check)) {
1622 pmd_perf_estimate_tsc_frequency();
1623 ovsthread_once_done(&tsc_freq_check);
1624 }
1625
1626 dp = xzalloc(sizeof *dp);
1627 shash_add(&dp_netdevs, name, dp);
1628
1629 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1630 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1631 ovs_refcount_init(&dp->ref_cnt);
1632 atomic_flag_clear(&dp->destroyed);
1633
1634 ovs_mutex_init_recursive(&dp->port_mutex);
1635 hmap_init(&dp->ports);
1636 dp->port_seq = seq_create();
1637 ovs_mutex_init(&dp->bond_mutex);
1638 cmap_init(&dp->tx_bonds);
1639
1640 fat_rwlock_init(&dp->upcall_rwlock);
1641
1642 dp->reconfigure_seq = seq_create();
1643 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1644
1645 for (int i = 0; i < N_METER_LOCKS; ++i) {
1646 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1647 }
1648
1649 /* Disable upcalls by default. */
1650 dp_netdev_disable_upcall(dp);
1651 dp->upcall_aux = NULL;
1652 dp->upcall_cb = NULL;
1653
1654 dp->conntrack = conntrack_init();
1655
1656 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1657 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1658
1659 cmap_init(&dp->poll_threads);
1660 dp->pmd_rxq_assign_cyc = true;
1661
1662 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1663 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1664 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1665
1666 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1667 ovsthread_key_create(&dp->per_pmd_key, NULL);
1668
1669 ovs_mutex_lock(&dp->port_mutex);
1670 /* non-PMD will be created before all other threads and will
1671 * allocate static_tx_qid = 0. */
1672 dp_netdev_set_nonpmd(dp);
1673
1674 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1675 "internal"),
1676 ODPP_LOCAL);
1677 ovs_mutex_unlock(&dp->port_mutex);
1678 if (error) {
1679 dp_netdev_free(dp);
1680 return error;
1681 }
1682
1683 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1684 *dpp = dp;
1685 return 0;
1686 }
1687
1688 static void
1689 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1690 {
1691 seq_change(dp->reconfigure_seq);
1692 }
1693
1694 static bool
1695 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1696 {
1697 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1698 }
1699
1700 static int
1701 dpif_netdev_open(const struct dpif_class *class, const char *name,
1702 bool create, struct dpif **dpifp)
1703 {
1704 struct dp_netdev *dp;
1705 int error;
1706
1707 ovs_mutex_lock(&dp_netdev_mutex);
1708 dp = shash_find_data(&dp_netdevs, name);
1709 if (!dp) {
1710 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1711 } else {
1712 error = (dp->class != class ? EINVAL
1713 : create ? EEXIST
1714 : 0);
1715 }
1716 if (!error) {
1717 *dpifp = create_dpif_netdev(dp);
1718 }
1719 ovs_mutex_unlock(&dp_netdev_mutex);
1720
1721 return error;
1722 }
1723
1724 static void
1725 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1726 OVS_NO_THREAD_SAFETY_ANALYSIS
1727 {
1728     /* Check that upcalls are disabled, i.e., that the rwlock is taken. */
1729 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1730
1731     /* Release the lock before destroying it. */
1732 fat_rwlock_unlock(&dp->upcall_rwlock);
1733 fat_rwlock_destroy(&dp->upcall_rwlock);
1734 }
1735
1736 static void
1737 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1738 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1739 {
1740 if (dp->meters[meter_id]) {
1741 free(dp->meters[meter_id]);
1742 dp->meters[meter_id] = NULL;
1743 }
1744 }
1745
1746 static uint32_t
1747 hash_bond_id(uint32_t bond_id)
1748 {
1749 return hash_int(bond_id, 0);
1750 }
1751
1752 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1753 * through the 'dp_netdevs' shash while freeing 'dp'. */
1754 static void
1755 dp_netdev_free(struct dp_netdev *dp)
1756 OVS_REQUIRES(dp_netdev_mutex)
1757 {
1758 struct dp_netdev_port *port, *next;
1759 struct tx_bond *bond;
1760
1761 shash_find_and_delete(&dp_netdevs, dp->name);
1762
1763 ovs_mutex_lock(&dp->port_mutex);
1764 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1765 do_del_port(dp, port);
1766 }
1767 ovs_mutex_unlock(&dp->port_mutex);
1768
1769 ovs_mutex_lock(&dp->bond_mutex);
1770 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1771 cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1772 ovsrcu_postpone(free, bond);
1773 }
1774 ovs_mutex_unlock(&dp->bond_mutex);
1775
1776 dp_netdev_destroy_all_pmds(dp, true);
1777 cmap_destroy(&dp->poll_threads);
1778
1779 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1780 id_pool_destroy(dp->tx_qid_pool);
1781
1782 ovs_mutex_destroy(&dp->non_pmd_mutex);
1783 ovsthread_key_delete(dp->per_pmd_key);
1784
1785 conntrack_destroy(dp->conntrack);
1786
1787
1788 seq_destroy(dp->reconfigure_seq);
1789
1790 seq_destroy(dp->port_seq);
1791 hmap_destroy(&dp->ports);
1792 ovs_mutex_destroy(&dp->port_mutex);
1793
1794 cmap_destroy(&dp->tx_bonds);
1795 ovs_mutex_destroy(&dp->bond_mutex);
1796
1797 /* Upcalls must be disabled at this point */
1798 dp_netdev_destroy_upcall_lock(dp);
1799
1800 int i;
1801
1802 for (i = 0; i < MAX_METERS; ++i) {
1803 meter_lock(dp, i);
1804 dp_delete_meter(dp, i);
1805 meter_unlock(dp, i);
1806 }
1807 for (i = 0; i < N_METER_LOCKS; ++i) {
1808 ovs_mutex_destroy(&dp->meter_locks[i]);
1809 }
1810
1811 free(dp->pmd_cmask);
1812 free(CONST_CAST(char *, dp->name));
1813 free(dp);
1814 }
1815
1816 static void
1817 dp_netdev_unref(struct dp_netdev *dp)
1818 {
1819 if (dp) {
1820 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1821 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1822 ovs_mutex_lock(&dp_netdev_mutex);
1823 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1824 dp_netdev_free(dp);
1825 }
1826 ovs_mutex_unlock(&dp_netdev_mutex);
1827 }
1828 }
1829
1830 static void
1831 dpif_netdev_close(struct dpif *dpif)
1832 {
1833 struct dp_netdev *dp = get_dp_netdev(dpif);
1834
1835 dp_netdev_unref(dp);
1836 free(dpif);
1837 }
1838
1839 static int
1840 dpif_netdev_destroy(struct dpif *dpif)
1841 {
1842 struct dp_netdev *dp = get_dp_netdev(dpif);
1843
1844 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1845 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1846 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1847 OVS_NOT_REACHED();
1848 }
1849 }
1850
1851 return 0;
1852 }
1853
1854 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1855 * load/store semantics. While the increment is not atomic, the load and
1856 * store operations are, making it impossible to read inconsistent values.
1857 *
1858 * This is used to update thread local stats counters. */
1859 static void
1860 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1861 {
1862 unsigned long long tmp;
1863
1864 atomic_read_relaxed(var, &tmp);
1865 tmp += n;
1866 atomic_store_relaxed(var, tmp);
1867 }
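
/* Usage sketch (assumption: a single owning thread performs the updates while
 * other threads only read the counter):
 *
 *     atomic_ullong n_packets;
 *
 *     atomic_init(&n_packets, 0);
 *
 *     // Owning (PMD) thread, once per batch:
 *     non_atomic_ullong_add(&n_packets, batch_size);
 *
 *     // Any other thread:
 *     unsigned long long snapshot;
 *     atomic_read_relaxed(&n_packets, &snapshot);
 *
 * Readers may observe a slightly stale value but never a torn one, because
 * the individual relaxed load and store are themselves atomic. */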
1868
1869 static int
1870 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1871 {
1872 struct dp_netdev *dp = get_dp_netdev(dpif);
1873 struct dp_netdev_pmd_thread *pmd;
1874 uint64_t pmd_stats[PMD_N_STATS];
1875
1876 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1877 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1878 stats->n_flows += cmap_count(&pmd->flow_table);
1879 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1880 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1881 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1882 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1883 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1884 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1885 }
1886 stats->n_masks = UINT32_MAX;
1887 stats->n_mask_hit = UINT64_MAX;
1888
1889 return 0;
1890 }
1891
1892 static void
1893 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1894 {
1895 if (pmd->core_id == NON_PMD_CORE_ID) {
1896 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1897 ovs_mutex_lock(&pmd->port_mutex);
1898 pmd_load_cached_ports(pmd);
1899 ovs_mutex_unlock(&pmd->port_mutex);
1900 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1901 return;
1902 }
1903
1904 seq_change(pmd->reload_seq);
1905 atomic_store_explicit(&pmd->reload, true, memory_order_release);
1906 }
1907
1908 static uint32_t
1909 hash_port_no(odp_port_t port_no)
1910 {
1911 return hash_int(odp_to_u32(port_no), 0);
1912 }
1913
1914 static int
1915 port_create(const char *devname, const char *type,
1916 odp_port_t port_no, struct dp_netdev_port **portp)
1917 {
1918 struct dp_netdev_port *port;
1919 enum netdev_flags flags;
1920 struct netdev *netdev;
1921 int error;
1922
1923 *portp = NULL;
1924
1925 /* Open and validate network device. */
1926 error = netdev_open(devname, type, &netdev);
1927 if (error) {
1928 return error;
1929 }
1930 /* XXX reject non-Ethernet devices */
1931
1932 netdev_get_flags(netdev, &flags);
1933 if (flags & NETDEV_LOOPBACK) {
1934 VLOG_ERR("%s: cannot add a loopback device", devname);
1935 error = EINVAL;
1936 goto out;
1937 }
1938
1939 port = xzalloc(sizeof *port);
1940 port->port_no = port_no;
1941 port->netdev = netdev;
1942 port->type = xstrdup(type);
1943 port->sf = NULL;
1944 port->emc_enabled = true;
1945 port->need_reconfigure = true;
1946 ovs_mutex_init(&port->txq_used_mutex);
1947
1948 *portp = port;
1949
1950 return 0;
1951
1952 out:
1953 netdev_close(netdev);
1954 return error;
1955 }
1956
1957 static int
1958 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1959 odp_port_t port_no)
1960 OVS_REQUIRES(dp->port_mutex)
1961 {
1962 struct netdev_saved_flags *sf;
1963 struct dp_netdev_port *port;
1964 int error;
1965
1966 /* Reject devices already in 'dp'. */
1967 if (!get_port_by_name(dp, devname, &port)) {
1968 return EEXIST;
1969 }
1970
1971 error = port_create(devname, type, port_no, &port);
1972 if (error) {
1973 return error;
1974 }
1975
1976 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1977 seq_change(dp->port_seq);
1978
1979 reconfigure_datapath(dp);
1980
1981 /* Check that port was successfully configured. */
1982 if (!dp_netdev_lookup_port(dp, port_no)) {
1983 return EINVAL;
1984 }
1985
1986 /* Updating device flags triggers an if_notifier, which triggers a bridge
1987 * reconfiguration and another attempt to add this port, leading to an
1988 * infinite loop if the device is configured incorrectly and cannot be
1989      * added.  Set the promisc mode only after a successful reconfiguration,
1990      * since by then we already know that the device is properly configured. */
1991 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1992 if (error) {
1993 VLOG_ERR("%s: cannot set promisc flag", devname);
1994 do_del_port(dp, port);
1995 return error;
1996 }
1997 port->sf = sf;
1998
1999 return 0;
2000 }
2001
2002 static int
2003 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2004 odp_port_t *port_nop)
2005 {
2006 struct dp_netdev *dp = get_dp_netdev(dpif);
2007 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2008 const char *dpif_port;
2009 odp_port_t port_no;
2010 int error;
2011
2012 ovs_mutex_lock(&dp->port_mutex);
2013 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2014 if (*port_nop != ODPP_NONE) {
2015 port_no = *port_nop;
2016 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2017 } else {
2018 port_no = choose_port(dp, dpif_port);
2019 error = port_no == ODPP_NONE ? EFBIG : 0;
2020 }
2021 if (!error) {
2022 *port_nop = port_no;
2023 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2024 }
2025 ovs_mutex_unlock(&dp->port_mutex);
2026
2027 return error;
2028 }
2029
2030 static int
2031 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2032 {
2033 struct dp_netdev *dp = get_dp_netdev(dpif);
2034 int error;
2035
2036 ovs_mutex_lock(&dp->port_mutex);
2037 if (port_no == ODPP_LOCAL) {
2038 error = EINVAL;
2039 } else {
2040 struct dp_netdev_port *port;
2041
2042 error = get_port_by_number(dp, port_no, &port);
2043 if (!error) {
2044 do_del_port(dp, port);
2045 }
2046 }
2047 ovs_mutex_unlock(&dp->port_mutex);
2048
2049 return error;
2050 }
2051
2052 static bool
2053 is_valid_port_number(odp_port_t port_no)
2054 {
2055 return port_no != ODPP_NONE;
2056 }
2057
2058 static struct dp_netdev_port *
2059 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2060 OVS_REQUIRES(dp->port_mutex)
2061 {
2062 struct dp_netdev_port *port;
2063
2064 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2065 if (port->port_no == port_no) {
2066 return port;
2067 }
2068 }
2069 return NULL;
2070 }
2071
2072 static int
2073 get_port_by_number(struct dp_netdev *dp,
2074 odp_port_t port_no, struct dp_netdev_port **portp)
2075 OVS_REQUIRES(dp->port_mutex)
2076 {
2077 if (!is_valid_port_number(port_no)) {
2078 *portp = NULL;
2079 return EINVAL;
2080 } else {
2081 *portp = dp_netdev_lookup_port(dp, port_no);
2082 return *portp ? 0 : ENODEV;
2083 }
2084 }
2085
2086 static void
2087 port_destroy(struct dp_netdev_port *port)
2088 {
2089 if (!port) {
2090 return;
2091 }
2092
2093 netdev_close(port->netdev);
2094 netdev_restore_flags(port->sf);
2095
2096 for (unsigned i = 0; i < port->n_rxq; i++) {
2097 netdev_rxq_close(port->rxqs[i].rx);
2098 }
2099 ovs_mutex_destroy(&port->txq_used_mutex);
2100 free(port->rxq_affinity_list);
2101 free(port->txq_used);
2102 free(port->rxqs);
2103 free(port->type);
2104 free(port);
2105 }
2106
2107 static int
2108 get_port_by_name(struct dp_netdev *dp,
2109 const char *devname, struct dp_netdev_port **portp)
2110 OVS_REQUIRES(dp->port_mutex)
2111 {
2112 struct dp_netdev_port *port;
2113
2114 HMAP_FOR_EACH (port, node, &dp->ports) {
2115 if (!strcmp(netdev_get_name(port->netdev), devname)) {
2116 *portp = port;
2117 return 0;
2118 }
2119 }
2120
2121     /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2122      * non-existent port. */
2123 return ENODEV;
2124 }
2125
2126 /* Returns 'true' if there is a port with pmd netdev. */
2127 static bool
2128 has_pmd_port(struct dp_netdev *dp)
2129 OVS_REQUIRES(dp->port_mutex)
2130 {
2131 struct dp_netdev_port *port;
2132
2133 HMAP_FOR_EACH (port, node, &dp->ports) {
2134 if (netdev_is_pmd(port->netdev)) {
2135 return true;
2136 }
2137 }
2138
2139 return false;
2140 }
2141
2142 static void
2143 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2144 OVS_REQUIRES(dp->port_mutex)
2145 {
2146 hmap_remove(&dp->ports, &port->node);
2147 seq_change(dp->port_seq);
2148
2149 reconfigure_datapath(dp);
2150
2151 port_destroy(port);
2152 }
2153
2154 static void
2155 answer_port_query(const struct dp_netdev_port *port,
2156 struct dpif_port *dpif_port)
2157 {
2158 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2159 dpif_port->type = xstrdup(port->type);
2160 dpif_port->port_no = port->port_no;
2161 }
2162
2163 static int
2164 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2165 struct dpif_port *dpif_port)
2166 {
2167 struct dp_netdev *dp = get_dp_netdev(dpif);
2168 struct dp_netdev_port *port;
2169 int error;
2170
2171 ovs_mutex_lock(&dp->port_mutex);
2172 error = get_port_by_number(dp, port_no, &port);
2173 if (!error && dpif_port) {
2174 answer_port_query(port, dpif_port);
2175 }
2176 ovs_mutex_unlock(&dp->port_mutex);
2177
2178 return error;
2179 }
2180
2181 static int
2182 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2183 struct dpif_port *dpif_port)
2184 {
2185 struct dp_netdev *dp = get_dp_netdev(dpif);
2186 struct dp_netdev_port *port;
2187 int error;
2188
2189 ovs_mutex_lock(&dp->port_mutex);
2190 error = get_port_by_name(dp, devname, &port);
2191 if (!error && dpif_port) {
2192 answer_port_query(port, dpif_port);
2193 }
2194 ovs_mutex_unlock(&dp->port_mutex);
2195
2196 return error;
2197 }
2198
2199 static void
2200 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2201 {
2202 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2203 free(flow->dp_extra_info);
2204 free(flow);
2205 }
2206
2207 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2208 {
2209 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2210 ovsrcu_postpone(dp_netdev_flow_free, flow);
2211 }
2212 }
2213
2214 static uint32_t
2215 dp_netdev_flow_hash(const ovs_u128 *ufid)
2216 {
2217 return ufid->u32[0];
2218 }
2219
2220 static inline struct dpcls *
2221 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2222 odp_port_t in_port)
2223 {
2224 struct dpcls *cls;
2225 uint32_t hash = hash_port_no(in_port);
2226 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2227 if (cls->in_port == in_port) {
2228 /* Port classifier exists already */
2229 return cls;
2230 }
2231 }
2232 return NULL;
2233 }
2234
2235 static inline struct dpcls *
2236 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2237 odp_port_t in_port)
2238 OVS_REQUIRES(pmd->flow_mutex)
2239 {
2240 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2241 uint32_t hash = hash_port_no(in_port);
2242
2243 if (!cls) {
2244 /* Create new classifier for in_port */
2245 cls = xmalloc(sizeof(*cls));
2246 dpcls_init(cls);
2247 cls->in_port = in_port;
2248 cmap_insert(&pmd->classifiers, &cls->node, hash);
2249 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2250 }
2251 return cls;
2252 }
2253
2254 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2255 #define INVALID_FLOW_MARK 0
2256 /* A zero flow mark tells the HW to remove the mark.  A packet marked with
2257  * a zero mark is received in SW without any mark at all, so zero cannot be
2258  * used as a valid mark.
2259  */
2260
2261 struct megaflow_to_mark_data {
2262 const struct cmap_node node;
2263 ovs_u128 mega_ufid;
2264 uint32_t mark;
2265 };
2266
2267 struct flow_mark {
2268 struct cmap megaflow_to_mark;
2269 struct cmap mark_to_flow;
2270 struct id_pool *pool;
2271 };
2272
2273 static struct flow_mark flow_mark = {
2274 .megaflow_to_mark = CMAP_INITIALIZER,
2275 .mark_to_flow = CMAP_INITIALIZER,
2276 };
2277
2278 static uint32_t
2279 flow_mark_alloc(void)
2280 {
2281 uint32_t mark;
2282
2283 if (!flow_mark.pool) {
2284         /* Pool hasn't been initialized yet; do it here. */
2285 flow_mark.pool = id_pool_create(1, MAX_FLOW_MARK);
2286 }
2287
2288 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2289 return mark;
2290 }
2291
2292 return INVALID_FLOW_MARK;
2293 }
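
/* Sketch of the resulting mark space (restating the allocator above): marks
 * are drawn from an id-pool covering [1, MAX_FLOW_MARK], so INVALID_FLOW_MARK
 * (0) can never be handed out and is safe to use as the "no mark" sentinel:
 *
 *     uint32_t mark = flow_mark_alloc();
 *
 *     if (mark == INVALID_FLOW_MARK) {
 *         // Pool exhausted or unavailable: skip hardware offload.
 *     }
 */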
2294
2295 static void
2296 flow_mark_free(uint32_t mark)
2297 {
2298 id_pool_free_id(flow_mark.pool, mark);
2299 }
2300
2301 /* Associate a megaflow with a mark; this is a 1:1 mapping. */
2302 static void
2303 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2304 {
2305 size_t hash = dp_netdev_flow_hash(mega_ufid);
2306 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2307
2308 data->mega_ufid = *mega_ufid;
2309 data->mark = mark;
2310
2311 cmap_insert(&flow_mark.megaflow_to_mark,
2312 CONST_CAST(struct cmap_node *, &data->node), hash);
2313 }
2314
2315 /* Disassociate a megaflow from its mark. */
2316 static void
2317 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2318 {
2319 size_t hash = dp_netdev_flow_hash(mega_ufid);
2320 struct megaflow_to_mark_data *data;
2321
2322 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2323 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2324 cmap_remove(&flow_mark.megaflow_to_mark,
2325 CONST_CAST(struct cmap_node *, &data->node), hash);
2326 ovsrcu_postpone(free, data);
2327 return;
2328 }
2329 }
2330
2331 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2332 UUID_ARGS((struct uuid *)mega_ufid));
2333 }
2334
2335 static inline uint32_t
2336 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2337 {
2338 size_t hash = dp_netdev_flow_hash(mega_ufid);
2339 struct megaflow_to_mark_data *data;
2340
2341 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2342 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2343 return data->mark;
2344 }
2345 }
2346
2347 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2348 UUID_ARGS((struct uuid *)mega_ufid));
2349 return INVALID_FLOW_MARK;
2350 }
2351
2352 /* Associate a mark with a flow; this is a 1:N mapping. */
2353 static void
2354 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2355 {
2356 dp_netdev_flow_ref(flow);
2357
2358 cmap_insert(&flow_mark.mark_to_flow,
2359 CONST_CAST(struct cmap_node *, &flow->mark_node),
2360 hash_int(mark, 0));
2361 flow->mark = mark;
2362
2363 VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
2364 flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
2365 }
2366
2367 static bool
2368 flow_mark_has_no_ref(uint32_t mark)
2369 {
2370 struct dp_netdev_flow *flow;
2371
2372 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2373 &flow_mark.mark_to_flow) {
2374 if (flow->mark == mark) {
2375 return false;
2376 }
2377 }
2378
2379 return true;
2380 }
2381
2382 static int
2383 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2384 struct dp_netdev_flow *flow)
2385 {
2386 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2387 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2388 &flow->mark_node);
2389 uint32_t mark = flow->mark;
2390 int ret = 0;
2391
2392 /* INVALID_FLOW_MARK may mean that the flow has been disassociated or
2393 * never associated. */
2394 if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
2395 return EINVAL;
2396 }
2397
2398 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2399 flow->mark = INVALID_FLOW_MARK;
2400
2401 /*
2402      * If no flow references the mark any more, remove the flow from
2403      * hardware and free the mark.
2404 */
2405 if (flow_mark_has_no_ref(mark)) {
2406 struct netdev *port;
2407 odp_port_t in_port = flow->flow.in_port.odp_port;
2408
2409 port = netdev_ports_get(in_port, dpif_type_str);
2410 if (port) {
2411 /* Taking a global 'port_mutex' to fulfill thread safety
2412 * restrictions for the netdev-offload-dpdk module. */
2413 ovs_mutex_lock(&pmd->dp->port_mutex);
2414 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2415 ovs_mutex_unlock(&pmd->dp->port_mutex);
2416 netdev_close(port);
2417 }
2418
2419 flow_mark_free(mark);
2420 VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
2421 UUID_ARGS((struct uuid *) &flow->mega_ufid));
2422
2423 megaflow_to_mark_disassociate(&flow->mega_ufid);
2424 }
2425 dp_netdev_flow_unref(flow);
2426
2427 return ret;
2428 }
2429
2430 static void
2431 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2432 {
2433 struct dp_netdev_flow *flow;
2434
2435 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2436 if (flow->pmd_id == pmd->core_id) {
2437 queue_netdev_flow_del(pmd, flow);
2438 }
2439 }
2440 }
2441
2442 static struct dp_netdev_flow *
2443 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2444 const uint32_t mark)
2445 {
2446 struct dp_netdev_flow *flow;
2447
2448 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2449 &flow_mark.mark_to_flow) {
2450 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2451 flow->dead == false) {
2452 return flow;
2453 }
2454 }
2455
2456 return NULL;
2457 }
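
/* Receive-side sketch (assumption: the NIC delivers the mark in the packet
 * metadata and dp_packet_has_flow_mark() exposes it, as the DPDK datapath
 * does):
 *
 *     uint32_t mark;
 *
 *     if (dp_packet_has_flow_mark(packet, &mark)) {
 *         struct dp_netdev_flow *flow = mark_to_flow_find(pmd, mark);
 *
 *         if (flow) {
 *             // The flow was recovered directly from the mark, so the
 *             // EMC/SMC/dpcls lookups can be skipped for this packet.
 *         }
 *     }
 */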
2458
2459 static struct dp_flow_offload_item *
2460 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2461 struct dp_netdev_flow *flow,
2462 int op)
2463 {
2464 struct dp_flow_offload_item *offload;
2465
2466 offload = xzalloc(sizeof(*offload));
2467 offload->pmd = pmd;
2468 offload->flow = flow;
2469 offload->op = op;
2470
2471 dp_netdev_flow_ref(flow);
2472 dp_netdev_pmd_try_ref(pmd);
2473
2474 return offload;
2475 }
2476
2477 static void
2478 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2479 {
2480 dp_netdev_pmd_unref(offload->pmd);
2481 dp_netdev_flow_unref(offload->flow);
2482
2483 free(offload->actions);
2484 free(offload);
2485 }
2486
2487 static void
2488 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2489 {
2490 ovs_mutex_lock(&dp_flow_offload.mutex);
2491 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2492 xpthread_cond_signal(&dp_flow_offload.cond);
2493 ovs_mutex_unlock(&dp_flow_offload.mutex);
2494 }
2495
2496 static int
2497 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2498 {
2499 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2500 }
2501
2502 /*
2503 * There are two flow offload operations here: addition and modification.
2504 *
2505 * For flow addition, this function does:
2506 * - allocate a new flow mark id
2507 * - perform hardware flow offload
2508 * - associate the flow mark with flow and mega flow
2509 *
2510  * For flow modification, both the flow mark and the associations are still
2511  * valid, so only the hardware offload step is needed.
2512 */
2513 static int
2514 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2515 {
2516 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2517 struct dp_netdev_flow *flow = offload->flow;
2518 odp_port_t in_port = flow->flow.in_port.odp_port;
2519 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2520 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2521 struct offload_info info;
2522 struct netdev *port;
2523 uint32_t mark;
2524 int ret;
2525
2526 if (flow->dead) {
2527 return -1;
2528 }
2529
2530 if (modification) {
2531 mark = flow->mark;
2532 ovs_assert(mark != INVALID_FLOW_MARK);
2533 } else {
2534 /*
2535 * If a mega flow has already been offloaded (from other PMD
2536 * instances), do not offload it again.
2537 */
2538 mark = megaflow_to_mark_find(&flow->mega_ufid);
2539 if (mark != INVALID_FLOW_MARK) {
2540 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2541 if (flow->mark != INVALID_FLOW_MARK) {
2542 ovs_assert(flow->mark == mark);
2543 } else {
2544 mark_to_flow_associate(mark, flow);
2545 }
2546 return 0;
2547 }
2548
2549 mark = flow_mark_alloc();
2550 if (mark == INVALID_FLOW_MARK) {
2551 VLOG_ERR("Failed to allocate flow mark!\n");
2552 return -1;
2553 }
2554 }
2555 info.flow_mark = mark;
2556
2557 port = netdev_ports_get(in_port, dpif_type_str);
2558 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2559 netdev_close(port);
2560 goto err_free;
2561 }
2562 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2563 * the netdev-offload-dpdk module. */
2564 ovs_mutex_lock(&pmd->dp->port_mutex);
2565 ret = netdev_flow_put(port, &offload->match,
2566 CONST_CAST(struct nlattr *, offload->actions),
2567 offload->actions_len, &flow->mega_ufid, &info,
2568 NULL);
2569 ovs_mutex_unlock(&pmd->dp->port_mutex);
2570 netdev_close(port);
2571
2572 if (ret) {
2573 goto err_free;
2574 }
2575
2576 if (!modification) {
2577 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2578 mark_to_flow_associate(mark, flow);
2579 }
2580 return 0;
2581
2582 err_free:
2583 if (!modification) {
2584 flow_mark_free(mark);
2585 } else {
2586 mark_to_flow_disassociate(pmd, flow);
2587 }
2588 return -1;
2589 }
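
/* Put-path summary (restating the function above): for an ADD the sequence is
 * flow_mark_alloc() -> netdev_flow_put() -> megaflow_to_mark_associate() +
 * mark_to_flow_associate(); for a MOD the existing mark is reused and only
 * netdev_flow_put() runs again.  The error handling mirrors that split:
 *
 *     if (netdev_flow_put(...)) {
 *         if (!modification) {
 *             flow_mark_free(mark);            // brand-new mark, just drop it
 *         } else {
 *             mark_to_flow_disassociate(pmd, flow);   // roll back the link
 *         }
 *     }
 */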
2590
2591 static void *
2592 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2593 {
2594 struct dp_flow_offload_item *offload;
2595 struct ovs_list *list;
2596 const char *op;
2597 int ret;
2598
2599 for (;;) {
2600 ovs_mutex_lock(&dp_flow_offload.mutex);
2601 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2602 ovsrcu_quiesce_start();
2603 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2604 &dp_flow_offload.mutex);
2605 ovsrcu_quiesce_end();
2606 }
2607 list = ovs_list_pop_front(&dp_flow_offload.list);
2608 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2609 ovs_mutex_unlock(&dp_flow_offload.mutex);
2610
2611 switch (offload->op) {
2612 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2613 op = "add";
2614 ret = dp_netdev_flow_offload_put(offload);
2615 break;
2616 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2617 op = "modify";
2618 ret = dp_netdev_flow_offload_put(offload);
2619 break;
2620 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2621 op = "delete";
2622 ret = dp_netdev_flow_offload_del(offload);
2623 break;
2624 default:
2625 OVS_NOT_REACHED();
2626 }
2627
2628 VLOG_DBG("%s to %s netdev flow "UUID_FMT,
2629                  ret == 0 ? "succeeded" : "failed", op,
2630 UUID_ARGS((struct uuid *) &offload->flow->mega_ufid));
2631 dp_netdev_free_flow_offload(offload);
2632 ovsrcu_quiesce();
2633 }
2634
2635 return NULL;
2636 }
2637
2638 static void
2639 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2640 struct dp_netdev_flow *flow)
2641 {
2642 struct dp_flow_offload_item *offload;
2643
2644 if (ovsthread_once_start(&offload_thread_once)) {
2645 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2646 ovs_thread_create("dp_netdev_flow_offload",
2647 dp_netdev_flow_offload_main, NULL);
2648 ovsthread_once_done(&offload_thread_once);
2649 }
2650
2651 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2652 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2653 dp_netdev_append_flow_offload(offload);
2654 }
2655
2656 static void
2657 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2658 struct dp_netdev_flow *flow, struct match *match,
2659 const struct nlattr *actions, size_t actions_len)
2660 {
2661 struct dp_flow_offload_item *offload;
2662 int op;
2663
2664 if (!netdev_is_flow_api_enabled()) {
2665 return;
2666 }
2667
2668 if (ovsthread_once_start(&offload_thread_once)) {
2669 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2670 ovs_thread_create("dp_netdev_flow_offload",
2671 dp_netdev_flow_offload_main, NULL);
2672 ovsthread_once_done(&offload_thread_once);
2673 }
2674
2675 if (flow->mark != INVALID_FLOW_MARK) {
2676 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2677 } else {
2678 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2679 }
2680 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2681 offload->match = *match;
2682 offload->actions = xmalloc(actions_len);
2683 memcpy(offload->actions, actions, actions_len);
2684 offload->actions_len = actions_len;
2685
2686 dp_netdev_append_flow_offload(offload);
2687 }
2688
2689 static void
2690 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2691 struct dp_netdev_flow *flow)
2692 OVS_REQUIRES(pmd->flow_mutex)
2693 {
2694 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2695 struct dpcls *cls;
2696 odp_port_t in_port = flow->flow.in_port.odp_port;
2697
2698 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2699 ovs_assert(cls != NULL);
2700 dpcls_remove(cls, &flow->cr);
2701 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2702 if (flow->mark != INVALID_FLOW_MARK) {
2703 queue_netdev_flow_del(pmd, flow);
2704 }
2705 flow->dead = true;
2706
2707 dp_netdev_flow_unref(flow);
2708 }
2709
2710 static void
2711 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2712 {
2713 struct dp_netdev_flow *netdev_flow;
2714
2715 ovs_mutex_lock(&pmd->flow_mutex);
2716 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2717 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2718 }
2719 ovs_mutex_unlock(&pmd->flow_mutex);
2720 }
2721
2722 static int
2723 dpif_netdev_flow_flush(struct dpif *dpif)
2724 {
2725 struct dp_netdev *dp = get_dp_netdev(dpif);
2726 struct dp_netdev_pmd_thread *pmd;
2727
2728 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2729 dp_netdev_pmd_flow_flush(pmd);
2730 }
2731
2732 return 0;
2733 }
2734
2735 struct dp_netdev_port_state {
2736 struct hmap_position position;
2737 char *name;
2738 };
2739
2740 static int
2741 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2742 {
2743 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2744 return 0;
2745 }
2746
2747 static int
2748 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2749 struct dpif_port *dpif_port)
2750 {
2751 struct dp_netdev_port_state *state = state_;
2752 struct dp_netdev *dp = get_dp_netdev(dpif);
2753 struct hmap_node *node;
2754 int retval;
2755
2756 ovs_mutex_lock(&dp->port_mutex);
2757 node = hmap_at_position(&dp->ports, &state->position);
2758 if (node) {
2759 struct dp_netdev_port *port;
2760
2761 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2762
2763 free(state->name);
2764 state->name = xstrdup(netdev_get_name(port->netdev));
2765 dpif_port->name = state->name;
2766 dpif_port->type = port->type;
2767 dpif_port->port_no = port->port_no;
2768
2769 retval = 0;
2770 } else {
2771 retval = EOF;
2772 }
2773 ovs_mutex_unlock(&dp->port_mutex);
2774
2775 return retval;
2776 }
2777
2778 static int
2779 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2780 {
2781 struct dp_netdev_port_state *state = state_;
2782 free(state->name);
2783 free(state);
2784 return 0;
2785 }
2786
2787 static int
2788 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2789 {
2790 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2791 uint64_t new_port_seq;
2792 int error;
2793
2794 new_port_seq = seq_read(dpif->dp->port_seq);
2795 if (dpif->last_port_seq != new_port_seq) {
2796 dpif->last_port_seq = new_port_seq;
2797 error = ENOBUFS;
2798 } else {
2799 error = EAGAIN;
2800 }
2801
2802 return error;
2803 }
2804
2805 static void
2806 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2807 {
2808 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2809
2810 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2811 }
2812
2813 static struct dp_netdev_flow *
2814 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2815 {
2816 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2817 }
2818
2819 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2820 {
2821 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2822 }
2823
2824 /* netdev_flow_key utilities.
2825 *
2826 * netdev_flow_key is basically a miniflow. We use these functions
2827 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2828 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2829 *
2830 * - Since we are dealing exclusively with miniflows created by
2831 * miniflow_extract(), if the map is different the miniflow is different.
2832 * Therefore we can be faster by comparing the map and the miniflow in a
2833 * single memcmp().
2834 * - These functions can be inlined by the compiler. */
2835
2836 /* Given the number of bits set in miniflow's maps, returns the size of the
2837 * 'netdev_flow_key.mf' */
2838 static inline size_t
2839 netdev_flow_key_size(size_t flow_u64s)
2840 {
2841 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2842 }
2843
2844 static inline bool
2845 netdev_flow_key_equal(const struct netdev_flow_key *a,
2846 const struct netdev_flow_key *b)
2847 {
2848     /* 'b->len' may not be set yet. */
2849 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2850 }
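
/* Worked example (assumption: MINIFLOW_VALUES_SIZE(n) is n * 8 bytes): a key
 * whose miniflow has 5 populated u64 units has
 *
 *     len = sizeof(struct miniflow) + 5 * 8;
 *
 * so the single memcmp() above covers both the flowmap and the packed values
 * in one pass, which is the point of keeping keys in miniflow form. */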
2851
2852 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2853 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2854 * generated by miniflow_extract. */
2855 static inline bool
2856 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2857 const struct miniflow *mf)
2858 {
2859 return !memcmp(&key->mf, mf, key->len);
2860 }
2861
2862 static inline void
2863 netdev_flow_key_clone(struct netdev_flow_key *dst,
2864 const struct netdev_flow_key *src)
2865 {
2866 memcpy(dst, src,
2867 offsetof(struct netdev_flow_key, mf) + src->len);
2868 }
2869
2870 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2871 static inline void
2872 netdev_flow_mask_init(struct netdev_flow_key *mask,
2873 const struct match *match)
2874 {
2875 uint64_t *dst = miniflow_values(&mask->mf);
2876 struct flowmap fmap;
2877 uint32_t hash = 0;
2878 size_t idx;
2879
2880 /* Only check masks that make sense for the flow. */
2881 flow_wc_map(&match->flow, &fmap);
2882 flowmap_init(&mask->mf.map);
2883
2884 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2885 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2886
2887 if (mask_u64) {
2888 flowmap_set(&mask->mf.map, idx, 1);
2889 *dst++ = mask_u64;
2890 hash = hash_add64(hash, mask_u64);
2891 }
2892 }
2893
2894 map_t map;
2895
2896 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2897 hash = hash_add64(hash, map);
2898 }
2899
2900 size_t n = dst - miniflow_get_values(&mask->mf);
2901
2902 mask->hash = hash_finish(hash, n * 8);
2903 mask->len = netdev_flow_key_size(n);
2904 }
2905
2906 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2907 static inline void
2908 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2909 const struct flow *flow,
2910 const struct netdev_flow_key *mask)
2911 {
2912 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2913 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2914 uint32_t hash = 0;
2915 uint64_t value;
2916
2917 dst->len = mask->len;
2918 dst->mf = mask->mf; /* Copy maps. */
2919
2920 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2921 *dst_u64 = value & *mask_u64++;
2922 hash = hash_add64(hash, *dst_u64++);
2923 }
2924 dst->hash = hash_finish(hash,
2925 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2926 }
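
/* Sketch of the relationship set up by the two initializers above: for every
 * u64 unit present in the subtable mask,
 *
 *     masked_key.value[i] = flow.value[i] & mask.value[i];
 *
 * and 'masked_key.hash' is computed over those masked values only.  A packet
 * can therefore be compared against a subtable by masking and hashing its
 * miniflow with the same mask, which is what dpcls lookup relies on. */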
2927
2928 static inline bool
2929 emc_entry_alive(struct emc_entry *ce)
2930 {
2931 return ce->flow && !ce->flow->dead;
2932 }
2933
2934 static void
2935 emc_clear_entry(struct emc_entry *ce)
2936 {
2937 if (ce->flow) {
2938 dp_netdev_flow_unref(ce->flow);
2939 ce->flow = NULL;
2940 }
2941 }
2942
2943 static inline void
2944 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2945 const struct netdev_flow_key *key)
2946 {
2947 if (ce->flow != flow) {
2948 if (ce->flow) {
2949 dp_netdev_flow_unref(ce->flow);
2950 }
2951
2952 if (dp_netdev_flow_ref(flow)) {
2953 ce->flow = flow;
2954 } else {
2955 ce->flow = NULL;
2956 }
2957 }
2958 if (key) {
2959 netdev_flow_key_clone(&ce->key, key);
2960 }
2961 }
2962
2963 static inline void
2964 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2965 struct dp_netdev_flow *flow)
2966 {
2967 struct emc_entry *to_be_replaced = NULL;
2968 struct emc_entry *current_entry;
2969
2970 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2971 if (netdev_flow_key_equal(&current_entry->key, key)) {
2972 /* We found the entry with the 'mf' miniflow */
2973 emc_change_entry(current_entry, flow, NULL);
2974 return;
2975 }
2976
2977         /* Replacement policy: put the flow in an empty (not alive) entry or,
2978          * failing that, in the entry with the lowest key hash. */
2979 if (!to_be_replaced
2980 || (emc_entry_alive(to_be_replaced)
2981 && !emc_entry_alive(current_entry))
2982 || current_entry->key.hash < to_be_replaced->key.hash) {
2983 to_be_replaced = current_entry;
2984 }
2985 }
2986     /* We didn't find the miniflow in the cache.
2987      * The 'to_be_replaced' entry is where the new flow will be stored. */
2988
2989 emc_change_entry(to_be_replaced, flow, key);
2990 }
2991
2992 static inline void
2993 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2994 const struct netdev_flow_key *key,
2995 struct dp_netdev_flow *flow)
2996 {
2997     /* Insert an entry into the EMC based on probability value 'min'.  By
2998      * default the value is UINT32_MAX / 100, which yields an insertion
2999      * probability of 1/100, i.e., 1%. */
3000
3001 uint32_t min = pmd->ctx.emc_insert_min;
3002
3003 if (min && random_uint32() <= min) {
3004 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
3005 }
3006 }
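
/* Numeric example (default configuration, assuming the inverse probability is
 * taken from the emc-insert-inv-prob setting): an inverse probability of 100
 * gives min = UINT32_MAX / 100, and random_uint32() is uniform over
 * [0, UINT32_MAX], so the test above accepts roughly 1 in 100 candidates:
 *
 *     if (min && random_uint32() <= min) {
 *         emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
 *     }
 *
 * An inverse probability of 1 makes min == UINT32_MAX, restoring
 * unconditional insertion, and 0 disables EMC insertion entirely. */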
3007
3008 static inline struct dp_netdev_flow *
3009 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
3010 {
3011 struct emc_entry *current_entry;
3012
3013 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3014 if (current_entry->key.hash == key->hash
3015 && emc_entry_alive(current_entry)
3016 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
3017
3018 /* We found the entry with the 'key->mf' miniflow */
3019 return current_entry->flow;
3020 }
3021 }
3022
3023 return NULL;
3024 }
3025
3026 static inline const struct cmap_node *
3027 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3028 {
3029 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3030 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3031 uint16_t sig = hash >> 16;
3032 uint16_t index = UINT16_MAX;
3033
3034 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3035 if (bucket->sig[i] == sig) {
3036 index = bucket->flow_idx[i];
3037 break;
3038 }
3039 }
3040 if (index != UINT16_MAX) {
3041 return cmap_find_by_index(&pmd->flow_table, index);
3042 }
3043 return NULL;
3044 }
3045
3046 static void
3047 smc_clear_entry(struct smc_bucket *b, int idx)
3048 {
3049 b->flow_idx[idx] = UINT16_MAX;
3050 }
3051
3052 /* Insert the flow_table index into the SMC.  Insertion may fail when 1) the
3053  * SMC is turned off, or 2) the flow_table index is larger than a uint16_t
3054  * can hold.  If an SMC entry with the same signature already exists, its
3055  * index is updated.  Otherwise an empty entry is taken if one is available;
3056  * if there is neither a matching signature nor an empty entry, a random
3057  * entry in the hashed bucket is overwritten. */
3058 static inline void
3059 smc_insert(struct dp_netdev_pmd_thread *pmd,
3060 const struct netdev_flow_key *key,
3061 uint32_t hash)
3062 {
3063 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3064 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3065 uint16_t index;
3066 uint32_t cmap_index;
3067 bool smc_enable_db;
3068 int i;
3069
3070 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
3071 if (!smc_enable_db) {
3072 return;
3073 }
3074
3075 cmap_index = cmap_find_index(&pmd->flow_table, hash);
3076 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3077
3078 /* If the index is larger than SMC can handle (uint16_t), we don't
3079 * insert */
3080 if (index == UINT16_MAX) {
3081 return;
3082 }
3083
3084     /* If an entry with the same signature exists, update its index. */
3085 uint16_t sig = key->hash >> 16;
3086 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3087 if (bucket->sig[i] == sig) {
3088 bucket->flow_idx[i] = index;
3089 return;
3090 }
3091 }
3092 /* If there is an empty entry, occupy it. */
3093 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3094 if (bucket->flow_idx[i] == UINT16_MAX) {
3095 bucket->sig[i] = sig;
3096 bucket->flow_idx[i] = index;
3097 return;
3098 }
3099 }
3100 /* Otherwise, pick a random entry. */
3101 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3102 bucket->sig[i] = sig;
3103 bucket->flow_idx[i] = index;
3104 }
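
/* Worked example of the bucket layout (restating the code above): for a key
 * hash of 0xABCD1234, the bucket is buckets[0xABCD1234 & SMC_MASK] and the
 * 16-bit signature stored next to the flow index is the top half of the hash:
 *
 *     uint32_t hash = 0xABCD1234;
 *     uint16_t sig  = hash >> 16;        // 0xABCD
 *
 * A later smc_entry_get() with the same hash re-derives 'sig', scans the
 * SMC_ENTRY_PER_BUCKET slots for it, and maps the stored 16-bit index back to
 * a flow with cmap_find_by_index(). */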
3105
3106 static struct dp_netdev_flow *
3107 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3108 const struct netdev_flow_key *key,
3109 int *lookup_num_p)
3110 {
3111 struct dpcls *cls;
3112 struct dpcls_rule *rule;
3113 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3114 in_port.odp_port));
3115 struct dp_netdev_flow *netdev_flow = NULL;
3116
3117 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3118 if (OVS_LIKELY(cls)) {
3119 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3120 netdev_flow = dp_netdev_flow_cast(rule);
3121 }
3122 return netdev_flow;
3123 }
3124
3125 static struct dp_netdev_flow *
3126 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3127 const ovs_u128 *ufidp, const struct nlattr *key,
3128 size_t key_len)
3129 {
3130 struct dp_netdev_flow *netdev_flow;
3131 struct flow flow;
3132 ovs_u128 ufid;
3133
3134 /* If a UFID is not provided, determine one based on the key. */
3135 if (!ufidp && key && key_len
3136 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3137 odp_flow_key_hash(&flow, sizeof flow, &ufid);
3138 ufidp = &ufid;
3139 }
3140
3141 if (ufidp) {
3142 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3143 &pmd->flow_table) {
3144 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3145 return netdev_flow;
3146 }
3147 }
3148 }
3149
3150 return NULL;
3151 }
3152
3153 static bool
3154 dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3155 const struct dp_netdev_flow *netdev_flow,
3156 struct dpif_flow_stats *stats,
3157 struct dpif_flow_attrs *attrs)
3158 {
3159 uint64_t act_buf[1024 / 8];
3160 struct nlattr *actions;
3161 struct netdev *netdev;
3162 struct match match;
3163 struct ofpbuf buf;
3164
3165 int ret = 0;
3166
3167 if (!netdev_is_flow_api_enabled()) {
3168 return false;
3169 }
3170
3171 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port,
3172 dpif_normalize_type(dp->class->type));
3173 if (!netdev) {
3174 return false;
3175 }
3176 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3177 /* Taking a global 'port_mutex' to fulfill thread safety
3178 * restrictions for the netdev-offload-dpdk module. */
3179 ovs_mutex_lock(&dp->port_mutex);
3180 ret = netdev_flow_get(netdev, &match, &actions, &netdev_flow->mega_ufid,
3181 stats, attrs, &buf);
3182 ovs_mutex_unlock(&dp->port_mutex);
3183 netdev_close(netdev);
3184 if (ret) {
3185 return false;
3186 }
3187
3188 return true;
3189 }
3190
3191 static void
3192 get_dpif_flow_status(const struct dp_netdev *dp,
3193 const struct dp_netdev_flow *netdev_flow_,
3194 struct dpif_flow_stats *stats,
3195 struct dpif_flow_attrs *attrs)
3196 {
3197 struct dpif_flow_stats offload_stats;
3198 struct dpif_flow_attrs offload_attrs;
3199 struct dp_netdev_flow *netdev_flow;
3200 unsigned long long n;
3201 long long used;
3202 uint16_t flags;
3203
3204 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3205
3206 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3207 stats->n_packets = n;
3208 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3209 stats->n_bytes = n;
3210 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3211 stats->used = used;
3212 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3213 stats->tcp_flags = flags;
3214
3215 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3216 &offload_stats, &offload_attrs)) {
3217 stats->n_packets += offload_stats.n_packets;
3218 stats->n_bytes += offload_stats.n_bytes;
3219 stats->used = MAX(stats->used, offload_stats.used);
3220 stats->tcp_flags |= offload_stats.tcp_flags;
3221 if (attrs) {
3222 attrs->offloaded = offload_attrs.offloaded;
3223 attrs->dp_layer = offload_attrs.dp_layer;
3224 }
3225 } else if (attrs) {
3226 attrs->offloaded = false;
3227 attrs->dp_layer = "ovs";
3228 }
3229 }
3230
3231 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3232 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3233 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3234 * protect them. */
3235 static void
3236 dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3237 const struct dp_netdev_flow *netdev_flow,
3238 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3239 struct dpif_flow *flow, bool terse)
3240 {
3241 if (terse) {
3242 memset(flow, 0, sizeof *flow);
3243 } else {
3244 struct flow_wildcards wc;
3245 struct dp_netdev_actions *actions;
3246 size_t offset;
3247 struct odp_flow_key_parms odp_parms = {
3248 .flow = &netdev_flow->flow,
3249 .mask = &wc.masks,
3250 .support = dp_netdev_support,
3251 };
3252
3253 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3254         /* in_port is exact-matched, but we have left it out of the mask for
3255          * optimization reasons.  Add in_port back to the mask. */
3256 wc.masks.in_port.odp_port = ODPP_NONE;
3257
3258 /* Key */
3259 offset = key_buf->size;
3260 flow->key = ofpbuf_tail(key_buf);
3261 odp_flow_key_from_flow(&odp_parms, key_buf);
3262 flow->key_len = key_buf->size - offset;
3263
3264 /* Mask */
3265 offset = mask_buf->size;
3266 flow->mask = ofpbuf_tail(mask_buf);
3267 odp_parms.key_buf = key_buf;
3268 odp_flow_key_from_mask(&odp_parms, mask_buf);
3269 flow->mask_len = mask_buf->size - offset;
3270
3271 /* Actions */
3272 actions = dp_netdev_flow_get_actions(netdev_flow);
3273 flow->actions = actions->actions;
3274 flow->actions_len = actions->size;
3275 }
3276
3277 flow->ufid = netdev_flow->ufid;
3278 flow->ufid_present = true;
3279 flow->pmd_id = netdev_flow->pmd_id;
3280
3281 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3282 flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3283 }
3284
3285 static int
3286 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3287 const struct nlattr *mask_key,
3288 uint32_t mask_key_len, const struct flow *flow,
3289 struct flow_wildcards *wc, bool probe)
3290 {
3291 enum odp_key_fitness fitness;
3292
3293 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3294 if (fitness) {
3295 if (!probe) {
3296 /* This should not happen: it indicates that
3297 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3298 * disagree on the acceptable form of a mask. Log the problem
3299 * as an error, with enough details to enable debugging. */
3300 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3301
3302 if (!VLOG_DROP_ERR(&rl)) {
3303 struct ds s;
3304
3305 ds_init(&s);
3306 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3307 true);
3308 VLOG_ERR("internal error parsing flow mask %s (%s)",
3309 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3310 ds_destroy(&s);
3311 }
3312 }
3313
3314 return EINVAL;
3315 }
3316
3317 return 0;
3318 }
3319
3320 static int
3321 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3322 struct flow *flow, bool probe)
3323 {
3324 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3325 if (!probe) {
3326 /* This should not happen: it indicates that
3327 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3328 * the acceptable form of a flow. Log the problem as an error,
3329 * with enough details to enable debugging. */
3330 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3331
3332 if (!VLOG_DROP_ERR(&rl)) {
3333 struct ds s;
3334
3335 ds_init(&s);
3336 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3337 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3338 ds_destroy(&s);
3339 }
3340 }
3341
3342 return EINVAL;
3343 }
3344
3345 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3346 return EINVAL;
3347 }
3348
3349 return 0;
3350 }
3351
3352 static int
3353 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3354 {
3355 struct dp_netdev *dp = get_dp_netdev(dpif);
3356 struct dp_netdev_flow *netdev_flow;
3357 struct dp_netdev_pmd_thread *pmd;
3358 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3359 struct hmapx_node *node;
3360 int error = EINVAL;
3361
3362 if (get->pmd_id == PMD_ID_NULL) {
3363 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3364 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3365 dp_netdev_pmd_unref(pmd);
3366 }
3367 }
3368 } else {
3369 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3370 if (!pmd) {
3371 goto out;
3372 }
3373 hmapx_add(&to_find, pmd);
3374 }
3375
3376 if (!hmapx_count(&to_find)) {
3377 goto out;
3378 }
3379
3380 HMAPX_FOR_EACH (node, &to_find) {
3381 pmd = (struct dp_netdev_pmd_thread *) node->data;
3382 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3383 get->key_len);
3384 if (netdev_flow) {
3385 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3386 get->buffer, get->flow, false);
3387 error = 0;
3388 break;
3389 } else {
3390 error = ENOENT;
3391 }
3392 }
3393
3394 HMAPX_FOR_EACH (node, &to_find) {
3395 pmd = (struct dp_netdev_pmd_thread *) node->data;
3396 dp_netdev_pmd_unref(pmd);
3397 }
3398 out:
3399 hmapx_destroy(&to_find);
3400 return error;
3401 }
3402
3403 static void
3404 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3405 {
3406 struct flow masked_flow;
3407 size_t i;
3408
3409 for (i = 0; i < sizeof(struct flow); i++) {
3410 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3411 ((uint8_t *)&match->wc)[i];
3412 }
3413 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3414 }
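
/* Note on the loop above (assumption: 'masks' is the first member of struct
 * flow_wildcards, which is why byte-indexing '&match->wc' works): the mega
 * UFID is a hash of the byte-wise AND of the flow with its wildcard mask,
 *
 *     masked_flow[i] = flow[i] & wc.masks[i];    // for every byte i
 *     odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
 *
 * so all flows that match the same megaflow (same values under the same mask)
 * share a mega UFID, which is the key used by the offload maps above. */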
3415
3416 static struct dp_netdev_flow *
3417 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3418 struct match *match, const ovs_u128 *ufid,
3419 const struct nlattr *actions, size_t actions_len)
3420 OVS_REQUIRES(pmd->flow_mutex)
3421 {
3422 struct ds extra_info = DS_EMPTY_INITIALIZER;
3423 struct dp_netdev_flow *flow;
3424 struct netdev_flow_key mask;
3425 struct dpcls *cls;
3426 size_t unit;
3427
3428 /* Make sure in_port is exact matched before we read it. */
3429 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3430 odp_port_t in_port = match->flow.in_port.odp_port;
3431
3432 /* As we select the dpcls based on the port number, each netdev flow
3433 * belonging to the same dpcls will have the same odp_port value.
3434 * For performance reasons we wildcard odp_port here in the mask. In the
3435 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3436 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3437 * will not be part of the subtable mask.
3438 * This will speed up the hash computation during dpcls_lookup() because
3439 * there is one less call to hash_add64() in this case. */
3440 match->wc.masks.in_port.odp_port = 0;
3441 netdev_flow_mask_init(&mask, match);
3442 match->wc.masks.in_port.odp_port = ODPP_NONE;
3443
3444 /* Make sure wc does not have metadata. */
3445 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3446 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3447
3448 /* Do not allocate extra space. */
3449 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3450 memset(&flow->stats, 0, sizeof flow->stats);
3451 flow->dead = false;
3452 flow->batch = NULL;
3453 flow->mark = INVALID_FLOW_MARK;
3454 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3455 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3456 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3457 ovs_refcount_init(&flow->ref_cnt);
3458 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3459
3460 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3461 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3462
3463 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3464 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3465 dpcls_insert(cls, &flow->cr, &mask);
3466
3467 ds_put_cstr(&extra_info, "miniflow_bits(");
3468 FLOWMAP_FOR_EACH_UNIT (unit) {
3469 if (unit) {
3470 ds_put_char(&extra_info, ',');
3471 }
3472 ds_put_format(&extra_info, "%d",
3473 count_1bits(flow->cr.mask->mf.map.bits[unit]));
3474 }
3475 ds_put_char(&extra_info, ')');
3476 flow->dp_extra_info = ds_steal_cstr(&extra_info);
3477 ds_destroy(&extra_info);
3478
3479 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3480 dp_netdev_flow_hash(&flow->ufid));
3481
3482 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3483
3484 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3485 struct ds ds = DS_EMPTY_INITIALIZER;
3486 struct ofpbuf key_buf, mask_buf;
3487 struct odp_flow_key_parms odp_parms = {
3488 .flow = &match->flow,
3489 .mask = &match->wc.masks,
3490 .support = dp_netdev_support,
3491 };
3492
3493 ofpbuf_init(&key_buf, 0);
3494 ofpbuf_init(&mask_buf, 0);
3495
3496 odp_flow_key_from_flow(&odp_parms, &key_buf);
3497 odp_parms.key_buf = &key_buf;
3498 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3499
3500 ds_put_cstr(&ds, "flow_add: ");
3501 odp_format_ufid(ufid, &ds);
3502 ds_put_cstr(&ds, " mega_");
3503 odp_format_ufid(&flow->mega_ufid, &ds);
3504 ds_put_cstr(&ds, " ");
3505 odp_flow_format(key_buf.data, key_buf.size,
3506 mask_buf.data, mask_buf.size,
3507 NULL, &ds, false);
3508 ds_put_cstr(&ds, ", actions:");
3509 format_odp_actions(&ds, actions, actions_len, NULL);
3510
3511 VLOG_DBG("%s", ds_cstr(&ds));
3512
3513 ofpbuf_uninit(&key_buf);
3514 ofpbuf_uninit(&mask_buf);
3515
3516 /* Add a printout of the actual match installed. */
3517 struct match m;
3518 ds_clear(&ds);
3519 ds_put_cstr(&ds, "flow match: ");
3520 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3521 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3522 memset(&m.tun_md, 0, sizeof m.tun_md);
3523 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3524
3525 VLOG_DBG("%s", ds_cstr(&ds));
3526
3527 ds_destroy(&ds);
3528 }
3529
3530 return flow;
3531 }
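
/* Sketch of the effect of the in_port trick above: with in_port wildcarded in
 * 'mask', the {dp_hash, in_port} u64 chunk is usually all zero, so
 * netdev_flow_mask_init() leaves it out of the subtable mask and
 * dpcls_lookup() hashes one u64 less per packet.  Correctness is preserved
 * because the dpcls instance itself is already selected by in_port:
 *
 *     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 *     dpcls_lookup(cls, &key, &rule, 1, NULL);
 */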
3532
3533 static int
3534 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3535 struct netdev_flow_key *key,
3536 struct match *match,
3537 ovs_u128 *ufid,
3538 const struct dpif_flow_put *put,
3539 struct dpif_flow_stats *stats)
3540 {
3541 struct dp_netdev_flow *netdev_flow;
3542 int error = 0;
3543
3544 if (stats) {
3545 memset(stats, 0, sizeof *stats);
3546 }
3547
3548 ovs_mutex_lock(&pmd->flow_mutex);
3549 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3550 if (!netdev_flow) {
3551 if (put->flags & DPIF_FP_CREATE) {
3552 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3553 put->actions_len);
3554 } else {
3555 error = ENOENT;
3556 }
3557 } else {
3558 if (put->flags & DPIF_FP_MODIFY) {
3559 struct dp_netdev_actions *new_actions;
3560 struct dp_netdev_actions *old_actions;
3561
3562 new_actions = dp_netdev_actions_create(put->actions,
3563 put->actions_len);
3564
3565 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3566 ovsrcu_set(&netdev_flow->actions, new_actions);
3567
3568 queue_netdev_flow_put(pmd, netdev_flow, match,
3569 put->actions, put->actions_len);
3570
3571 if (stats) {
3572 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3573 }
3574 if (put->flags & DPIF_FP_ZERO_STATS) {
3575             /* XXX: The userspace datapath uses thread-local statistics
3576              * (for flows), which should be updated only by the owning
3577              * thread.  Since we cannot write to the stats memory here,
3578              * we choose not to support this flag.  Please note:
3579              * - This feature is currently used only by dpctl commands with
3580              *   option --clear.
3581              * - Should the need arise, this operation can be implemented
3582              *   by keeping a base value (to be updated here) for each
3583              *   counter, and subtracting it before outputting the stats. */
3584 error = EOPNOTSUPP;
3585 }
3586
3587 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3588 } else if (put->flags & DPIF_FP_CREATE) {
3589 error = EEXIST;
3590 } else {
3591 /* Overlapping flow. */
3592 error = EINVAL;
3593 }
3594 }
3595 ovs_mutex_unlock(&pmd->flow_mutex);
3596 return error;
3597 }
3598
3599 static int
3600 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3601 {
3602 struct dp_netdev *dp = get_dp_netdev(dpif);
3603 struct netdev_flow_key key, mask;
3604 struct dp_netdev_pmd_thread *pmd;
3605 struct match match;
3606 ovs_u128 ufid;
3607 int error;
3608 bool probe = put->flags & DPIF_FP_PROBE;
3609
3610 if (put->stats) {
3611 memset(put->stats, 0, sizeof *put->stats);
3612 }
3613 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3614 probe);
3615 if (error) {
3616 return error;
3617 }
3618 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3619 put->mask, put->mask_len,
3620 &match.flow, &match.wc, probe);
3621 if (error) {
3622 return error;
3623 }
3624
3625 if (put->ufid) {
3626 ufid = *put->ufid;
3627 } else {
3628 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3629 }
3630
3631 /* The Netlink encoding of datapath flow keys cannot express
3632 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3633 * tag is interpreted as exact match on the fact that there is no
3634 * VLAN. Unless we refactor a lot of code that translates between
3635 * Netlink and struct flow representations, we have to do the same
3636 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3637 if (!match.wc.masks.vlans[0].tci) {
3638 match.wc.masks.vlans[0].tci = htons(0xffff);
3639 }
3640
3641 /* Must produce a netdev_flow_key for lookup.
3642 * Use the same method as employed to create the key when adding
3643 * the flow to the dpcls to make sure they match. */
3644 netdev_flow_mask_init(&mask, &match);
3645 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3646
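/* PMD_ID_NULL means the operation applies to every PMD thread: the flow is
 * put on each of them and, on success, the per-thread statistics are
 * aggregated into 'put->stats' (packet and byte counts are summed, 'used'
 * takes the most recent timestamp and 'tcp_flags' the union of the flags). */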
3647 if (put->pmd_id == PMD_ID_NULL) {
3648 if (cmap_count(&dp->poll_threads) == 0) {
3649 return EINVAL;
3650 }
3651 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3652 struct dpif_flow_stats pmd_stats;
3653 int pmd_error;
3654
3655 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3656 &pmd_stats);
3657 if (pmd_error) {
3658 error = pmd_error;
3659 } else if (put->stats) {
3660 put->stats->n_packets += pmd_stats.n_packets;
3661 put->stats->n_bytes += pmd_stats.n_bytes;
3662 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3663 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3664 }
3665 }
3666 } else {
3667 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3668 if (!pmd) {
3669 return EINVAL;
3670 }
3671 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3672 dp_netdev_pmd_unref(pmd);
3673 }
3674
3675 return error;
3676 }
3677
3678 static int
3679 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3680 struct dpif_flow_stats *stats,
3681 const struct dpif_flow_del *del)
3682 {
3683 struct dp_netdev_flow *netdev_flow;
3684 int error = 0;
3685
3686 ovs_mutex_lock(&pmd->flow_mutex);
3687 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3688 del->key_len);
3689 if (netdev_flow) {
3690 if (stats) {
3691 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3692 }
3693 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3694 } else {
3695 error = ENOENT;
3696 }
3697 ovs_mutex_unlock(&pmd->flow_mutex);
3698
3699 return error;
3700 }
3701
3702 static int
3703 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3704 {
3705 struct dp_netdev *dp = get_dp_netdev(dpif);
3706 struct dp_netdev_pmd_thread *pmd;
3707 int error = 0;
3708
3709 if (del->stats) {
3710 memset(del->stats, 0, sizeof *del->stats);
3711 }
3712
3713 if (del->pmd_id == PMD_ID_NULL) {
3714 if (cmap_count(&dp->poll_threads) == 0) {
3715 return EINVAL;
3716 }
3717 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3718 struct dpif_flow_stats pmd_stats;
3719 int pmd_error;
3720
3721 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3722 if (pmd_error) {
3723 error = pmd_error;
3724 } else if (del->stats) {
3725 del->stats->n_packets += pmd_stats.n_packets;
3726 del->stats->n_bytes += pmd_stats.n_bytes;
3727 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3728 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3729 }
3730 }
3731 } else {
3732 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3733 if (!pmd) {
3734 return EINVAL;
3735 }
3736 error = flow_del_on_pmd(pmd, del->stats, del);
3737 dp_netdev_pmd_unref(pmd);
3738 }
3739
3740
3741 return error;
3742 }
3743
3744 struct dpif_netdev_flow_dump {
3745 struct dpif_flow_dump up;
3746 struct cmap_position poll_thread_pos;
3747 struct cmap_position flow_pos;
3748 struct dp_netdev_pmd_thread *cur_pmd;
3749 int status;
3750 struct ovs_mutex mutex;
3751 };
3752
3753 static struct dpif_netdev_flow_dump *
3754 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3755 {
3756 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3757 }
3758
3759 static struct dpif_flow_dump *
3760 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3761 struct dpif_flow_dump_types *types OVS_UNUSED)
3762 {
3763 struct dpif_netdev_flow_dump *dump;
3764
3765 dump = xzalloc(sizeof *dump);
3766 dpif_flow_dump_init(&dump->up, dpif_);
3767 dump->up.terse = terse;
3768 ovs_mutex_init(&dump->mutex);
3769
3770 return &dump->up;
3771 }
3772
3773 static int
3774 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3775 {
3776 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3777
3778 ovs_mutex_destroy(&dump->mutex);
3779 free(dump);
3780 return 0;
3781 }
3782
3783 struct dpif_netdev_flow_dump_thread {
3784 struct dpif_flow_dump_thread up;
3785 struct dpif_netdev_flow_dump *dump;
3786 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3787 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3788 };
3789
3790 static struct dpif_netdev_flow_dump_thread *
3791 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3792 {
3793 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3794 }
3795
3796 static struct dpif_flow_dump_thread *
3797 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3798 {
3799 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3800 struct dpif_netdev_flow_dump_thread *thread;
3801
3802 thread = xmalloc(sizeof *thread);
3803 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3804 thread->dump = dump;
3805 return &thread->up;
3806 }
3807
3808 static void
3809 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3810 {
3811 struct dpif_netdev_flow_dump_thread *thread
3812 = dpif_netdev_flow_dump_thread_cast(thread_);
3813
3814 free(thread);
3815 }
3816
3817 static int
3818 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3819 struct dpif_flow *flows, int max_flows)
3820 {
3821 struct dpif_netdev_flow_dump_thread *thread
3822 = dpif_netdev_flow_dump_thread_cast(thread_);
3823 struct dpif_netdev_flow_dump *dump = thread->dump;
3824 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3825 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3826 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3827 int n_flows = 0;
3828 int i;
3829
3830 ovs_mutex_lock(&dump->mutex);
3831 if (!dump->status) {
3832 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3833 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3834
3835 /* The first call to dump_next() extracts the first pmd thread.
3836 * If there is no pmd thread, return immediately. */
3837 if (!pmd) {
3838 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3839 if (!pmd) {
3840 ovs_mutex_unlock(&dump->mutex);
3841 return n_flows;
3843 }
3844 }
3845
3846 do {
3847 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3848 struct cmap_node *node;
3849
3850 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3851 if (!node) {
3852 break;
3853 }
3854 netdev_flows[n_flows] = CONTAINER_OF(node,
3855 struct dp_netdev_flow,
3856 node);
3857 }
3858 /* When we finish dumping the current pmd thread, move on to
3859 * the next one. */
3860 if (n_flows < flow_limit) {
3861 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3862 dp_netdev_pmd_unref(pmd);
3863 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3864 if (!pmd) {
3865 dump->status = EOF;
3866 break;
3867 }
3868 }
3869 /* Keep the reference for the next call. */
3870 dump->cur_pmd = pmd;
3871
3872 /* If the current dump is empty, do not exit the loop, since the
3873 * remaining pmds could have flows to be dumped. Just dump again
3874 * on the new 'pmd'. */
3875 } while (!n_flows);
3876 }
3877 ovs_mutex_unlock(&dump->mutex);
3878
3879 for (i = 0; i < n_flows; i++) {
3880 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3881 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3882 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3883 struct dpif_flow *f = &flows[i];
3884 struct ofpbuf key, mask;
3885
3886 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3887 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3888 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
3889 dump->up.terse);
3890 }
3891
3892 return n_flows;
3893 }
3894
3895 static int
3896 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3897 OVS_NO_THREAD_SAFETY_ANALYSIS
3898 {
3899 struct dp_netdev *dp = get_dp_netdev(dpif);
3900 struct dp_netdev_pmd_thread *pmd;
3901 struct dp_packet_batch pp;
3902
3903 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3904 dp_packet_size(execute->packet) > UINT16_MAX) {
3905 return EINVAL;
3906 }
3907
3908 /* Try to find the 'pmd'. If NULL is returned, that means
3909 * the current thread is a non-pmd thread and should use
3910 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3911 pmd = ovsthread_getspecific(dp->per_pmd_key);
3912 if (!pmd) {
3913 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3914 if (!pmd) {
3915 return EBUSY;
3916 }
3917 }
3918
3919 if (execute->probe) {
3920 /* If this is part of a probe, drop the packet, since executing
3921 * the action may actually cause spurious packets to be sent into
3922 * the network. */
3923 if (pmd->core_id == NON_PMD_CORE_ID) {
3924 dp_netdev_pmd_unref(pmd);
3925 }
3926 return 0;
3927 }
3928
3929 /* If the current thread is a non-pmd thread, acquire
3930 * the 'non_pmd_mutex'. */
3931 if (pmd->core_id == NON_PMD_CORE_ID) {
3932 ovs_mutex_lock(&dp->non_pmd_mutex);
3933 }
3934
3935 /* Update current time in PMD context. We don't care about EMC insertion
3936 * probability, because we are on a slow path. */
3937 pmd_thread_ctx_time_update(pmd);
3938
3939 /* The action processing expects the RSS hash to be valid, because
3940 * it's always initialized at the beginning of datapath processing.
3941 * In this case, though, 'execute->packet' may not have gone through
3942 * the datapath at all; it may have been generated by the upper layer
3943 * (OpenFlow packet-out, BFD frame, ...). */
3944 if (!dp_packet_rss_valid(execute->packet)) {
3945 dp_packet_set_rss_hash(execute->packet,
3946 flow_hash_5tuple(execute->flow, 0));
3947 }
3948
3949 dp_packet_batch_init_packet(&pp, execute->packet);
3950 pp.do_not_steal = true;
3951 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3952 execute->actions, execute->actions_len);
3953 dp_netdev_pmd_flush_output_packets(pmd, true);
3954
3955 if (pmd->core_id == NON_PMD_CORE_ID) {
3956 ovs_mutex_unlock(&dp->non_pmd_mutex);
3957 dp_netdev_pmd_unref(pmd);
3958 }
3959
3960 return 0;
3961 }
3962
3963 static void
3964 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3965 enum dpif_offload_type offload_type OVS_UNUSED)
3966 {
3967 size_t i;
3968
3969 for (i = 0; i < n_ops; i++) {
3970 struct dpif_op *op = ops[i];
3971
3972 switch (op->type) {
3973 case DPIF_OP_FLOW_PUT:
3974 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3975 break;
3976
3977 case DPIF_OP_FLOW_DEL:
3978 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3979 break;
3980
3981 case DPIF_OP_EXECUTE:
3982 op->error = dpif_netdev_execute(dpif, &op->execute);
3983 break;
3984
3985 case DPIF_OP_FLOW_GET:
3986 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3987 break;
3988 }
3989 }
3990 }
3991
3992 /* Enable or Disable PMD auto load balancing. */
3993 static void
3994 set_pmd_auto_lb(struct dp_netdev *dp)
3995 {
3996 unsigned int cnt = 0;
3997 struct dp_netdev_pmd_thread *pmd;
3998 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3999
4000 bool enable_alb = false;
4001 bool multi_rxq = false;
4002 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
4003
4004 /* Ensure that there are at least 2 non-isolated PMDs and that
4005 * one of them is polling more than one rxq. */
4006 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4007 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4008 continue;
4009 }
4010
4011 if (hmap_count(&pmd->poll_list) > 1) {
4012 multi_rxq = true;
4013 }
4014 if (cnt && multi_rxq) {
4015 enable_alb = true;
4016 break;
4017 }
4018 cnt++;
4019 }
4020
4021 /* Enable auto LB only if it is requested and cycle-based assignment is in use. */
4022 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
4023 pmd_alb->auto_lb_requested;
4024
4025 if (pmd_alb->is_enabled != enable_alb) {
4026 pmd_alb->is_enabled = enable_alb;
4027 if (pmd_alb->is_enabled) {
4028 VLOG_INFO("PMD auto load balance is enabled "
4029 "(with rebalance interval:%"PRIu64" msec)",
4030 pmd_alb->rebalance_intvl);
4031 } else {
4032 pmd_alb->rebalance_poll_timer = 0;
4033 VLOG_INFO("PMD auto load balance is disabled");
4034 }
4035 }
4036
4037 }
4038
4039 /* Applies datapath configuration from the database. Some of the changes are
4040 * actually applied in dpif_netdev_run(). */
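/* Illustrative examples (not exhaustive): the options read below live in the
 * 'other_config' column of the Open_vSwitch table and can be set with
 * commands such as
 *     ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x6
 *     ovs-vsctl set Open_vSwitch . other_config:emc-insert-inv-prob=20
 *     ovs-vsctl set Open_vSwitch . other_config:pmd-auto-lb=true
 * The key names match the smap_get*() calls in this function. */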
4041 static int
4042 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
4043 {
4044 struct dp_netdev *dp = get_dp_netdev(dpif);
4045 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
4046 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4047 "cycles");
4048 unsigned long long insert_prob =
4049 smap_get_ullong(other_config, "emc-insert-inv-prob",
4050 DEFAULT_EM_FLOW_INSERT_INV_PROB);
4051 uint32_t insert_min, cur_min;
4052 uint32_t tx_flush_interval, cur_tx_flush_interval;
4053 uint64_t rebalance_intvl;
4054
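/* 'tx-flush-interval' is the maximum time, in microseconds, that output
 * packets may sit in a per-port output batch before being flushed; the
 * default of 0 (DEFAULT_TX_FLUSH_INTERVAL) means packets are sent
 * immediately. */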
4055 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4056 DEFAULT_TX_FLUSH_INTERVAL);
4057 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4058 if (tx_flush_interval != cur_tx_flush_interval) {
4059 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4060 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4061 tx_flush_interval);
4062 }
4063
4064 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4065 free(dp->pmd_cmask);
4066 dp->pmd_cmask = nullable_xstrdup(cmask);
4067 dp_netdev_request_reconfigure(dp);
4068 }
4069
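/* 'emc-insert-inv-prob' is the inverse probability N of EMC insertion:
 * flows are inserted into the EMC with probability ~1/N, implemented by
 * comparing against 'insert_min' = UINT32_MAX / N.  For example, N = 100
 * gives roughly a 1% insertion probability, while N = 0 disables EMC
 * insertion entirely. */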
4070 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4071 if (insert_prob <= UINT32_MAX) {
4072 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4073 } else {
4074 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4075 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4076 }
4077
4078 if (insert_min != cur_min) {
4079 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4080 if (insert_min == 0) {
4081 VLOG_INFO("EMC insertion probability changed to zero");
4082 } else {
4083 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4084 insert_prob, (100 / (float)insert_prob));
4085 }
4086 }
4087
4088 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4089 bool cur_perf_enabled;
4090 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4091 if (perf_enabled != cur_perf_enabled) {
4092 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4093 if (perf_enabled) {
4094 VLOG_INFO("PMD performance metrics collection enabled");
4095 } else {
4096 VLOG_INFO("PMD performance metrics collection disabled");
4097 }
4098 }
4099
4100 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4101 bool cur_smc;
4102 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4103 if (smc_enable != cur_smc) {
4104 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4105 if (smc_enable) {
4106 VLOG_INFO("SMC cache is enabled");
4107 } else {
4108 VLOG_INFO("SMC cache is disabled");
4109 }
4110 }
4111
4112 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
4113 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
4114 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
4115 "Defaulting to 'cycles'.");
4116 pmd_rxq_assign_cyc = true;
4117 pmd_rxq_assign = "cycles";
4118 }
4119 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
4120 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
4121 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4122 pmd_rxq_assign);
4123 dp_netdev_request_reconfigure(dp);
4124 }
4125
4126 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4127 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
4128 false);
4129
4130 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
4131 ALB_PMD_REBALANCE_POLL_INTERVAL);
4132
4133 /* Input is in min, convert it to msec. */
4134 rebalance_intvl =
4135 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
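/* For example, pmd-auto-lb-rebal-interval=5 becomes 5 * 60000 = 300000 msec,
 * and a configured value of 0 falls back to the one minute default. */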
4136
4137 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4138 pmd_alb->rebalance_intvl = rebalance_intvl;
4139 }
4140
4141 set_pmd_auto_lb(dp);
4142 return 0;
4143 }
4144
4145 /* Parses affinity list and returns result in 'core_ids'. */
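/* The list is a comma-separated set of "rxq:core" pairs; for example, the
 * (illustrative) string "0:3,1:7" pins rx queue 0 to core 3 and rx queue 1
 * to core 7.  Pairs whose rxq id is >= 'n_rxq' are silently ignored. */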
4146 static int
4147 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4148 {
4149 unsigned i;
4150 char *list, *copy, *key, *value;
4151 int error = 0;
4152
4153 for (i = 0; i < n_rxq; i++) {
4154 core_ids[i] = OVS_CORE_UNSPEC;
4155 }
4156
4157 if (!affinity_list) {
4158 return 0;
4159 }
4160
4161 list = copy = xstrdup(affinity_list);
4162
4163 while (ofputil_parse_key_value(&list, &key, &value)) {
4164 int rxq_id, core_id;
4165
4166 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4167 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4168 error = EINVAL;
4169 break;
4170 }
4171
4172 if (rxq_id < n_rxq) {
4173 core_ids[rxq_id] = core_id;
4174 }
4175 }
4176
4177 free(copy);
4178 return error;
4179 }
4180
4181 /* Parses 'affinity_list' and applies configuration if it is valid. */
4182 static int
4183 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4184 const char *affinity_list)
4185 {
4186 unsigned *core_ids, i;
4187 int error = 0;
4188
4189 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4190 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4191 error = EINVAL;
4192 goto exit;
4193 }
4194
4195 for (i = 0; i < port->n_rxq; i++) {
4196 port->rxqs[i].core_id = core_ids[i];
4197 }
4198
4199 exit:
4200 free(core_ids);
4201 return error;
4202 }
4203
4204 /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4205 * of the given PMD thread. */
4206 static bool
4207 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4208 struct dp_netdev_port *port)
4209 OVS_EXCLUDED(pmd->port_mutex)
4210 {
4211 struct rxq_poll *poll;
4212 bool found = false;
4213
4214 ovs_mutex_lock(&pmd->port_mutex);
4215 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4216 if (port == poll->rxq->port) {
4217 found = true;
4218 break;
4219 }
4220 }
4221 ovs_mutex_unlock(&pmd->port_mutex);
4222 return found;
4223 }
4224
4225 /* Updates port configuration from the database. The changes are actually
4226 * applied in dpif_netdev_run(). */
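/* Illustrative example: the per-interface options handled here can be set
 * with commands such as
 *     ovs-vsctl set Interface dpdk0 other_config:emc-enable=false
 *     ovs-vsctl set Interface dpdk0 other_config:pmd-rxq-affinity="0:3,1:7"
 * where "dpdk0" is a hypothetical port name. */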
4227 static int
4228 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4229 const struct smap *cfg)
4230 {
4231 struct dp_netdev *dp = get_dp_netdev(dpif);
4232 struct dp_netdev_port *port;
4233 int error = 0;
4234 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4235 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4236
4237 ovs_mutex_lock(&dp->port_mutex);
4238 error = get_port_by_number(dp, port_no, &port);
4239 if (error) {
4240 goto unlock;
4241 }
4242
4243 if (emc_enabled != port->emc_enabled) {
4244 struct dp_netdev_pmd_thread *pmd;
4245 struct ds ds = DS_EMPTY_INITIALIZER;
4246 uint32_t cur_min, insert_prob;
4247
4248 port->emc_enabled = emc_enabled;
4249 /* Mark for reload all the threads that poll this port and request
4250 * a reconfiguration to actually reload the threads. */
4251 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4252 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4253 pmd->need_reload = true;
4254 }
4255 }
4256 dp_netdev_request_reconfigure(dp);
4257
4258 ds_put_format(&ds, "%s: EMC has been %s.",
4259 netdev_get_name(port->netdev),
4260 (emc_enabled) ? "enabled" : "disabled");
4261 if (emc_enabled) {
4262 ds_put_cstr(&ds, " Current insertion probability is ");
4263 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4264 if (!cur_min) {
4265 ds_put_cstr(&ds, "zero.");
4266 } else {
4267 insert_prob = UINT32_MAX / cur_min;
4268 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4269 insert_prob, 100 / (float) insert_prob);
4270 }
4271 }
4272 VLOG_INFO("%s", ds_cstr(&ds));
4273 ds_destroy(&ds);
4274 }
4275
4276 /* Check for Rxq affinity changes. */
4277 if (!netdev_is_pmd(port->netdev)
4278 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4279 goto unlock;
4280 }
4281
4282 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4283 if (error) {
4284 goto unlock;
4285 }
4286 free(port->rxq_affinity_list);
4287 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4288
4289 dp_netdev_request_reconfigure(dp);
4290 unlock:
4291 ovs_mutex_unlock(&dp->port_mutex);
4292 return error;
4293 }
4294
4295 static int
4296 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4297 uint32_t queue_id, uint32_t *priority)
4298 {
4299 *priority = queue_id;
4300 return 0;
4301 }
4302
4303 \f
4304 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4305 * a copy of the 'size' bytes of 'actions' input parameters. */
4306 struct dp_netdev_actions *
4307 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4308 {
4309 struct dp_netdev_actions *netdev_actions;
4310
4311 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4312 memcpy(netdev_actions->actions, actions, size);
4313 netdev_actions->size = size;
4314
4315 return netdev_actions;
4316 }
4317
4318 struct dp_netdev_actions *
4319 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4320 {
4321 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4322 }
4323
4324 static void
4325 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4326 {
4327 free(actions);
4328 }
4329 \f
4330 static void
4331 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4332 enum rxq_cycles_counter_type type,
4333 unsigned long long cycles)
4334 {
4335 atomic_store_relaxed(&rx->cycles[type], cycles);
4336 }
4337
4338 static void
4339 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4340 enum rxq_cycles_counter_type type,
4341 unsigned long long cycles)
4342 {
4343 non_atomic_ullong_add(&rx->cycles[type], cycles);
4344 }
4345
4346 static uint64_t
4347 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4348 enum rxq_cycles_counter_type type)
4349 {
4350 unsigned long long processing_cycles;
4351 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4352 return processing_cycles;
4353 }
4354
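/* Stores the latest measured interval for 'rx' in a fixed-size circular
 * buffer: 'intrvl_idx' keeps incrementing and the slot is chosen modulo
 * PMD_RXQ_INTERVAL_MAX, so once the buffer is full the oldest sample is
 * overwritten. */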
4355 static void
4356 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4357 unsigned long long cycles)
4358 {
4359 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4360 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4361 }
4362
4363 static uint64_t
4364 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4365 {
4366 unsigned long long processing_cycles;
4367 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4368 return processing_cycles;
4369 }
4370
4371 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4372 static inline bool
4373 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4374 {
4375 bool pmd_perf_enabled;
4376 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4377 return pmd_perf_enabled;
4378 }
4379 #else
4380 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4381 * performance metrics are not available as locked access to 64 bit
4382 * integers would be prohibitively expensive. */
4383 static inline bool
4384 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4385 {
4386 return false;
4387 }
4388 #endif
4389
4390 static int
4391 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4392 struct tx_port *p)
4393 {
4394 int i;
4395 int tx_qid;
4396 int output_cnt;
4397 bool dynamic_txqs;
4398 struct cycle_timer timer;
4399 uint64_t cycles;
4400 uint32_t tx_flush_interval;
4401
4402 cycle_timer_start(&pmd->perf_stats, &timer);
4403
4404 dynamic_txqs = p->port->dynamic_txqs;
4405 if (dynamic_txqs) {
4406 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4407 } else {
4408 tx_qid = pmd->static_tx_qid;
4409 }
4410
4411 output_cnt = dp_packet_batch_size(&p->output_pkts);
4412 ovs_assert(output_cnt > 0);
4413
4414 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4415 dp_packet_batch_init(&p->output_pkts);
4416
4417 /* Update time of the next flush. */
4418 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4419 p->flush_time = pmd->ctx.now + tx_flush_interval;
4420
4421 ovs_assert(pmd->n_output_batches > 0);
4422 pmd->n_output_batches--;
4423
4424 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4425 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4426
4427 /* Distribute send cycles evenly among transmitted packets and assign to
4428 * their respective rx queues. */
4429 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4430 for (i = 0; i < output_cnt; i++) {
4431 if (p->output_pkts_rxqs[i]) {
4432 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4433 RXQ_CYCLES_PROC_CURR, cycles);
4434 }
4435 }
4436
4437 return output_cnt;
4438 }
4439
4440 static int
4441 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4442 bool force)
4443 {
4444 struct tx_port *p;
4445 int output_cnt = 0;
4446
4447 if (!pmd->n_output_batches) {
4448 return 0;
4449 }
4450
4451 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4452 if (!dp_packet_batch_is_empty(&p->output_pkts)
4453 && (force || pmd->ctx.now >= p->flush_time)) {
4454 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4455 }
4456 }
4457 return output_cnt;
4458 }
4459
4460 static int
4461 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4462 struct dp_netdev_rxq *rxq,
4463 odp_port_t port_no)
4464 {
4465 struct pmd_perf_stats *s = &pmd->perf_stats;
4466 struct dp_packet_batch batch;
4467 struct cycle_timer timer;
4468 int error;
4469 int batch_cnt = 0;
4470 int rem_qlen = 0, *qlen_p = NULL;
4471 uint64_t cycles;
4472
4473 /* Measure duration for polling and processing rx burst. */
4474 cycle_timer_start(&pmd->perf_stats, &timer);
4475
4476 pmd->ctx.last_rxq = rxq;
4477 dp_packet_batch_init(&batch);
4478
4479 /* Fetch the rx queue length only for vhostuser ports. */
4480 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4481 qlen_p = &rem_qlen;
4482 }
4483
4484 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4485 if (!error) {
4486 /* At least one packet received. */
4487 *recirc_depth_get() = 0;
4488 pmd_thread_ctx_time_update(pmd);
4489 batch_cnt = dp_packet_batch_size(&batch);
4490 if (pmd_perf_metrics_enabled(pmd)) {
4491 /* Update batch histogram. */
4492 s->current.batches++;
4493 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4494 /* Update the maximum vhost rx queue fill level. */
4495 if (rxq->is_vhost && rem_qlen >= 0) {
4496 uint32_t qfill = batch_cnt + rem_qlen;
4497 if (qfill > s->current.max_vhost_qfill) {
4498 s->current.max_vhost_qfill = qfill;
4499 }
4500 }
4501 }
4502 /* Process packet batch. */
4503 dp_netdev_input(pmd, &batch, port_no);
4504
4505 /* Assign processing cycles to rx queue. */
4506 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4507 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4508
4509 dp_netdev_pmd_flush_output_packets(pmd, false);
4510 } else {
4511 /* Discard cycles. */
4512 cycle_timer_stop(&pmd->perf_stats, &timer);
4513 if (error != EAGAIN && error != EOPNOTSUPP) {
4514 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4515
4516 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4517 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4518 }
4519 }
4520
4521 pmd->ctx.last_rxq = NULL;
4522
4523 return batch_cnt;
4524 }
4525
4526 static struct tx_port *
4527 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4528 {
4529 struct tx_port *tx;
4530
4531 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4532 if (tx->port->port_no == port_no) {
4533 return tx;
4534 }
4535 }
4536
4537 return NULL;
4538 }
4539
4540 static struct tx_bond *
4541 tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4542 {
4543 uint32_t hash = hash_bond_id(bond_id);
4544 struct tx_bond *tx;
4545
4546 CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4547 if (tx->bond_id == bond_id) {
4548 return tx;
4549 }
4550 }
4551 return NULL;
4552 }
4553
4554 static int
4555 port_reconfigure(struct dp_netdev_port *port)
4556 {
4557 struct netdev *netdev = port->netdev;
4558 int i, err;
4559
4560 /* Closes the existing 'rxq's. */
4561 for (i = 0; i < port->n_rxq; i++) {
4562 netdev_rxq_close(port->rxqs[i].rx);
4563 port->rxqs[i].rx = NULL;
4564 }
4565 unsigned last_nrxq = port->n_rxq;
4566 port->n_rxq = 0;
4567
4568 /* Allows 'netdev' to apply the pending configuration changes. */
4569 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4570 err = netdev_reconfigure(netdev);
4571 if (err && (err != EOPNOTSUPP)) {
4572 VLOG_ERR("Failed to set interface %s new configuration",
4573 netdev_get_name(netdev));
4574 return err;
4575 }
4576 }
4577 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4578 port->rxqs = xrealloc(port->rxqs,
4579 sizeof *port->rxqs * netdev_n_rxq(netdev));
4580 /* Realloc 'used' counters for tx queues. */
4581 free(port->txq_used);
4582 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4583
4584 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4585 bool new_queue = i >= last_nrxq;
4586 if (new_queue) {
4587 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4588 }
4589
4590 port->rxqs[i].port = port;
4591 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4592
4593 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4594 if (err) {
4595 return err;
4596 }
4597 port->n_rxq++;
4598 }
4599
4600 /* Parse affinity list to apply configuration for new queues. */
4601 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4602
4603 /* If reconfiguration was successful, mark it as such, so we can use it. */
4604 port->need_reconfigure = false;
4605
4606 return 0;
4607 }
4608
4609 struct rr_numa_list {
4610 struct hmap numas; /* Contains 'struct rr_numa' */
4611 };
4612
4613 struct rr_numa {
4614 struct hmap_node node;
4615
4616 int numa_id;
4617
4618 /* Non-isolated pmds on numa node 'numa_id'. */
4619 struct dp_netdev_pmd_thread **pmds;
4620 int n_pmds;
4621
4622 int cur_index;
4623 bool idx_inc;
4624 };
4625
4626 static struct rr_numa *
4627 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4628 {
4629 struct rr_numa *numa;
4630
4631 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4632 if (numa->numa_id == numa_id) {
4633 return numa;
4634 }
4635 }
4636
4637 return NULL;
4638 }
4639
4640 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4641 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4642 * Returns NULL if 'rr' numa list is empty. */
4643 static struct rr_numa *
4644 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4645 {
4646 struct hmap_node *node = NULL;
4647
4648 if (numa) {
4649 node = hmap_next(&rr->numas, &numa->node);
4650 }
4651 if (!node) {
4652 node = hmap_first(&rr->numas);
4653 }
4654
4655 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4656 }
4657
4658 static void
4659 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4660 {
4661 struct dp_netdev_pmd_thread *pmd;
4662 struct rr_numa *numa;
4663
4664 hmap_init(&rr->numas);
4665
4666 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4667 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4668 continue;
4669 }
4670
4671 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4672 if (!numa) {
4673 numa = xzalloc(sizeof *numa);
4674 numa->numa_id = pmd->numa_id;
4675 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4676 }
4677 numa->n_pmds++;
4678 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4679 numa->pmds[numa->n_pmds - 1] = pmd;
4680 /* At least one pmd, so initialise 'cur_index' and 'idx_inc'. */
4681 numa->cur_index = 0;
4682 numa->idx_inc = true;
4683 }
4684 }
4685
4686 /*
4687 * Returns the next pmd from the numa node.
4688 *
4689 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4690 * either an up or down walk, switching between up/down when the first or last
4691 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4692 *
4693 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4694 * core reached. e.g. 1,2,3,1,2,3,1,2...
4695 */
4696 static struct dp_netdev_pmd_thread *
4697 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4698 {
4699 int numa_idx = numa->cur_index;
4700
4701 if (numa->idx_inc == true) {
4702 /* Incrementing through list of pmds. */
4703 if (numa->cur_index == numa->n_pmds-1) {
4704 /* Reached the last pmd. */
4705 if (updown) {
4706 numa->idx_inc = false;
4707 } else {
4708 numa->cur_index = 0;
4709 }
4710 } else {
4711 numa->cur_index++;
4712 }
4713 } else {
4714 /* Decrementing through list of pmds. */
4715 if (numa->cur_index == 0) {
4716 /* Reached the first pmd. */
4717 numa->idx_inc = true;
4718 } else {
4719 numa->cur_index--;
4720 }
4721 }
4722 return numa->pmds[numa_idx];
4723 }
4724
4725 static void
4726 rr_numa_list_destroy(struct rr_numa_list *rr)
4727 {
4728 struct rr_numa *numa;
4729
4730 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4731 free(numa->pmds);
4732 free(numa);
4733 }
4734 hmap_destroy(&rr->numas);
4735 }
4736
4737 /* Compares Rx queues by the processing cycles they consume (descending order). */
4738 static int
4739 compare_rxq_cycles(const void *a, const void *b)
4740 {
4741 struct dp_netdev_rxq *qa;
4742 struct dp_netdev_rxq *qb;
4743 uint64_t cycles_qa, cycles_qb;
4744
4745 qa = *(struct dp_netdev_rxq **) a;
4746 qb = *(struct dp_netdev_rxq **) b;
4747
4748 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4749 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4750
4751 if (cycles_qa != cycles_qb) {
4752 return (cycles_qa < cycles_qb) ? 1 : -1;
4753 } else {
4754 /* Cycles are the same so tiebreak on port/queue id.
4755 * Tiebreaking (as opposed to returning 0) ensures consistent
4756 * sort results across multiple OSes. */
4757 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4758 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4759 if (port_qa != port_qb) {
4760 return port_qa > port_qb ? 1 : -1;
4761 } else {
4762 return netdev_rxq_get_queue_id(qa->rx)
4763 - netdev_rxq_get_queue_id(qb->rx);
4764 }
4765 }
4766 }
4767
4768 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4769 * queues and marks the pmds as isolated. Otherwise, assign non isolated
4770 * pmds to unpinned queues.
4771 *
4772 * The function doesn't touch the pmd threads, it just stores the assignment
4773 * in the 'pmd' member of each rxq. */
4774 static void
4775 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4776 {
4777 struct dp_netdev_port *port;
4778 struct rr_numa_list rr;
4779 struct rr_numa *non_local_numa = NULL;
4780 struct dp_netdev_rxq ** rxqs = NULL;
4781 int n_rxqs = 0;
4782 struct rr_numa *numa = NULL;
4783 int numa_id;
4784 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4785
4786 HMAP_FOR_EACH (port, node, &dp->ports) {
4787 if (!netdev_is_pmd(port->netdev)) {
4788 continue;
4789 }
4790
4791 for (int qid = 0; qid < port->n_rxq; qid++) {
4792 struct dp_netdev_rxq *q = &port->rxqs[qid];
4793
4794 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4795 struct dp_netdev_pmd_thread *pmd;
4796
4797 pmd = dp_netdev_get_pmd(dp, q->core_id);
4798 if (!pmd) {
4799 VLOG_WARN("There is no PMD thread on core %d. Queue "
4800 "%d on port \'%s\' will not be polled.",
4801 q->core_id, qid, netdev_get_name(port->netdev));
4802 } else {
4803 q->pmd = pmd;
4804 pmd->isolated = true;
4805 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4806 "rx queue %d.", pmd->core_id, pmd->numa_id,
4807 netdev_rxq_get_name(q->rx),
4808 netdev_rxq_get_queue_id(q->rx));
4809 dp_netdev_pmd_unref(pmd);
4810 }
4811 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4812 uint64_t cycle_hist = 0;
4813
4814 if (n_rxqs == 0) {
4815 rxqs = xmalloc(sizeof *rxqs);
4816 } else {
4817 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4818 }
4819
4820 if (assign_cyc) {
4821 /* Sum the queue intervals and store the cycle history. */
4822 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4823 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4824 }
4825 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4826 cycle_hist);
4827 }
4828 /* Store the queue. */
4829 rxqs[n_rxqs++] = q;
4830 }
4831 }
4832 }
4833
4834 if (n_rxqs > 1 && assign_cyc) {
4835 /* Sort the queues in order of the processing cycles
4836 * they consumed during their last pmd interval. */
4837 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4838 }
4839
4840 rr_numa_list_populate(dp, &rr);
4841 /* Assign the sorted queues to pmds in round robin. */
4842 for (int i = 0; i < n_rxqs; i++) {
4843 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4844 numa = rr_numa_list_lookup(&rr, numa_id);
4845 if (!numa) {
4846 /* There are no pmds on the queue's local NUMA node.
4847 Round robin on the NUMA nodes that do have pmds. */
4848 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4849 if (!non_local_numa) {
4850 VLOG_ERR("There is no available (non-isolated) pmd "
4851 "thread for port \'%s\' queue %d. This queue "
4852 "will not be polled. Is pmd-cpu-mask set to "
4853 "zero? Or are all PMDs isolated to other "
4854 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4855 netdev_rxq_get_queue_id(rxqs[i]->rx));
4856 continue;
4857 }
4858 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
4859 VLOG_WARN("There's no available (non-isolated) pmd thread "
4860 "on numa node %d. Queue %d on port \'%s\' will "
4861 "be assigned to the pmd on core %d "
4862 "(numa node %d). Expect reduced performance.",
4863 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4864 netdev_rxq_get_name(rxqs[i]->rx),
4865 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4866 } else {
4867 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4868 if (assign_cyc) {
4869 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4870 "rx queue %d "
4871 "(measured processing cycles %"PRIu64").",
4872 rxqs[i]->pmd->core_id, numa_id,
4873 netdev_rxq_get_name(rxqs[i]->rx),
4874 netdev_rxq_get_queue_id(rxqs[i]->rx),
4875 dp_netdev_rxq_get_cycles(rxqs[i],
4876 RXQ_CYCLES_PROC_HIST));
4877 } else {
4878 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4879 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4880 netdev_rxq_get_name(rxqs[i]->rx),
4881 netdev_rxq_get_queue_id(rxqs[i]->rx));
4882 }
4883 }
4884 }
4885
4886 rr_numa_list_destroy(&rr);
4887 free(rxqs);
4888 }
4889
4890 static void
4891 reload_affected_pmds(struct dp_netdev *dp)
4892 {
4893 struct dp_netdev_pmd_thread *pmd;
4894
4895 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4896 if (pmd->need_reload) {
4897 flow_mark_flush(pmd);
4898 dp_netdev_reload_pmd__(pmd);
4899 }
4900 }
4901
4902 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4903 if (pmd->need_reload) {
4904 if (pmd->core_id != NON_PMD_CORE_ID) {
4905 bool reload;
4906
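/* Busy-wait until the PMD thread clears 'reload'; the acquire ordering
 * makes the updates it performed while reloading visible to this thread. */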
4907 do {
4908 atomic_read_explicit(&pmd->reload, &reload,
4909 memory_order_acquire);
4910 } while (reload);
4911 }
4912 pmd->need_reload = false;
4913 }
4914 }
4915 }
4916
4917 static void
4918 reconfigure_pmd_threads(struct dp_netdev *dp)
4919 OVS_REQUIRES(dp->port_mutex)
4920 {
4921 struct dp_netdev_pmd_thread *pmd;
4922 struct ovs_numa_dump *pmd_cores;
4923 struct ovs_numa_info_core *core;
4924 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4925 struct hmapx_node *node;
4926 bool changed = false;
4927 bool need_to_adjust_static_tx_qids = false;
4928
4929 /* The pmd threads should be started only if there's a pmd port in the
4930 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4931 * NR_PMD_THREADS per numa node. */
4932 if (!has_pmd_port(dp)) {
4933 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4934 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4935 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4936 } else {
4937 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4938 }
4939
4940 /* We need to adjust 'static_tx_qid's only if we're reducing number of
4941 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4942 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4943 /* Adjustment is required to keep 'static_tx_qid's sequential and
4944 * avoid possible issues, for example, imbalanced tx queue usage
4945 * and unnecessary locking caused by remapping on netdev level. */
4946 need_to_adjust_static_tx_qids = true;
4947 }
4948
4949 /* Check for unwanted pmd threads */
4950 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4951 if (pmd->core_id == NON_PMD_CORE_ID) {
4952 continue;
4953 }
4954 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4955 pmd->core_id)) {
4956 hmapx_add(&to_delete, pmd);
4957 } else if (need_to_adjust_static_tx_qids) {
4958 atomic_store_relaxed(&pmd->reload_tx_qid, true);
4959 pmd->need_reload = true;
4960 }
4961 }
4962
4963 HMAPX_FOR_EACH (node, &to_delete) {
4964 pmd = (struct dp_netdev_pmd_thread *) node->data;
4965 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4966 pmd->numa_id, pmd->core_id);
4967 dp_netdev_del_pmd(dp, pmd);
4968 }
4969 changed = !hmapx_is_empty(&to_delete);
4970 hmapx_destroy(&to_delete);
4971
4972 if (need_to_adjust_static_tx_qids) {
4973 /* 'static_tx_qid's are not sequential now.
4974 * Reload remaining threads to fix this. */
4975 reload_affected_pmds(dp);
4976 }
4977
4978 /* Check for required new pmd threads */
4979 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4980 pmd = dp_netdev_get_pmd(dp, core->core_id);
4981 if (!pmd) {
4982 struct ds name = DS_EMPTY_INITIALIZER;
4983
4984 pmd = xzalloc(sizeof *pmd);
4985 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4986
4987 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
4988 pmd->thread = ovs_thread_create(ds_cstr(&name),
4989 pmd_thread_main, pmd);
4990 ds_destroy(&name);
4991
4992 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4993 pmd->numa_id, pmd->core_id);
4994 changed = true;
4995 } else {
4996 dp_netdev_pmd_unref(pmd);
4997 }
4998 }
4999
5000 if (changed) {
5001 struct ovs_numa_info_numa *numa;
5002
5003 /* Log the number of pmd threads per numa node. */
5004 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5005 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5006 numa->n_cores, numa->numa_id);
5007 }
5008 }
5009
5010 ovs_numa_dump_destroy(pmd_cores);
5011 }
5012
5013 static void
5014 pmd_remove_stale_ports(struct dp_netdev *dp,
5015 struct dp_netdev_pmd_thread *pmd)
5016 OVS_EXCLUDED(pmd->port_mutex)
5017 OVS_REQUIRES(dp->port_mutex)
5018 {
5019 struct rxq_poll *poll, *poll_next;
5020 struct tx_port *tx, *tx_next;
5021
5022 ovs_mutex_lock(&pmd->port_mutex);
5023 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5024 struct dp_netdev_port *port = poll->rxq->port;
5025
5026 if (port->need_reconfigure
5027 || !hmap_contains(&dp->ports, &port->node)) {
5028 dp_netdev_del_rxq_from_pmd(pmd, poll);
5029 }
5030 }
5031 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
5032 struct dp_netdev_port *port = tx->port;
5033
5034 if (port->need_reconfigure
5035 || !hmap_contains(&dp->ports, &port->node)) {
5036 dp_netdev_del_port_tx_from_pmd(pmd, tx);
5037 }
5038 }
5039 ovs_mutex_unlock(&pmd->port_mutex);
5040 }
5041
5042 /* Must be called each time a port is added/removed or the cmask changes.
5043 * This creates and destroys pmd threads, reconfigures ports, opens their
5044 * rxqs and assigns all rxqs/txqs to pmd threads. */
5045 static void
5046 reconfigure_datapath(struct dp_netdev *dp)
5047 OVS_REQUIRES(dp->port_mutex)
5048 {
5049 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
5050 struct dp_netdev_pmd_thread *pmd;
5051 struct dp_netdev_port *port;
5052 int wanted_txqs;
5053
5054 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5055
5056 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5057 * on the system and the user configuration. */
5058 reconfigure_pmd_threads(dp);
5059
5060 wanted_txqs = cmap_count(&dp->poll_threads);
5061
5062 /* The number of pmd threads might have changed, or a port can be new:
5063 * adjust the txqs. */
5064 HMAP_FOR_EACH (port, node, &dp->ports) {
5065 netdev_set_tx_multiq(port->netdev, wanted_txqs);
5066 }
5067
5068 /* Step 2: Remove from the pmd threads ports that have been removed or
5069 * need reconfiguration. */
5070
5071 /* Check for all the ports that need reconfiguration. We cache this in
5072 * 'port->need_reconfigure', because netdev_is_reconf_required() can
5073 * change at any time.
5074 * Also mark for reconfiguration all ports which will likely change their
5075 * 'dynamic_txqs' parameter. It's required to stop using them before
5076 * changing this setting and it's simpler to mark ports here and allow
5077 * 'pmd_remove_stale_ports' to remove them from threads. There will be
5078 * no actual reconfiguration in 'port_reconfigure' because it's
5079 * unnecessary. */
5080 HMAP_FOR_EACH (port, node, &dp->ports) {
5081 if (netdev_is_reconf_required(port->netdev)
5082 || (port->dynamic_txqs
5083 != (netdev_n_txq(port->netdev) < wanted_txqs))) {
5084 port->need_reconfigure = true;
5085 }
5086 }
5087
5088 /* Remove from the pmd threads all the ports that have been deleted or
5089 * need reconfiguration. */
5090 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5091 pmd_remove_stale_ports(dp, pmd);
5092 }
5093
5094 /* Reload affected pmd threads. We must wait for the pmd threads before
5095 * reconfiguring the ports, because a port cannot be reconfigured while
5096 * it's being used. */
5097 reload_affected_pmds(dp);
5098
5099 /* Step 3: Reconfigure ports. */
5100
5101 /* We only reconfigure the ports that we determined above, because they're
5102 * not being used by any pmd thread at the moment. If a port fails to
5103 * reconfigure, we remove it from the datapath. */
5104 struct dp_netdev_port *next_port;
5105 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
5106 int err;
5107
5108 if (!port->need_reconfigure) {
5109 continue;
5110 }
5111
5112 err = port_reconfigure(port);
5113 if (err) {
5114 hmap_remove(&dp->ports, &port->node);
5115 seq_change(dp->port_seq);
5116 port_destroy(port);
5117 } else {
5118 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
5119 }
5120 }
5121
5122 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
5123 * for now, we just update the 'pmd' pointer in each rxq to point to the
5124 * wanted thread according to the scheduling policy. */
5125
5126 /* Reset all the pmd threads to non isolated. */
5127 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5128 pmd->isolated = false;
5129 }
5130
5131 /* Reset all the queues to unassigned */
5132 HMAP_FOR_EACH (port, node, &dp->ports) {
5133 for (int i = 0; i < port->n_rxq; i++) {
5134 port->rxqs[i].pmd = NULL;
5135 }
5136 }
5137
5138 /* Add pinned queues and mark pmd threads isolated. */
5139 rxq_scheduling(dp, true);
5140
5141 /* Add non-pinned queues. */
5142 rxq_scheduling(dp, false);
5143
5144 /* Step 5: Remove queues not compliant with new scheduling. */
5145
5146 /* Count all the threads that will have at least one queue to poll. */
5147 HMAP_FOR_EACH (port, node, &dp->ports) {
5148 for (int qid = 0; qid < port->n_rxq; qid++) {
5149 struct dp_netdev_rxq *q = &port->rxqs[qid];
5150
5151 if (q->pmd) {
5152 hmapx_add(&busy_threads, q->pmd);
5153 }
5154 }
5155 }
5156
5157 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5158 struct rxq_poll *poll, *poll_next;
5159
5160 ovs_mutex_lock(&pmd->port_mutex);
5161 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5162 if (poll->rxq->pmd != pmd) {
5163 dp_netdev_del_rxq_from_pmd(pmd, poll);
5164
5165 /* This pmd might sleep after this step if it has no rxq
5166 * remaining. Tell it to busy wait for new assignment if it
5167 * has at least one scheduled queue. */
5168 if (hmap_count(&pmd->poll_list) == 0 &&
5169 hmapx_contains(&busy_threads, pmd)) {
5170 atomic_store_relaxed(&pmd->wait_for_reload, true);
5171 }
5172 }
5173 }
5174 ovs_mutex_unlock(&pmd->port_mutex);
5175 }
5176
5177 hmapx_destroy(&busy_threads);
5178
5179 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5180 * the old queues before re-adding them, otherwise a queue can be polled by
5181 * two threads at the same time. */
5182 reload_affected_pmds(dp);
5183
5184 /* Step 6: Add queues from scheduling, if they're not there already. */
5185 HMAP_FOR_EACH (port, node, &dp->ports) {
5186 if (!netdev_is_pmd(port->netdev)) {
5187 continue;
5188 }
5189
5190 for (int qid = 0; qid < port->n_rxq; qid++) {
5191 struct dp_netdev_rxq *q = &port->rxqs[qid];
5192
5193 if (q->pmd) {
5194 ovs_mutex_lock(&q->pmd->port_mutex);
5195 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5196 ovs_mutex_unlock(&q->pmd->port_mutex);
5197 }
5198 }
5199 }
5200
5201 /* Add every port and bond to the tx port and bond caches of
5202 * every pmd thread, if it's not there already and if this pmd
5203 * has at least one rxq to poll.
5204 */
5205 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5206 ovs_mutex_lock(&pmd->port_mutex);
5207 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5208 struct tx_bond *bond;
5209
5210 HMAP_FOR_EACH (port, node, &dp->ports) {
5211 dp_netdev_add_port_tx_to_pmd(pmd, port);
5212 }
5213
5214 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5215 dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5216 }
5217 }
5218 ovs_mutex_unlock(&pmd->port_mutex);
5219 }
5220
5221 /* Reload affected pmd threads. */
5222 reload_affected_pmds(dp);
5223
5224 /* Check if PMD Auto LB is to be enabled */
5225 set_pmd_auto_lb(dp);
5226 }
5227
5228 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5229 static bool
5230 ports_require_restart(const struct dp_netdev *dp)
5231 OVS_REQUIRES(dp->port_mutex)
5232 {
5233 struct dp_netdev_port *port;
5234
5235 HMAP_FOR_EACH (port, node, &dp->ports) {
5236 if (netdev_is_reconf_required(port->netdev)) {
5237 return true;
5238 }
5239 }
5240
5241 return false;
5242 }
5243
5244 /* Calculates variance in the values stored in array 'a'. 'n' is the number
5245 * of elements in the array to be considered when calculating the variance.
5246 * Usage example: data array 'a' contains the processing load of each pmd and
5247 * 'n' is the number of PMDs. It returns the variance in the processing load
5248 * of the PMDs. */
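/* Worked example (illustrative): a[] = {10, 20, 30} gives mean = 20 and
 * variance = (100 + 0 + 100) / 3 = 66 with the integer arithmetic used
 * here. */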
5249 static uint64_t
5250 variance(uint64_t a[], int n)
5251 {
5252 /* Compute mean (average of elements). */
5253 uint64_t sum = 0;
5254 uint64_t mean = 0;
5255 uint64_t sqDiff = 0;
5256
5257 if (!n) {
5258 return 0;
5259 }
5260
5261 for (int i = 0; i < n; i++) {
5262 sum += a[i];
5263 }
5264
5265 if (sum) {
5266 mean = sum / n;
5267
5268 /* Compute sum squared differences with mean. */
5269 for (int i = 0; i < n; i++) {
5270 sqDiff += (a[i] - mean)*(a[i] - mean);
5271 }
5272 }
5273 return (sqDiff ? (sqDiff / n) : 0);
5274 }
5275
5276
5277 /* Returns the variance in the PMDs usage as part of dry run of rxqs
5278 * assignment to PMDs. */
5279 static bool
5280 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5281 uint32_t num_pmds, uint64_t *predicted_variance)
5282 OVS_REQUIRES(dp->port_mutex)
5283 {
5284 struct dp_netdev_port *port;
5285 struct dp_netdev_pmd_thread *pmd;
5286 struct dp_netdev_rxq **rxqs = NULL;
5287 struct rr_numa *numa = NULL;
5288 struct rr_numa_list rr;
5289 int n_rxqs = 0;
5290 bool ret = false;
5291 uint64_t *pmd_usage;
5292
5293 if (!predicted_variance) {
5294 return ret;
5295 }
5296
5297 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5298
5299 HMAP_FOR_EACH (port, node, &dp->ports) {
5300 if (!netdev_is_pmd(port->netdev)) {
5301 continue;
5302 }
5303
5304 for (int qid = 0; qid < port->n_rxq; qid++) {
5305 struct dp_netdev_rxq *q = &port->rxqs[qid];
5306 uint64_t cycle_hist = 0;
5307
5308 if (q->pmd->isolated) {
5309 continue;
5310 }
5311
5312 if (n_rxqs == 0) {
5313 rxqs = xmalloc(sizeof *rxqs);
5314 } else {
5315 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5316 }
5317
5318 /* Sum the queue intervals and store the cycle history. */
5319 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5320 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5321 }
5322 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5323 cycle_hist);
5324 /* Store the queue. */
5325 rxqs[n_rxqs++] = q;
5326 }
5327 }
5328 if (n_rxqs > 1) {
5329 /* Sort the queues in order of the processing cycles
5330 * they consumed during their last pmd interval. */
5331 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5332 }
5333 rr_numa_list_populate(dp, &rr);
5334
5335 for (int i = 0; i < n_rxqs; i++) {
5336 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5337 numa = rr_numa_list_lookup(&rr, numa_id);
5338 if (!numa) {
5339 /* Abort if cross-NUMA polling would be required. */
5340 VLOG_DBG("PMD auto lb dry run."
5341 " Aborting due to cross-numa polling.");
5342 goto cleanup;
5343 }
5344
5345 pmd = rr_numa_get_pmd(numa, true);
5346 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5347 "to be assigned port \'%s\' rx queue %d "
5348 "(measured processing cycles %"PRIu64").",
5349 pmd->core_id, numa_id,
5350 netdev_rxq_get_name(rxqs[i]->rx),
5351 netdev_rxq_get_queue_id(rxqs[i]->rx),
5352 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5353
5354 for (int id = 0; id < num_pmds; id++) {
5355 if (pmd->core_id == core_list[id]) {
5356 /* Add the processing cycles of rxq to pmd polling it. */
5357 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5358 RXQ_CYCLES_PROC_HIST);
5359 }
5360 }
5361 }
5362
5363 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5364 uint64_t total_cycles = 0;
5365
5366 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5367 continue;
5368 }
5369
5370 /* Get the total pmd cycles for an interval. */
5371 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5372 /* Estimate the cycles to cover all intervals. */
5373 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5374 for (int id = 0; id < num_pmds; id++) {
5375 if (pmd->core_id == core_list[id]) {
5376 if (pmd_usage[id]) {
5377 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5378 }
5379 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5380 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5381 }
5382 }
5383 }
5384 *predicted_variance = variance(pmd_usage, num_pmds);
5385 ret = true;
5386
5387 cleanup:
5388 rr_numa_list_destroy(&rr);
5389 free(rxqs);
5390 free(pmd_usage);
5391 return ret;
5392 }
5393
5394 /* Does the dry run of Rxq assignment to PMDs and returns true if it gives
5395 * better distribution of load on PMDs. */
5396 static bool
5397 pmd_rebalance_dry_run(struct dp_netdev *dp)
5398 OVS_REQUIRES(dp->port_mutex)
5399 {
5400 struct dp_netdev_pmd_thread *pmd;
5401 uint64_t *curr_pmd_usage;
5402
5403 uint64_t curr_variance;
5404 uint64_t new_variance;
5405 uint64_t improvement = 0;
5406 uint32_t num_pmds;
5407 uint32_t *pmd_corelist;
5408 struct rxq_poll *poll;
5409 bool ret;
5410
5411 num_pmds = cmap_count(&dp->poll_threads);
5412
5413 if (num_pmds > 1) {
5414 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5415 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5416 } else {
5417 return false;
5418 }
5419
5420 num_pmds = 0;
5421 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5422 uint64_t total_cycles = 0;
5423 uint64_t total_proc = 0;
5424
5425 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5426 continue;
5427 }
5428
5429 /* Get the total pmd cycles for an interval. */
5430 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5431 /* Estimate the cycles to cover all intervals. */
5432 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5433
5434 ovs_mutex_lock(&pmd->port_mutex);
5435 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5436 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5437 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5438 }
5439 }
5440 ovs_mutex_unlock(&pmd->port_mutex);
5441
5442 if (total_proc) {
5443 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5444 }
5445
5446 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5447 pmd->core_id, curr_pmd_usage[num_pmds]);
5448
5449 if (atomic_count_get(&pmd->pmd_overloaded)) {
5450 atomic_count_set(&pmd->pmd_overloaded, 0);
5451 }
5452
5453 pmd_corelist[num_pmds] = pmd->core_id;
5454 num_pmds++;
5455 }
5456
5457 curr_variance = variance(curr_pmd_usage, num_pmds);
5458 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5459
5460 if (ret) {
5461 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5462 " Predicted PMD variance: %"PRIu64"",
5463 curr_variance, new_variance);
5464
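/* 'improvement' is the relative reduction of the load variance, in percent.
 * Worked example with hypothetical numbers: curr_variance = 400 and
 * new_variance = 260 give ((400 - 260) * 100) / 400 = 35, which is above
 * ALB_ACCEPTABLE_IMPROVEMENT (25), so the dry run would be considered
 * worthwhile. */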
5465 if (new_variance < curr_variance) {
5466 improvement =
5467 ((curr_variance - new_variance) * 100) / curr_variance;
5468 }
5469 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5470 ret = false;
5471 }
5472 }
5473
5474 free(curr_pmd_usage);
5475 free(pmd_corelist);
5476 return ret;
5477 }
5478
5479
5480 /* Return true if needs to revalidate datapath flows. */
5481 static bool
5482 dpif_netdev_run(struct dpif *dpif)
5483 {
5484 struct dp_netdev_port *port;
5485 struct dp_netdev *dp = get_dp_netdev(dpif);
5486 struct dp_netdev_pmd_thread *non_pmd;
5487 uint64_t new_tnl_seq;
5488 bool need_to_flush = true;
5489 bool pmd_rebalance = false;
5490 long long int now = time_msec();
5491 struct dp_netdev_pmd_thread *pmd;
5492
5493 ovs_mutex_lock(&dp->port_mutex);
5494 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5495 if (non_pmd) {
5496 ovs_mutex_lock(&dp->non_pmd_mutex);
5497 HMAP_FOR_EACH (port, node, &dp->ports) {
5498 if (!netdev_is_pmd(port->netdev)) {
5499 int i;
5500
5501 if (port->emc_enabled) {
5502 atomic_read_relaxed(&dp->emc_insert_min,
5503 &non_pmd->ctx.emc_insert_min);
5504 } else {
5505 non_pmd->ctx.emc_insert_min = 0;
5506 }
5507
5508 for (i = 0; i < port->n_rxq; i++) {
5509
5510 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5511 continue;
5512 }
5513
5514 if (dp_netdev_process_rxq_port(non_pmd,
5515 &port->rxqs[i],
5516 port->port_no)) {
5517 need_to_flush = false;
5518 }
5519 }
5520 }
5521 }
5522 if (need_to_flush) {
5523 /* We didn't receive anything in the process loop.
5524 * Check if we need to send something.
5525 * There were no time updates during the current iteration. */
5526 pmd_thread_ctx_time_update(non_pmd);
5527 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5528 }
5529
5530 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5531 ovs_mutex_unlock(&dp->non_pmd_mutex);
5532
5533 dp_netdev_pmd_unref(non_pmd);
5534 }
5535
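/* PMD auto load balance: polled at most once per 'rebalance_intvl'.  A
 * datapath reconfiguration is only requested if at least one PMD stayed
 * overloaded for a whole measurement window, no reconfiguration is already
 * pending, and a dry run predicts a sufficiently lower load variance. */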
5536 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5537 if (pmd_alb->is_enabled) {
5538 if (!pmd_alb->rebalance_poll_timer) {
5539 pmd_alb->rebalance_poll_timer = now;
5540 } else if ((pmd_alb->rebalance_poll_timer +
5541 pmd_alb->rebalance_intvl) < now) {
5542 pmd_alb->rebalance_poll_timer = now;
5543 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5544 if (atomic_count_get(&pmd->pmd_overloaded) >=
5545 PMD_RXQ_INTERVAL_MAX) {
5546 pmd_rebalance = true;
5547 break;
5548 }
5549 }
5550
5551 if (pmd_rebalance &&
5552 !dp_netdev_is_reconf_required(dp) &&
5553 !ports_require_restart(dp) &&
5554 pmd_rebalance_dry_run(dp)) {
5555 VLOG_INFO("PMD auto lb dry run."
5556 " requesting datapath reconfigure.");
5557 dp_netdev_request_reconfigure(dp);
5558 }
5559 }
5560 }
5561
5562 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5563 reconfigure_datapath(dp);
5564 }
5565 ovs_mutex_unlock(&dp->port_mutex);
5566
5567 tnl_neigh_cache_run();
5568 tnl_port_map_run();
5569 new_tnl_seq = seq_read(tnl_conf_seq);
5570
5571 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5572 dp->last_tnl_conf_seq = new_tnl_seq;
5573 return true;
5574 }
5575 return false;
5576 }
5577
5578 static void
5579 dpif_netdev_wait(struct dpif *dpif)
5580 {
5581 struct dp_netdev_port *port;
5582 struct dp_netdev *dp = get_dp_netdev(dpif);
5583
5584 ovs_mutex_lock(&dp_netdev_mutex);
5585 ovs_mutex_lock(&dp->port_mutex);
5586 HMAP_FOR_EACH (port, node, &dp->ports) {
5587 netdev_wait_reconf_required(port->netdev);
5588 if (!netdev_is_pmd(port->netdev)) {
5589 int i;
5590
5591 for (i = 0; i < port->n_rxq; i++) {
5592 netdev_rxq_wait(port->rxqs[i].rx);
5593 }
5594 }
5595 }
5596 ovs_mutex_unlock(&dp->port_mutex);
5597 ovs_mutex_unlock(&dp_netdev_mutex);
5598 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5599 }
5600
5601 static void
5602 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5603 {
5604 struct tx_port *tx_port_cached;
5605
5606 /* Flush all the queued packets. */
5607 dp_netdev_pmd_flush_output_packets(pmd, true);
5608 /* Free all used tx queue ids. */
5609 dpif_netdev_xps_revalidate_pmd(pmd, true);
5610
5611 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5612 free(tx_port_cached);
5613 }
5614 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5615 free(tx_port_cached);
5616 }
5617 }
5618
5619 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5620 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5621 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5622 * one txq. */
5623 static void
5624 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5625 OVS_REQUIRES(pmd->port_mutex)
5626 {
5627 struct tx_port *tx_port, *tx_port_cached;
5628
5629 pmd_free_cached_ports(pmd);
5630 hmap_shrink(&pmd->send_port_cache);
5631 hmap_shrink(&pmd->tnl_port_cache);
5632
5633 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5634 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5635 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5636 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5637 hash_port_no(tx_port_cached->port->port_no));
5638 }
5639
5640 if (netdev_n_txq(tx_port->port->netdev)) {
5641 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5642 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5643 hash_port_no(tx_port_cached->port->port_no));
5644 }
5645 }
5646 }
5647
5648 static void
5649 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5650 {
5651 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5652 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5653 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5654 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5655 }
5656 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5657
5658 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5659 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5660 }
5661
5662 static void
5663 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5664 {
5665 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5666 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5667 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5668 }
5669
5670 static int
5671 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5672 struct polled_queue **ppoll_list)
5673 {
5674 struct polled_queue *poll_list = *ppoll_list;
5675 struct rxq_poll *poll;
5676 int i;
5677
5678 ovs_mutex_lock(&pmd->port_mutex);
5679 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5680 * sizeof *poll_list);
5681
5682 i = 0;
5683 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5684 poll_list[i].rxq = poll->rxq;
5685 poll_list[i].port_no = poll->rxq->port->port_no;
5686 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5687 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5688 poll_list[i].change_seq =
5689 netdev_get_change_seq(poll->rxq->port->netdev);
5690 i++;
5691 }
5692
5693 pmd_load_cached_ports(pmd);
5694
5695 ovs_mutex_unlock(&pmd->port_mutex);
5696
5697 *ppoll_list = poll_list;
5698 return i;
5699 }
5700
5701 static void *
5702 pmd_thread_main(void *f_)
5703 {
5704 struct dp_netdev_pmd_thread *pmd = f_;
5705 struct pmd_perf_stats *s = &pmd->perf_stats;
5706 unsigned int lc = 0;
5707 struct polled_queue *poll_list;
5708 bool wait_for_reload = false;
5709 bool reload_tx_qid;
5710 bool exiting;
5711 bool reload;
5712 int poll_cnt;
5713 int i;
5714 int process_packets = 0;
5715
5716 poll_list = NULL;
5717
5718 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5719 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5720 ovs_numa_thread_setaffinity_core(pmd->core_id);
5721 dpdk_set_lcore_id(pmd->core_id);
5722 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5723 dfc_cache_init(&pmd->flow_cache);
5724 pmd_alloc_static_tx_qid(pmd);
5725
5726 reload:
5727 atomic_count_init(&pmd->pmd_overloaded, 0);
5728
5729 /* List the port/core affinity. */
5730 for (i = 0; i < poll_cnt; i++) {
5731 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5732 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5733 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5734 /* Reset the rxq current cycles counter. */
5735 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5736 }
5737
5738 if (!poll_cnt) {
5739 if (wait_for_reload) {
5740 /* Don't sleep, control thread will ask for a reload shortly. */
5741 do {
5742 atomic_read_explicit(&pmd->reload, &reload,
5743 memory_order_acquire);
5744 } while (!reload);
5745 } else {
5746 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5747 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5748 poll_block();
5749 }
5750 }
5751 }
5752
5753 pmd->intrvl_tsc_prev = 0;
5754 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5755 cycles_counter_update(s);
5756
5757 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5758
5759 /* Protect pmd stats from external clearing while polling. */
5760 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5761 for (;;) {
5762 uint64_t rx_packets = 0, tx_packets = 0;
5763
5764 pmd_perf_start_iteration(s);
5765
5766 for (i = 0; i < poll_cnt; i++) {
5767
5768 if (!poll_list[i].rxq_enabled) {
5769 continue;
5770 }
5771
5772 if (poll_list[i].emc_enabled) {
5773 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5774 &pmd->ctx.emc_insert_min);
5775 } else {
5776 pmd->ctx.emc_insert_min = 0;
5777 }
5778
5779 process_packets =
5780 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5781 poll_list[i].port_no);
5782 rx_packets += process_packets;
5783 }
5784
5785 if (!rx_packets) {
5786 /* We didn't receive anything in the process loop.
5787 * Check if we need to send something.
5788 * There were no time updates during the current iteration. */
5789 pmd_thread_ctx_time_update(pmd);
5790 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5791 }
5792
5793 /* Do RCU synchronization at fixed interval. This ensures that
5794 * synchronization would not be delayed long even at high load of
5795 * packet processing. */
5796 if (pmd->ctx.now > pmd->next_rcu_quiesce) {
5797 if (!ovsrcu_try_quiesce()) {
5798 pmd->next_rcu_quiesce =
5799 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5800 }
5801 }
5802
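/* Low-frequency housekeeping, roughly every 1024 iterations: flush coverage
 * counters, let the dpcls optimize its subtable ranking, try to quiesce
 * (falling back to a partial EMC sweep when quiescing is not possible) and
 * refresh the rxq enabled state of ports whose change_seq has moved. */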
5803 if (lc++ > 1024) {
5804 lc = 0;
5805
5806 coverage_try_clear();
5807 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
5808 if (!ovsrcu_try_quiesce()) {
5809 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
5810 pmd->next_rcu_quiesce =
5811 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5812 }
5813
5814 for (i = 0; i < poll_cnt; i++) {
5815 uint64_t current_seq =
5816 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5817 if (poll_list[i].change_seq != current_seq) {
5818 poll_list[i].change_seq = current_seq;
5819 poll_list[i].rxq_enabled =
5820 netdev_rxq_enabled(poll_list[i].rxq->rx);
5821 }
5822 }
5823 }
5824
5825 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5826 if (OVS_UNLIKELY(reload)) {
5827 break;
5828 }
5829
5830 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5831 pmd_perf_metrics_enabled(pmd));
5832 }
5833 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
5834
5835 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5836 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
5837 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
5838 atomic_read_relaxed(&pmd->exit, &exiting);
5839 /* Signal here to make sure the pmd finishes
5840 * reloading the updated configuration. */
5841 dp_netdev_pmd_reload_done(pmd);
5842
5843 if (reload_tx_qid) {
5844 pmd_free_static_tx_qid(pmd);
5845 pmd_alloc_static_tx_qid(pmd);
5846 }
5847
5848 if (!exiting) {
5849 goto reload;
5850 }
5851
5852 pmd_free_static_tx_qid(pmd);
5853 dfc_cache_uninit(&pmd->flow_cache);
5854 free(poll_list);
5855 pmd_free_cached_ports(pmd);
5856 return NULL;
5857 }
5858
5859 static void
5860 dp_netdev_disable_upcall(struct dp_netdev *dp)
5861 OVS_ACQUIRES(dp->upcall_rwlock)
5862 {
5863 fat_rwlock_wrlock(&dp->upcall_rwlock);
5864 }
5865
5866 \f
5867 /* Meters */
5868 static void
5869 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5870 struct ofputil_meter_features *features)
5871 {
5872 features->max_meters = MAX_METERS;
5873 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5874 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5875 features->max_bands = MAX_BANDS;
5876 features->max_color = 0;
5877 }
5878
5879 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5880 * that exceed a band are dropped in-place. */
5881 static void
5882 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5883 uint32_t meter_id, long long int now)
5884 {
5885 struct dp_meter *meter;
5886 struct dp_meter_band *band;
5887 struct dp_packet *packet;
5888 long long int long_delta_t; /* msec */
5889 uint32_t delta_t; /* msec */
5890 uint32_t delta_in_us; /* usec */
5891 const size_t cnt = dp_packet_batch_size(packets_);
5892 uint32_t bytes, volume;
5893 int exceeded_band[NETDEV_MAX_BURST];
5894 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5895 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5896
5897 if (meter_id >= MAX_METERS) {
5898 return;
5899 }
5900
5901 meter_lock(dp, meter_id);
5902 meter = dp->meters[meter_id];
5903 if (!meter) {
5904 goto out;
5905 }
5906
5907 /* Initialize as negative values. */
5908 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5909 /* Initialize as zeroes. */
5910 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5911
5912 /* All packets will hit the meter at the same time. */
5913 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
5914
5915 if (long_delta_t < 0) {
5916 /* This condition means that several threads are fighting for the
5917 meter lock, and the one that received its packets a bit later won.
5918 Assume that all racing threads received their packets at the same
5919 time to avoid overflow below. */
5920 long_delta_t = 0;
5921 delta_in_us = 0;
5922 } else {
5923 delta_in_us = (now - meter->used) % 1000;
5924 }
5925
5926 /* Make sure delta_t will not be too large, so that bucket will not
5927 * wrap around below. */
5928 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5929 ? meter->max_delta_t : (uint32_t)long_delta_t;
5930
5931 /* Update meter stats. */
5932 meter->used = now;
5933 meter->packet_count += cnt;
5934 bytes = 0;
5935 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5936 bytes += dp_packet_size(packet);
5937 }
5938 meter->byte_count += bytes;
5939
5940 /* Meters can operate in terms of packets per second or kilobits per
5941 * second. */
5942 if (meter->flags & OFPMF13_PKTPS) {
5943 /* Rate in packets/second, bucket 1/1000 packets. */
5944 /* msec * packets/sec = 1/1000 packets. */
5945 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5946 } else {
5947 /* Rate in kbps, bucket in bits. */
5948 /* msec * kbps = bits */
5949 volume = bytes * 8;
5950 }
5951
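/* Worked example with hypothetical numbers (kbps mode): a band rate of
 * 10000 kbps and delta_t = 2 ms credit the bucket with 20000 bits below,
 * while a batch of four 1500-byte packets drains volume = 4 * 1500 * 8 =
 * 48000 bits.  When a bucket cannot cover the whole volume, the band is
 * applied packet by packet further down. */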
5952 /* Update all bands and find the one hit with the highest rate for each
5953 * packet (if any). */
5954 for (int m = 0; m < meter->n_bands; ++m) {
5955 band = &meter->bands[m];
5956
5957 /* Update band's bucket. */
5958 band->bucket += delta_t * band->up.rate;
5959 band->bucket += delta_in_us * band->up.rate / 1000;
5960 if (band->bucket > band->up.burst_size) {
5961 band->bucket = band->up.burst_size;
5962 }
5963
5964 /* Drain the bucket for all the packets, if possible. */
5965 if (band->bucket >= volume) {
5966 band->bucket -= volume;
5967 } else {
5968 int band_exceeded_pkt;
5969
5970 /* Band limit hit, must process packet-by-packet. */
5971 if (meter->flags & OFPMF13_PKTPS) {
5972 band_exceeded_pkt = band->bucket / 1000;
5973 band->bucket %= 1000; /* Remainder stays in bucket. */
5974
5975 /* Update the exceeding band for each exceeding packet.
5976 * (Only one band will be fired by a packet, and that
5977 * can be different for each packet.) */
5978 for (int i = band_exceeded_pkt; i < cnt; i++) {
5979 if (band->up.rate > exceeded_rate[i]) {
5980 exceeded_rate[i] = band->up.rate;
5981 exceeded_band[i] = m;
5982 }
5983 }
5984 } else {
5985 /* Packet sizes differ, must process one-by-one. */
5986 band_exceeded_pkt = cnt;
5987 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5988 uint32_t bits = dp_packet_size(packet) * 8;
5989
5990 if (band->bucket >= bits) {
5991 band->bucket -= bits;
5992 } else {
5993 if (i < band_exceeded_pkt) {
5994 band_exceeded_pkt = i;
5995 }
5996 /* Update the exceeding band for the exceeding packet.
5997 * (Only one band will be fired by a packet, and that
5998 * can be different for each packet.) */
5999 if (band->up.rate > exceeded_rate[i]) {
6000 exceeded_rate[i] = band->up.rate;
6001 exceeded_band[i] = m;
6002 }
6003 }
6004 }
6005 }
6006 /* Remember the first exceeding packet. */
6007 if (exceeded_pkt > band_exceeded_pkt) {
6008 exceeded_pkt = band_exceeded_pkt;
6009 }
6010 }
6011 }
6012
6013 /* Fire the highest rate band exceeded by each packet, and drop
6014 * packets if needed. */
6015 size_t j;
6016 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6017 if (exceeded_band[j] >= 0) {
6018 /* Meter drop packet. */
6019 band = &meter->bands[exceeded_band[j]];
6020 band->packet_count += 1;
6021 band->byte_count += dp_packet_size(packet);
6022 COVERAGE_INC(datapath_drop_meter);
6023 dp_packet_delete(packet);
6024 } else {
6025 /* Meter accepts packet. */
6026 dp_packet_batch_refill(packets_, packet, j);
6027 }
6028 }
6029 out:
6030 meter_unlock(dp, meter_id);
6031 }
6032
6033 /* Meter set/get/del processing is still single-threaded. */
6034 static int
6035 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6036 struct ofputil_meter_config *config)
6037 {
6038 struct dp_netdev *dp = get_dp_netdev(dpif);
6039 uint32_t mid = meter_id.uint32;
6040 struct dp_meter *meter;
6041 int i;
6042
6043 if (mid >= MAX_METERS) {
6044 return EFBIG; /* Meter_id out of range. */
6045 }
6046
6047 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6048 return EBADF; /* Unsupported flags set */
6049 }
6050
6051 if (config->n_bands > MAX_BANDS) {
6052 return EINVAL;
6053 }
6054
6055 for (i = 0; i < config->n_bands; ++i) {
6056 switch (config->bands[i].type) {
6057 case OFPMBT13_DROP:
6058 break;
6059 default:
6060 return ENODEV; /* Unsupported band type */
6061 }
6062 }
6063
6064 /* Allocate meter */
6065 meter = xzalloc(sizeof *meter
6066 + config->n_bands * sizeof(struct dp_meter_band));
6067
6068 meter->flags = config->flags;
6069 meter->n_bands = config->n_bands;
6070 meter->max_delta_t = 0;
6071 meter->used = time_usec();
6072
6073 /* set up bands */
6074 for (i = 0; i < config->n_bands; ++i) {
6075 uint32_t band_max_delta_t;
6076
6077 /* Set burst size to a workable value if none specified. */
6078 if (config->bands[i].burst_size == 0) {
6079 config->bands[i].burst_size = config->bands[i].rate;
6080 }
6081
6082 meter->bands[i].up = config->bands[i];
6083 /* Convert burst size to the bucket units: */
6084 /* pkts => 1/1000 packets, kilobits => bits. */
6085 meter->bands[i].up.burst_size *= 1000;
6086 /* Initialize bucket to empty. */
6087 meter->bands[i].bucket = 0;
6088
6089 /* Figure out max delta_t that is enough to fill any bucket. */
6090 band_max_delta_t
6091 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
6092 if (band_max_delta_t > meter->max_delta_t) {
6093 meter->max_delta_t = band_max_delta_t;
6094 }
6095 }
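/* Worked example with hypothetical numbers: a kbps band configured with
 * rate = 1000 and burst_size = 500 ends up with a 500000-bit bucket limit,
 * so band_max_delta_t = 500000 / 1000 = 500 ms.  'max_delta_t' caps the
 * credit applied in dp_netdev_run_meter() after an idle period. */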
6096
6097 meter_lock(dp, mid);
6098 dp_delete_meter(dp, mid); /* Free existing meter, if any */
6099 dp->meters[mid] = meter;
6100 meter_unlock(dp, mid);
6101
6102 return 0;
6103 }
6104
6105 static int
6106 dpif_netdev_meter_get(const struct dpif *dpif,
6107 ofproto_meter_id meter_id_,
6108 struct ofputil_meter_stats *stats, uint16_t n_bands)
6109 {
6110 const struct dp_netdev *dp = get_dp_netdev(dpif);
6111 uint32_t meter_id = meter_id_.uint32;
6112 int retval = 0;
6113
6114 if (meter_id >= MAX_METERS) {
6115 return EFBIG;
6116 }
6117
6118 meter_lock(dp, meter_id);
6119 const struct dp_meter *meter = dp->meters[meter_id];
6120 if (!meter) {
6121 retval = ENOENT;
6122 goto done;
6123 }
6124 if (stats) {
6125 int i = 0;
6126
6127 stats->packet_in_count = meter->packet_count;
6128 stats->byte_in_count = meter->byte_count;
6129
6130 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6131 stats->bands[i].packet_count = meter->bands[i].packet_count;
6132 stats->bands[i].byte_count = meter->bands[i].byte_count;
6133 }
6134
6135 stats->n_bands = i;
6136 }
6137
6138 done:
6139 meter_unlock(dp, meter_id);
6140 return retval;
6141 }
6142
6143 static int
6144 dpif_netdev_meter_del(struct dpif *dpif,
6145 ofproto_meter_id meter_id_,
6146 struct ofputil_meter_stats *stats, uint16_t n_bands)
6147 {
6148 struct dp_netdev *dp = get_dp_netdev(dpif);
6149 int error;
6150
6151 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6152 if (!error) {
6153 uint32_t meter_id = meter_id_.uint32;
6154
6155 meter_lock(dp, meter_id);
6156 dp_delete_meter(dp, meter_id);
6157 meter_unlock(dp, meter_id);
6158 }
6159 return error;
6160 }
6161
6162 \f
6163 static void
6164 dpif_netdev_disable_upcall(struct dpif *dpif)
6165 OVS_NO_THREAD_SAFETY_ANALYSIS
6166 {
6167 struct dp_netdev *dp = get_dp_netdev(dpif);
6168 dp_netdev_disable_upcall(dp);
6169 }
6170
6171 static void
6172 dp_netdev_enable_upcall(struct dp_netdev *dp)
6173 OVS_RELEASES(dp->upcall_rwlock)
6174 {
6175 fat_rwlock_unlock(&dp->upcall_rwlock);
6176 }
6177
6178 static void
6179 dpif_netdev_enable_upcall(struct dpif *dpif)
6180 OVS_NO_THREAD_SAFETY_ANALYSIS
6181 {
6182 struct dp_netdev *dp = get_dp_netdev(dpif);
6183 dp_netdev_enable_upcall(dp);
6184 }
6185
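/* Marks the currently requested PMD reload as completed.  The release store
 * on 'pmd->reload' pairs with acquire loads of the flag elsewhere, so the
 * cleared flags and the refreshed 'last_reload_seq' are visible before the
 * reload is considered finished. */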
6186 static void
6187 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6188 {
6189 atomic_store_relaxed(&pmd->wait_for_reload, false);
6190 atomic_store_relaxed(&pmd->reload_tx_qid, false);
6191 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6192 atomic_store_explicit(&pmd->reload, false, memory_order_release);
6193 }
6194
6195 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
6196 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
6197 * 'core_id' is NON_PMD_CORE_ID).
6198 *
6199 * Caller must unref the returned reference. */
6200 static struct dp_netdev_pmd_thread *
6201 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6202 {
6203 struct dp_netdev_pmd_thread *pmd;
6204 const struct cmap_node *pnode;
6205
6206 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
6207 if (!pnode) {
6208 return NULL;
6209 }
6210 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6211
6212 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6213 }
6214
6215 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6216 static void
6217 dp_netdev_set_nonpmd(struct dp_netdev *dp)
6218 OVS_REQUIRES(dp->port_mutex)
6219 {
6220 struct dp_netdev_pmd_thread *non_pmd;
6221
6222 non_pmd = xzalloc(sizeof *non_pmd);
6223 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6224 }
6225
6226 /* Caller must have valid pointer to 'pmd'. */
6227 static bool
6228 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6229 {
6230 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6231 }
6232
6233 static void
6234 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6235 {
6236 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6237 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6238 }
6239 }
6240
6241 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
6242 * fails, keeps checking for next node until reaching the end of cmap.
6243 *
6244 * Caller must unref the returned reference. */
6245 static struct dp_netdev_pmd_thread *
6246 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6247 {
6248 struct dp_netdev_pmd_thread *next;
6249
6250 do {
6251 struct cmap_node *node;
6252
6253 node = cmap_next_position(&dp->poll_threads, pos);
6254 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6255 : NULL;
6256 } while (next && !dp_netdev_pmd_try_ref(next));
6257
6258 return next;
6259 }
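/* Illustrative usage sketch (hypothetical caller, not part of this file):
 * visit every PMD thread while holding a reference to each one in turn.
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ...use 'pmd'...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */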
6260
6261 /* Configures the 'pmd' based on the input argument. */
6262 static void
6263 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6264 unsigned core_id, int numa_id)
6265 {
6266 pmd->dp = dp;
6267 pmd->core_id = core_id;
6268 pmd->numa_id = numa_id;
6269 pmd->need_reload = false;
6270 pmd->n_output_batches = 0;
6271
6272 ovs_refcount_init(&pmd->ref_cnt);
6273 atomic_init(&pmd->exit, false);
6274 pmd->reload_seq = seq_create();
6275 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6276 atomic_init(&pmd->reload, false);
6277 ovs_mutex_init(&pmd->flow_mutex);
6278 ovs_mutex_init(&pmd->port_mutex);
6279 ovs_mutex_init(&pmd->bond_mutex);
6280 cmap_init(&pmd->flow_table);
6281 cmap_init(&pmd->classifiers);
6282 pmd->ctx.last_rxq = NULL;
6283 pmd_thread_ctx_time_update(pmd);
6284 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6285 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6286 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6287 hmap_init(&pmd->poll_list);
6288 hmap_init(&pmd->tx_ports);
6289 hmap_init(&pmd->tnl_port_cache);
6290 hmap_init(&pmd->send_port_cache);
6291 cmap_init(&pmd->tx_bonds);
6292 /* Init the 'flow_cache' since there is no
6293 * actual thread created for NON_PMD_CORE_ID. */
6294 if (core_id == NON_PMD_CORE_ID) {
6295 dfc_cache_init(&pmd->flow_cache);
6296 pmd_alloc_static_tx_qid(pmd);
6297 }
6298 pmd_perf_stats_init(&pmd->perf_stats);
6299 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6300 hash_int(core_id, 0));
6301 }
6302
6303 static void
6304 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6305 {
6306 struct dpcls *cls;
6307
6308 dp_netdev_pmd_flow_flush(pmd);
6309 hmap_destroy(&pmd->send_port_cache);
6310 hmap_destroy(&pmd->tnl_port_cache);
6311 hmap_destroy(&pmd->tx_ports);
6312 cmap_destroy(&pmd->tx_bonds);
6313 hmap_destroy(&pmd->poll_list);
6314 /* All flows (including their dpcls_rules) have already been deleted. */
6315 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6316 dpcls_destroy(cls);
6317 ovsrcu_postpone(free, cls);
6318 }
6319 cmap_destroy(&pmd->classifiers);
6320 cmap_destroy(&pmd->flow_table);
6321 ovs_mutex_destroy(&pmd->flow_mutex);
6322 seq_destroy(pmd->reload_seq);
6323 ovs_mutex_destroy(&pmd->port_mutex);
6324 ovs_mutex_destroy(&pmd->bond_mutex);
6325 free(pmd);
6326 }
6327
6328 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6329 * and unrefs the struct. */
6330 static void
6331 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6332 {
6333 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6334 * but extra cleanup is necessary */
6335 if (pmd->core_id == NON_PMD_CORE_ID) {
6336 ovs_mutex_lock(&dp->non_pmd_mutex);
6337 dfc_cache_uninit(&pmd->flow_cache);
6338 pmd_free_cached_ports(pmd);
6339 pmd_free_static_tx_qid(pmd);
6340 ovs_mutex_unlock(&dp->non_pmd_mutex);
6341 } else {
6342 atomic_store_relaxed(&pmd->exit, true);
6343 dp_netdev_reload_pmd__(pmd);
6344 xpthread_join(pmd->thread, NULL);
6345 }
6346
6347 dp_netdev_pmd_clear_ports(pmd);
6348
6349 /* Purges the 'pmd''s flows after stopping the thread, but before
6350 * destroying the flows, so that the flow stats can be collected. */
6351 if (dp->dp_purge_cb) {
6352 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6353 }
6354 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6355 dp_netdev_pmd_unref(pmd);
6356 }
6357
6358 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6359 * thread. */
6360 static void
6361 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6362 {
6363 struct dp_netdev_pmd_thread *pmd;
6364 struct dp_netdev_pmd_thread **pmd_list;
6365 size_t k = 0, n_pmds;
6366
6367 n_pmds = cmap_count(&dp->poll_threads);
6368 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6369
6370 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6371 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6372 continue;
6373 }
6374 /* We cannot call dp_netdev_del_pmd(), since it alters
6375 * 'dp->poll_threads' (while we're iterating it) and it
6376 * might quiesce. */
6377 ovs_assert(k < n_pmds);
6378 pmd_list[k++] = pmd;
6379 }
6380
6381 for (size_t i = 0; i < k; i++) {
6382 dp_netdev_del_pmd(dp, pmd_list[i]);
6383 }
6384 free(pmd_list);
6385 }
6386
6387 /* Deletes all rx queues from pmd->poll_list and all the ports from
6388 * pmd->tx_ports. */
6389 static void
6390 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6391 {
6392 struct rxq_poll *poll;
6393 struct tx_port *port;
6394 struct tx_bond *tx;
6395
6396 ovs_mutex_lock(&pmd->port_mutex);
6397 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6398 free(poll);
6399 }
6400 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6401 free(port);
6402 }
6403 ovs_mutex_unlock(&pmd->port_mutex);
6404
6405 ovs_mutex_lock(&pmd->bond_mutex);
6406 CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6407 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6408 ovsrcu_postpone(free, tx);
6409 }
6410 ovs_mutex_unlock(&pmd->bond_mutex);
6411 }
6412
6413 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6414 static void
6415 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6416 struct dp_netdev_rxq *rxq)
6417 OVS_REQUIRES(pmd->port_mutex)
6418 {
6419 int qid = netdev_rxq_get_queue_id(rxq->rx);
6420 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6421 struct rxq_poll *poll;
6422
6423 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6424 if (poll->rxq == rxq) {
6425 /* 'rxq' is already polled by this thread. Do nothing. */
6426 return;
6427 }
6428 }
6429
6430 poll = xmalloc(sizeof *poll);
6431 poll->rxq = rxq;
6432 hmap_insert(&pmd->poll_list, &poll->node, hash);
6433
6434 pmd->need_reload = true;
6435 }
6436
6437 /* Delete 'poll' from poll_list of PMD thread. */
6438 static void
6439 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6440 struct rxq_poll *poll)
6441 OVS_REQUIRES(pmd->port_mutex)
6442 {
6443 hmap_remove(&pmd->poll_list, &poll->node);
6444 free(poll);
6445
6446 pmd->need_reload = true;
6447 }
6448
6449 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6450 * changes to take effect. */
6451 static void
6452 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6453 struct dp_netdev_port *port)
6454 OVS_REQUIRES(pmd->port_mutex)
6455 {
6456 struct tx_port *tx;
6457
6458 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6459 if (tx) {
6460 /* 'port' is already in this thread's tx cache. Do nothing. */
6461 return;
6462 }
6463
6464 tx = xzalloc(sizeof *tx);
6465
6466 tx->port = port;
6467 tx->qid = -1;
6468 tx->flush_time = 0LL;
6469 dp_packet_batch_init(&tx->output_pkts);
6470
6471 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6472 pmd->need_reload = true;
6473 }
6474
6475 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6476 * changes to take effect. */
6477 static void
6478 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6479 struct tx_port *tx)
6480 OVS_REQUIRES(pmd->port_mutex)
6481 {
6482 hmap_remove(&pmd->tx_ports, &tx->node);
6483 free(tx);
6484 pmd->need_reload = true;
6485 }
6486
6487 /* Add bond to the tx bond cmap of 'pmd'. */
6488 static void
6489 dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6490 struct tx_bond *bond, bool update)
6491 OVS_EXCLUDED(pmd->bond_mutex)
6492 {
6493 struct tx_bond *tx;
6494
6495 ovs_mutex_lock(&pmd->bond_mutex);
6496 tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6497
6498 if (tx && !update) {
6499 /* It's not an update and the entry already exists. Do nothing. */
6500 goto unlock;
6501 }
6502
6503 if (tx) {
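/* Readers traverse 'pmd->tx_bonds' under RCU, so the entry cannot be
 * modified in place: build a new copy, carry over the per-bucket stats
 * and swap it in with cmap_replace(), freeing the old entry only after
 * a grace period. */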
6504 struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6505
6506 /* Copy the stats for each bucket. */
6507 for (int i = 0; i < BOND_BUCKETS; i++) {
6508 uint64_t n_packets, n_bytes;
6509
6510 atomic_read_relaxed(&tx->slave_buckets[i].n_packets, &n_packets);
6511 atomic_read_relaxed(&tx->slave_buckets[i].n_bytes, &n_bytes);
6512 atomic_init(&new_tx->slave_buckets[i].n_packets, n_packets);
6513 atomic_init(&new_tx->slave_buckets[i].n_bytes, n_bytes);
6514 }
6515 cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
6516 hash_bond_id(bond->bond_id));
6517 ovsrcu_postpone(free, tx);
6518 } else {
6519 tx = xmemdup(bond, sizeof *bond);
6520 cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
6521 }
6522 unlock:
6523 ovs_mutex_unlock(&pmd->bond_mutex);
6524 }
6525
6526 /* Delete bond from the tx bond cmap of 'pmd'. */
6527 static void
6528 dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6529 uint32_t bond_id)
6530 OVS_EXCLUDED(pmd->bond_mutex)
6531 {
6532 struct tx_bond *tx;
6533
6534 ovs_mutex_lock(&pmd->bond_mutex);
6535 tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
6536 if (tx) {
6537 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6538 ovsrcu_postpone(free, tx);
6539 }
6540 ovs_mutex_unlock(&pmd->bond_mutex);
6541 }
6542 \f
6543 static char *
6544 dpif_netdev_get_datapath_version(void)
6545 {
6546 return xstrdup("<built-in>");
6547 }
6548
6549 static void
6550 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6551 uint16_t tcp_flags, long long now)
6552 {
6553 uint16_t flags;
6554
6555 atomic_store_relaxed(&netdev_flow->stats.used, now);
6556 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6557 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6558 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6559 flags |= tcp_flags;
6560 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6561 }
6562
6563 static int
6564 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6565 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6566 enum dpif_upcall_type type, const struct nlattr *userdata,
6567 struct ofpbuf *actions, struct ofpbuf *put_actions)
6568 {
6569 struct dp_netdev *dp = pmd->dp;
6570
6571 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6572 return ENODEV;
6573 }
6574
6575 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6576 struct ds ds = DS_EMPTY_INITIALIZER;
6577 char *packet_str;
6578 struct ofpbuf key;
6579 struct odp_flow_key_parms odp_parms = {
6580 .flow = flow,
6581 .mask = wc ? &wc->masks : NULL,
6582 .support = dp_netdev_support,
6583 };
6584
6585 ofpbuf_init(&key, 0);
6586 odp_flow_key_from_flow(&odp_parms, &key);
6587 packet_str = ofp_dp_packet_to_string(packet_);
6588
6589 odp_flow_key_format(key.data, key.size, &ds);
6590
6591 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6592 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6593
6594 ofpbuf_uninit(&key);
6595 free(packet_str);
6596
6597 ds_destroy(&ds);
6598 }
6599
6600 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6601 actions, wc, put_actions, dp->upcall_aux);
6602 }
6603
6604 static inline uint32_t
6605 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6606 const struct miniflow *mf)
6607 {
6608 uint32_t hash;
6609
6610 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6611 hash = dp_packet_get_rss_hash(packet);
6612 } else {
6613 hash = miniflow_hash_5tuple(mf, 0);
6614 dp_packet_set_rss_hash(packet, hash);
6615 }
6616
6617 return hash;
6618 }
6619
6620 static inline uint32_t
6621 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6622 const struct miniflow *mf)
6623 {
6624 uint32_t hash, recirc_depth;
6625
6626 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6627 hash = dp_packet_get_rss_hash(packet);
6628 } else {
6629 hash = miniflow_hash_5tuple(mf, 0);
6630 dp_packet_set_rss_hash(packet, hash);
6631 }
6632
6633 /* The RSS hash must account for the recirculation depth to avoid
6634 * collisions in the exact match cache. */
6635 recirc_depth = *recirc_depth_get_unsafe();
6636 if (OVS_UNLIKELY(recirc_depth)) {
6637 hash = hash_finish(hash, recirc_depth);
6638 }
6639 return hash;
6640 }
6641
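/* Packets that hit the same datapath flow within one input batch are grouped
 * into a 'packet_batch_per_flow' so that the flow's actions are executed once
 * per batch instead of once per packet. */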
6642 struct packet_batch_per_flow {
6643 unsigned int byte_count;
6644 uint16_t tcp_flags;
6645 struct dp_netdev_flow *flow;
6646
6647 struct dp_packet_batch array;
6648 };
6649
6650 static inline void
6651 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6652 struct dp_packet *packet,
6653 uint16_t tcp_flags)
6654 {
6655 batch->byte_count += dp_packet_size(packet);
6656 batch->tcp_flags |= tcp_flags;
6657 dp_packet_batch_add(&batch->array, packet);
6658 }
6659
6660 static inline void
6661 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6662 struct dp_netdev_flow *flow)
6663 {
6664 flow->batch = batch;
6665
6666 batch->flow = flow;
6667 dp_packet_batch_init(&batch->array);
6668 batch->byte_count = 0;
6669 batch->tcp_flags = 0;
6670 }
6671
6672 static inline void
6673 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6674 struct dp_netdev_pmd_thread *pmd)
6675 {
6676 struct dp_netdev_actions *actions;
6677 struct dp_netdev_flow *flow = batch->flow;
6678
6679 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6680 batch->byte_count,
6681 batch->tcp_flags, pmd->ctx.now / 1000);
6682
6683 actions = dp_netdev_flow_get_actions(flow);
6684
6685 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6686 actions->actions, actions->size);
6687 }
6688
6689 static inline void
6690 dp_netdev_queue_batches(struct dp_packet *pkt,
6691 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6692 struct packet_batch_per_flow *batches,
6693 size_t *n_batches)
6694 {
6695 struct packet_batch_per_flow *batch = flow->batch;
6696
6697 if (OVS_UNLIKELY(!batch)) {
6698 batch = &batches[(*n_batches)++];
6699 packet_batch_per_flow_init(batch, flow);
6700 }
6701
6702 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6703 }
6704
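/* Records 'packet' and its matched 'flow' at position 'index' of 'flow_map',
 * so that per-flow batching can later be performed in the original receive
 * order. */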
6705 static inline void
6706 packet_enqueue_to_flow_map(struct dp_packet *packet,
6707 struct dp_netdev_flow *flow,
6708 uint16_t tcp_flags,
6709 struct dp_packet_flow_map *flow_map,
6710 size_t index)
6711 {
6712 struct dp_packet_flow_map *map = &flow_map[index];
6713 map->flow = flow;
6714 map->packet = packet;
6715 map->tcp_flags = tcp_flags;
6716 }
6717
6718 /* SMC lookup function for a batch of packets.
6719 * By batching SMC lookups, we can use prefetching
6720 * to hide memory access latency.
6721 */
6722 static inline void
6723 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6724 struct netdev_flow_key *keys,
6725 struct netdev_flow_key **missed_keys,
6726 struct dp_packet_batch *packets_,
6727 const int cnt,
6728 struct dp_packet_flow_map *flow_map,
6729 uint8_t *index_map)
6730 {
6731 int i;
6732 struct dp_packet *packet;
6733 size_t n_smc_hit = 0, n_missed = 0;
6734 struct dfc_cache *cache = &pmd->flow_cache;
6735 struct smc_cache *smc_cache = &cache->smc_cache;
6736 const struct cmap_node *flow_node;
6737 int recv_idx;
6738 uint16_t tcp_flags;
6739
6740 /* Prefetch buckets for all packets */
6741 for (i = 0; i < cnt; i++) {
6742 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6743 }
6744
6745 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6746 struct dp_netdev_flow *flow = NULL;
6747 flow_node = smc_entry_get(pmd, keys[i].hash);
6748 bool hit = false;
6749 /* Get the original order of this packet in received batch. */
6750 recv_idx = index_map[i];
6751
6752 if (OVS_LIKELY(flow_node != NULL)) {
6753 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6754 /* Since we don't have a per-port megaflow to check the port
6755 * number, we need to verify that the input ports match. */
6756 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6757 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6758 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6759
6760 /* SMC hit and EMC miss; insert the flow into the EMC. */
6761 keys[i].len =
6762 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6763 emc_probabilistic_insert(pmd, &keys[i], flow);
6764 /* Add these packets into the flow map in the same order
6765 * as received.
6766 */
6767 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6768 flow_map, recv_idx);
6769 n_smc_hit++;
6770 hit = true;
6771 break;
6772 }
6773 }
6774 if (hit) {
6775 continue;
6776 }
6777 }
6778
6779 /* SMC missed. Group missed packets together at
6780 * the beginning of the 'packets' array. */
6781 dp_packet_batch_refill(packets_, packet, i);
6782
6783 /* Preserve the order of packet for flow batching. */
6784 index_map[n_missed] = recv_idx;
6785
6786 /* Put missed keys into the pointer array returned to the caller. */
6787 missed_keys[n_missed++] = &keys[i];
6788 }
6789
6790 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6791 }
6792
6793 /* Try to process all ('cnt') of the 'packets' using only the datapath flow cache
6794 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
6795 * miniflow is copied into 'keys' and the packet pointer is moved to the
6796 * beginning of the 'packets' array. The pointers of missed keys are put in the
6797 * missed_keys pointer array for future processing.
6798 *
6799 * The function returns the number of packets that need to be processed in the
6800 * 'packets' array (they have been moved to the beginning of the vector).
6801 *
6802 * For performance reasons a caller may choose not to initialize the metadata
6803 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6804 * is not valid and must be initialized by this function using 'port_no'.
6805 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6806 * will be ignored.
6807 */
6808 static inline size_t
6809 dfc_processing(struct dp_netdev_pmd_thread *pmd,
6810 struct dp_packet_batch *packets_,
6811 struct netdev_flow_key *keys,
6812 struct netdev_flow_key **missed_keys,
6813 struct packet_batch_per_flow batches[], size_t *n_batches,
6814 struct dp_packet_flow_map *flow_map,
6815 size_t *n_flows, uint8_t *index_map,
6816 bool md_is_valid, odp_port_t port_no)
6817 {
6818 struct netdev_flow_key *key = &keys[0];
6819 size_t n_missed = 0, n_emc_hit = 0;
6820 struct dfc_cache *cache = &pmd->flow_cache;
6821 struct dp_packet *packet;
6822 const size_t cnt = dp_packet_batch_size(packets_);
6823 uint32_t cur_min = pmd->ctx.emc_insert_min;
6824 int i;
6825 uint16_t tcp_flags;
6826 bool smc_enable_db;
6827 size_t map_cnt = 0;
6828 bool batch_enable = true;
6829
6830 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
6831 pmd_perf_update_counter(&pmd->perf_stats,
6832 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6833 cnt);
6834
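/* Lookup order for each packet: a flow mark installed by hardware offload
 * (only at recirculation depth 0), then the exact match cache and, for EMC
 * misses, a batched SMC lookup below when enabled; anything still unmatched
 * is left for fast_path_processing(). */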
6835 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6836 struct dp_netdev_flow *flow;
6837 uint32_t mark;
6838
6839 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6840 dp_packet_delete(packet);
6841 COVERAGE_INC(datapath_drop_rx_invalid_packet);
6842 continue;
6843 }
6844
6845 if (i != cnt - 1) {
6846 struct dp_packet **packets = packets_->packets;
6847 /* Prefetch next packet data and metadata. */
6848 OVS_PREFETCH(dp_packet_data(packets[i+1]));
6849 pkt_metadata_prefetch_init(&packets[i+1]->md);
6850 }
6851
6852 if (!md_is_valid) {
6853 pkt_metadata_init(&packet->md, port_no);
6854 }
6855
6856 if ((*recirc_depth_get() == 0) &&
6857 dp_packet_has_flow_mark(packet, &mark)) {
6858 flow = mark_to_flow_find(pmd, mark);
6859 if (OVS_LIKELY(flow)) {
6860 tcp_flags = parse_tcp_flags(packet);
6861 if (OVS_LIKELY(batch_enable)) {
6862 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6863 n_batches);
6864 } else {
6865 /* Flow batching should be performed only after fast-path
6866 * processing is also completed for packets with emc miss
6867 * or else it will result in reordering of packets with
6868 * same datapath flows. */
6869 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6870 flow_map, map_cnt++);
6871 }
6872 continue;
6873 }
6874 }
6875
6876 miniflow_extract(packet, &key->mf);
6877 key->len = 0; /* Not computed yet. */
6878 key->hash =
6879 (md_is_valid == false)
6880 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6881 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6882
6883 /* If EMC is disabled, skip emc_lookup. */
6884 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
6885 if (OVS_LIKELY(flow)) {
6886 tcp_flags = miniflow_get_tcp_flags(&key->mf);
6887 n_emc_hit++;
6888 if (OVS_LIKELY(batch_enable)) {
6889 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6890 n_batches);
6891 } else {
6892 /* Flow batching should be performed only after fast-path
6893 * processing is also completed for packets with emc miss
6894 * or else it will result in reordering of packets with
6895 * same datapath flows. */
6896 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6897 flow_map, map_cnt++);
6898 }
6899 } else {
6900 /* Exact match cache missed. Group missed packets together at
6901 * the beginning of the 'packets' array. */
6902 dp_packet_batch_refill(packets_, packet, i);
6903
6904 /* Preserve the order of packet for flow batching. */
6905 index_map[n_missed] = map_cnt;
6906 flow_map[map_cnt++].flow = NULL;
6907
6908 /* 'key[n_missed]' contains the key of the current packet and it
6909 * will be passed to SMC lookup. The next key should be extracted
6910 * to 'keys[n_missed + 1]'.
6911 * We also maintain a pointer array to the keys that missed both SMC and EMC,
6912 * which will be returned to the caller for future processing. */
6913 missed_keys[n_missed] = key;
6914 key = &keys[++n_missed];
6915
6916 /* Skip batching for subsequent packets to avoid reordering. */
6917 batch_enable = false;
6918 }
6919 }
6920 /* Count of packets which are not flow batched. */
6921 *n_flows = map_cnt;
6922
6923 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6924
6925 if (!smc_enable_db) {
6926 return dp_packet_batch_size(packets_);
6927 }
6928
6929 /* Packets that missed the EMC do a batch lookup in the SMC, if enabled. */
6930 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6931 n_missed, flow_map, index_map);
6932
6933 return dp_packet_batch_size(packets_);
6934 }
6935
6936 static inline int
6937 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6938 struct dp_packet *packet,
6939 const struct netdev_flow_key *key,
6940 struct ofpbuf *actions, struct ofpbuf *put_actions)
6941 {
6942 struct ofpbuf *add_actions;
6943 struct dp_packet_batch b;
6944 struct match match;
6945 ovs_u128 ufid;
6946 int error;
6947 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
6948
6949 match.tun_md.valid = false;
6950 miniflow_expand(&key->mf, &match.flow);
6951 memset(&match.wc, 0, sizeof match.wc);
6952
6953 ofpbuf_clear(actions);
6954 ofpbuf_clear(put_actions);
6955
6956 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
6957 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6958 &ufid, DPIF_UC_MISS, NULL, actions,
6959 put_actions);
6960 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6961 dp_packet_delete(packet);
6962 COVERAGE_INC(datapath_drop_upcall_error);
6963 return error;
6964 }
6965
6966 /* The Netlink encoding of datapath flow keys cannot express
6967 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6968 * tag is interpreted as exact match on the fact that there is no
6969 * VLAN. Unless we refactor a lot of code that translates between
6970 * Netlink and struct flow representations, we have to do the same
6971 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
6972 if (!match.wc.masks.vlans[0].tci) {
6973 match.wc.masks.vlans[0].tci = htons(0xffff);
6974 }
6975
6976 /* We can't allow the packet batching in the next loop to execute
6977 * the actions. Otherwise, if there are any slow path actions,
6978 * we'll send the packet up twice. */
6979 dp_packet_batch_init_packet(&b, packet);
6980 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
6981 actions->data, actions->size);
6982
6983 add_actions = put_actions->size ? put_actions : actions;
6984 if (OVS_LIKELY(error != ENOSPC)) {
6985 struct dp_netdev_flow *netdev_flow;
6986
6987 /* XXX: There's a race window where a flow covering this packet
6988 * could have already been installed since we last did the flow
6989 * lookup before upcall. This could be solved by moving the
6990 * mutex lock outside the loop, but that's an awful long time
6991 * to be locking revalidators out of making flow modifications. */
6992 ovs_mutex_lock(&pmd->flow_mutex);
6993 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
6994 if (OVS_LIKELY(!netdev_flow)) {
6995 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6996 add_actions->data,
6997 add_actions->size);
6998 }
6999 ovs_mutex_unlock(&pmd->flow_mutex);
7000 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
7001 smc_insert(pmd, key, hash);
7002 emc_probabilistic_insert(pmd, key, netdev_flow);
7003 }
7004 if (pmd_perf_metrics_enabled(pmd)) {
7005 /* Update upcall stats. */
7006 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
7007 struct pmd_perf_stats *s = &pmd->perf_stats;
7008 s->current.upcalls++;
7009 s->current.upcall_cycles += cycles;
7010 histogram_add_sample(&s->cycles_per_upcall, cycles);
7011 }
7012 return error;
7013 }
7014
7015 static inline void
7016 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
7017 struct dp_packet_batch *packets_,
7018 struct netdev_flow_key **keys,
7019 struct dp_packet_flow_map *flow_map,
7020 uint8_t *index_map,
7021 odp_port_t in_port)
7022 {
7023 const size_t cnt = dp_packet_batch_size(packets_);
7024 #if !defined(__CHECKER__) && !defined(_WIN32)
7025 const size_t PKT_ARRAY_SIZE = cnt;
7026 #else
7027 /* Sparse or MSVC doesn't like variable length array. */
7028 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7029 #endif
7030 struct dp_packet *packet;
7031 struct dpcls *cls;
7032 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
7033 struct dp_netdev *dp = pmd->dp;
7034 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
7035 int lookup_cnt = 0, add_lookup_cnt;
7036 bool any_miss;
7037
7038 for (size_t i = 0; i < cnt; i++) {
7039 /* Key length is needed in all cases; the hash is computed on demand. */
7040 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
7041 }
7042 /* Get the classifier for the in_port */
7043 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7044 if (OVS_LIKELY(cls)) {
7045 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7046 rules, cnt, &lookup_cnt);
7047 } else {
7048 any_miss = true;
7049 memset(rules, 0, sizeof(rules));
7050 }
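/* Misses go to the slow path under a non-blocking read lock on the upcall
 * callback; if the lock cannot be taken (upcalls are disabled), the missed
 * packets are dropped and counted below. */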
7051 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7052 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7053 struct ofpbuf actions, put_actions;
7054
7055 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7056 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7057
7058 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7059 struct dp_netdev_flow *netdev_flow;
7060
7061 if (OVS_LIKELY(rules[i])) {
7062 continue;
7063 }
7064
7065 /* It's possible that an earlier slow path execution installed
7066 * a rule covering this flow. In this case, it's a lot cheaper
7067 * to catch it here than execute a miss. */
7068 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
7069 &add_lookup_cnt);
7070 if (netdev_flow) {
7071 lookup_cnt += add_lookup_cnt;
7072 rules[i] = &netdev_flow->cr;
7073 continue;
7074 }
7075
7076 int error = handle_packet_upcall(pmd, packet, keys[i],
7077 &actions, &put_actions);
7078
7079 if (OVS_UNLIKELY(error)) {
7080 upcall_fail_cnt++;
7081 } else {
7082 upcall_ok_cnt++;
7083 }
7084 }
7085
7086 ofpbuf_uninit(&actions);
7087 ofpbuf_uninit(&put_actions);
7088 fat_rwlock_unlock(&dp->upcall_rwlock);
7089 } else if (OVS_UNLIKELY(any_miss)) {
7090 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7091 if (OVS_UNLIKELY(!rules[i])) {
7092 dp_packet_delete(packet);
7093 COVERAGE_INC(datapath_drop_lock_error);
7094 upcall_fail_cnt++;
7095 }
7096 }
7097 }
7098
7099 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7100 struct dp_netdev_flow *flow;
7101 /* Get the original order of this packet in received batch. */
7102 int recv_idx = index_map[i];
7103 uint16_t tcp_flags;
7104
7105 if (OVS_UNLIKELY(!rules[i])) {
7106 continue;
7107 }
7108
7109 flow = dp_netdev_flow_cast(rules[i]);
7110 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
7111 smc_insert(pmd, keys[i], hash);
7112
7113 emc_probabilistic_insert(pmd, keys[i], flow);
7114 /* Add these packets into the flow map in the same order
7115 * as received.
7116 */
7117 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7118 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7119 flow_map, recv_idx);
7120 }
7121
7122 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7123 cnt - upcall_ok_cnt - upcall_fail_cnt);
7124 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7125 lookup_cnt);
7126 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7127 upcall_ok_cnt);
7128 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7129 upcall_fail_cnt);
7130 }
7131
7132 /* Packets enter the datapath from a port (or from recirculation) here.
7133 *
7134 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
7135 * When false the metadata in 'packets' need to be initialized. */
7136 static void
7137 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
7138 struct dp_packet_batch *packets,
7139 bool md_is_valid, odp_port_t port_no)
7140 {
7141 #if !defined(__CHECKER__) && !defined(_WIN32)
7142 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
7143 #else
7144 /* Sparse or MSVC doesn't like variable length array. */
7145 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7146 #endif
7147 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7148 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
7149 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
7150 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
7151 size_t n_batches;
7152 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7153 uint8_t index_map[PKT_ARRAY_SIZE];
7154 size_t n_flows, i;
7155
7156 odp_port_t in_port;
7157
7158 n_batches = 0;
7159 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
7160 flow_map, &n_flows, index_map, md_is_valid, port_no);
7161
7162 if (!dp_packet_batch_is_empty(packets)) {
7163 /* Get ingress port from first packet's metadata. */
7164 in_port = packets->packets[0]->md.in_port.odp_port;
7165 fast_path_processing(pmd, packets, missed_keys,
7166 flow_map, index_map, in_port);
7167 }
7168
7169 /* Batch rest of packets which are in flow map. */
7170 for (i = 0; i < n_flows; i++) {
7171 struct dp_packet_flow_map *map = &flow_map[i];
7172
7173 if (OVS_UNLIKELY(!map->flow)) {
7174 continue;
7175 }
7176 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7177 batches, &n_batches);
7178 }
7179
7180 /* All the flow batches need to be reset before any call to
7181 * packet_batch_per_flow_execute() as it could potentially trigger
7182 * recirculation. When a packet matching flow ‘j’ happens to be
7183 * recirculated, the nested call to dp_netdev_input__() could potentially
7184 * classify the packet as matching another flow - say 'k'. It could happen
7185 * that in the previous call to dp_netdev_input__() that same flow 'k' had
7186 * already its own batches[k] still waiting to be served. So if its
7187 * ‘batch’ member is not reset, the recirculated packet would be wrongly
7188 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
7189 for (i = 0; i < n_batches; i++) {
7190 batches[i].flow->batch = NULL;
7191 }
7192
7193 for (i = 0; i < n_batches; i++) {
7194 packet_batch_per_flow_execute(&batches[i], pmd);
7195 }
7196 }
7197
7198 static void
7199 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
7200 struct dp_packet_batch *packets,
7201 odp_port_t port_no)
7202 {
7203 dp_netdev_input__(pmd, packets, false, port_no);
7204 }
7205
7206 static void
7207 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
7208 struct dp_packet_batch *packets)
7209 {
7210 dp_netdev_input__(pmd, packets, true, 0);
7211 }
7212
7213 struct dp_netdev_execute_aux {
7214 struct dp_netdev_pmd_thread *pmd;
7215 const struct flow *flow;
7216 };
7217
7218 static void
7219 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7220 void *aux)
7221 {
7222 struct dp_netdev *dp = get_dp_netdev(dpif);
7223 dp->dp_purge_aux = aux;
7224 dp->dp_purge_cb = cb;
7225 }
7226
7227 static void
7228 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7229 void *aux)
7230 {
7231 struct dp_netdev *dp = get_dp_netdev(dpif);
7232 dp->upcall_aux = aux;
7233 dp->upcall_cb = cb;
7234 }
7235
7236 static void
7237 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
7238 bool purge)
7239 {
7240 struct tx_port *tx;
7241 struct dp_netdev_port *port;
7242 long long interval;
7243
7244 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
7245 if (!tx->port->dynamic_txqs) {
7246 continue;
7247 }
7248 interval = pmd->ctx.now - tx->last_used;
7249 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
7250 port = tx->port;
7251 ovs_mutex_lock(&port->txq_used_mutex);
7252 port->txq_used[tx->qid]--;
7253 ovs_mutex_unlock(&port->txq_used_mutex);
7254 tx->qid = -1;
7255 }
7256 }
7257 }
7258
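/* Returns the tx queue id this PMD should use for 'tx'.  A cached id is kept
 * while it has been used within XPS_TIMEOUT; otherwise the least used queue
 * of the port is selected under 'txq_used_mutex' and the per-queue usage
 * counters are updated accordingly. */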
7259 static int
7260 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7261 struct tx_port *tx)
7262 {
7263 struct dp_netdev_port *port;
7264 long long interval;
7265 int i, min_cnt, min_qid;
7266
7267 interval = pmd->ctx.now - tx->last_used;
7268 tx->last_used = pmd->ctx.now;
7269
7270 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7271 return tx->qid;
7272 }
7273
7274 port = tx->port;
7275
7276 ovs_mutex_lock(&port->txq_used_mutex);
7277 if (tx->qid >= 0) {
7278 port->txq_used[tx->qid]--;
7279 tx->qid = -1;
7280 }
7281
7282 min_cnt = -1;
7283 min_qid = 0;
7284 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7285 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7286 min_cnt = port->txq_used[i];
7287 min_qid = i;
7288 }
7289 }
7290
7291 port->txq_used[min_qid]++;
7292 tx->qid = min_qid;
7293
7294 ovs_mutex_unlock(&port->txq_used_mutex);
7295
7296 dpif_netdev_xps_revalidate_pmd(pmd, false);
7297
7298 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7299 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7300 return min_qid;
7301 }
7302
7303 static struct tx_port *
7304 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7305 odp_port_t port_no)
7306 {
7307 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7308 }
7309
7310 static struct tx_port *
7311 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7312 odp_port_t port_no)
7313 {
7314 return tx_port_lookup(&pmd->send_port_cache, port_no);
7315 }
7316
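/* Pushes the tunnel header described by the OVS_ACTION_ATTR_TUNNEL_PUSH
 * payload in 'attr' onto every packet in 'batch'.  On failure the batch is
 * freed and a nonzero error is returned; on success the modified packets
 * remain in 'batch' and 0 is returned. */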
7317 static int
7318 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7319 const struct nlattr *attr,
7320 struct dp_packet_batch *batch)
7321 {
7322 struct tx_port *tun_port;
7323 const struct ovs_action_push_tnl *data;
7324 int err;
7325
7326 data = nl_attr_get(attr);
7327
7328 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7329 if (!tun_port) {
7330 err = -EINVAL;
7331 goto error;
7332 }
7333 err = netdev_push_header(tun_port->port->netdev, batch, data);
7334 if (!err) {
7335 return 0;
7336 }
7337 error:
7338 dp_packet_delete_batch(batch, true);
7339 return err;
7340 }
7341
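/* Performs a DPIF_UC_ACTION upcall for 'packet' through the registered
 * upcall callback and executes the returned actions on it.  If the upcall
 * fails and 'should_steal' is true, the packet is dropped and counted. */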
7342 static void
7343 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7344 struct dp_packet *packet, bool should_steal,
7345 struct flow *flow, ovs_u128 *ufid,
7346 struct ofpbuf *actions,
7347 const struct nlattr *userdata)
7348 {
7349 struct dp_packet_batch b;
7350 int error;
7351
7352 ofpbuf_clear(actions);
7353
7354 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7355 DPIF_UC_ACTION, userdata, actions,
7356 NULL);
7357 if (!error || error == ENOSPC) {
7358 dp_packet_batch_init_packet(&b, packet);
7359 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7360 actions->data, actions->size);
7361 } else if (should_steal) {
7362 dp_packet_delete(packet);
7363 COVERAGE_INC(datapath_drop_userspace_action_error);
7364 }
7365 }
7366
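/* Queues the packets in 'packets_' on output port 'port_no'.  Packets are
 * appended to the port's pending output batch and sent later by
 * dp_netdev_pmd_flush_output_on_port(); the batch is flushed early if it
 * would exceed NETDEV_MAX_BURST.  Returns false if the port is unknown, in
 * which case the packets are dropped and counted. */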
7367 static bool
7368 dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7369 struct dp_packet_batch *packets_,
7370 bool should_steal, odp_port_t port_no)
7371 {
7372 struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7373 struct dp_packet_batch out;
7374
7375 if (!OVS_LIKELY(p)) {
7376 COVERAGE_ADD(datapath_drop_invalid_port,
7377 dp_packet_batch_size(packets_));
7378 dp_packet_delete_batch(packets_, should_steal);
7379 return false;
7380 }
7381 if (!should_steal) {
7382 dp_packet_batch_clone(&out, packets_);
7383 dp_packet_batch_reset_cutlen(packets_);
7384 packets_ = &out;
7385 }
7386 dp_packet_batch_apply_cutlen(packets_);
7387 #ifdef DPDK_NETDEV
7388 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7389 && packets_->packets[0]->source
7390 != p->output_pkts.packets[0]->source)) {
7391 /* XXX: netdev-dpdk assumes that all packets in a single
7392 * output batch have the same source. Flush here to
7393 * avoid memory access issues. */
7394 dp_netdev_pmd_flush_output_on_port(pmd, p);
7395 }
7396 #endif
7397 if (dp_packet_batch_size(&p->output_pkts)
7398 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7399 /* Flush here to avoid overflow. */
7400 dp_netdev_pmd_flush_output_on_port(pmd, p);
7401 }
7402 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7403 pmd->n_output_batches++;
7404 }
7405
7406 struct dp_packet *packet;
7407 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7408 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7409 pmd->ctx.last_rxq;
7410 dp_packet_batch_add(&p->output_pkts, packet);
7411 }
7412 return true;
7413 }
7414
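/* Implements OVS_ACTION_ATTR_LB_OUTPUT: for each packet, selects a bond
 * member bucket from its RSS hash, outputs the packet to that member via
 * dp_execute_output_action() and updates the member's packet and byte
 * counters. */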
7415 static void
7416 dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7417 struct dp_packet_batch *packets_,
7418 bool should_steal, uint32_t bond)
7419 {
7420 struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7421 struct dp_packet_batch out;
7422 struct dp_packet *packet;
7423
7424 if (!p_bond) {
7425 COVERAGE_ADD(datapath_drop_invalid_bond,
7426 dp_packet_batch_size(packets_));
7427 dp_packet_delete_batch(packets_, should_steal);
7428 return;
7429 }
7430 if (!should_steal) {
7431 dp_packet_batch_clone(&out, packets_);
7432 dp_packet_batch_reset_cutlen(packets_);
7433 packets_ = &out;
7434 }
7435 dp_packet_batch_apply_cutlen(packets_);
7436
7437 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7438 /*
7439 * Look up the bond hash table using the packet hash to select the slave.
7440 */
7441 uint32_t hash = dp_packet_get_rss_hash(packet);
7442 struct slave_entry *s_entry = &p_bond->slave_buckets[hash & BOND_MASK];
7443 odp_port_t bond_member = s_entry->slave_id;
7444 uint32_t size = dp_packet_size(packet);
7445 struct dp_packet_batch output_pkt;
7446
7447 dp_packet_batch_init_packet(&output_pkt, packet);
7448 if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
7449 bond_member))) {
7450 /* Update slave stats. */
7451 non_atomic_ullong_add(&s_entry->n_packets, 1);
7452 non_atomic_ullong_add(&s_entry->n_bytes, size);
7453 }
7454 }
7455 }
7456
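/* Callback passed to odp_execute_actions() for the action types that the
 * generic odp-execute code cannot handle itself: output, lb-output, tunnel
 * push/pop, userspace, recirc, conntrack and meter.  A batch that is not
 * consumed by the matching case is freed at the end according to
 * 'should_steal'. */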
7457 static void
7458 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7459 const struct nlattr *a, bool should_steal)
7460 OVS_NO_THREAD_SAFETY_ANALYSIS
7461 {
7462 struct dp_netdev_execute_aux *aux = aux_;
7463 uint32_t *depth = recirc_depth_get();
7464 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7465 struct dp_netdev *dp = pmd->dp;
7466 int type = nl_attr_type(a);
7467 struct tx_port *p;
7468 uint32_t packet_count, packets_dropped;
7469
7470 switch ((enum ovs_action_attr)type) {
7471 case OVS_ACTION_ATTR_OUTPUT:
7472 dp_execute_output_action(pmd, packets_, should_steal,
7473 nl_attr_get_odp_port(a));
7474 return;
7475
7476 case OVS_ACTION_ATTR_LB_OUTPUT:
7477 dp_execute_lb_output_action(pmd, packets_, should_steal,
7478 nl_attr_get_u32(a));
7479 return;
7480
7481 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7482 if (should_steal) {
7483 /* We're requested to push a tunnel header, but we also need to take
7484 * ownership of these packets. Thus, we can skip performing the
7485 * action, because the caller will not use the result anyway.
7486 * Just break to free the batch. */
7487 break;
7488 }
7489 dp_packet_batch_apply_cutlen(packets_);
7490 packet_count = dp_packet_batch_size(packets_);
7491 if (push_tnl_action(pmd, a, packets_)) {
7492 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7493 packet_count);
7494 }
7495 return;
7496
7497 case OVS_ACTION_ATTR_TUNNEL_POP:
7498 if (*depth < MAX_RECIRC_DEPTH) {
7499 struct dp_packet_batch *orig_packets_ = packets_;
7500 odp_port_t portno = nl_attr_get_odp_port(a);
7501
7502 p = pmd_tnl_port_cache_lookup(pmd, portno);
7503 if (p) {
7504 struct dp_packet_batch tnl_pkt;
7505
7506 if (!should_steal) {
7507 dp_packet_batch_clone(&tnl_pkt, packets_);
7508 packets_ = &tnl_pkt;
7509 dp_packet_batch_reset_cutlen(orig_packets_);
7510 }
7511
7512 dp_packet_batch_apply_cutlen(packets_);
7513
7514 packet_count = dp_packet_batch_size(packets_);
7515 netdev_pop_header(p->port->netdev, packets_);
7516 packets_dropped =
7517 packet_count - dp_packet_batch_size(packets_);
7518 if (packets_dropped) {
7519 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7520 packets_dropped);
7521 }
7522 if (dp_packet_batch_is_empty(packets_)) {
7523 return;
7524 }
7525
7526 struct dp_packet *packet;
7527 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7528 packet->md.in_port.odp_port = portno;
7529 }
7530
7531 (*depth)++;
7532 dp_netdev_recirculate(pmd, packets_);
7533 (*depth)--;
7534 return;
7535 }
7536 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7537 dp_packet_batch_size(packets_));
7538 } else {
7539 COVERAGE_ADD(datapath_drop_recirc_error,
7540 dp_packet_batch_size(packets_));
7541 }
7542 break;
7543
7544 case OVS_ACTION_ATTR_USERSPACE:
7545 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7546 struct dp_packet_batch *orig_packets_ = packets_;
7547 const struct nlattr *userdata;
7548 struct dp_packet_batch usr_pkt;
7549 struct ofpbuf actions;
7550 struct flow flow;
7551 ovs_u128 ufid;
7552 bool clone = false;
7553
7554 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7555 ofpbuf_init(&actions, 0);
7556
7557 if (packets_->trunc) {
7558 if (!should_steal) {
7559 dp_packet_batch_clone(&usr_pkt, packets_);
7560 packets_ = &usr_pkt;
7561 clone = true;
7562 dp_packet_batch_reset_cutlen(orig_packets_);
7563 }
7564
7565 dp_packet_batch_apply_cutlen(packets_);
7566 }
7567
7568 struct dp_packet *packet;
7569 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7570 flow_extract(packet, &flow);
7571 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7572 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7573 &ufid, &actions, userdata);
7574 }
7575
7576 if (clone) {
7577 dp_packet_delete_batch(packets_, true);
7578 }
7579
7580 ofpbuf_uninit(&actions);
7581 fat_rwlock_unlock(&dp->upcall_rwlock);
7582
7583 return;
7584 }
7585 COVERAGE_ADD(datapath_drop_lock_error,
7586 dp_packet_batch_size(packets_));
7587 break;
7588
7589 case OVS_ACTION_ATTR_RECIRC:
7590 if (*depth < MAX_RECIRC_DEPTH) {
7591 struct dp_packet_batch recirc_pkts;
7592
7593 if (!should_steal) {
7594 dp_packet_batch_clone(&recirc_pkts, packets_);
7595 packets_ = &recirc_pkts;
7596 }
7597
7598 struct dp_packet *packet;
7599 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7600 packet->md.recirc_id = nl_attr_get_u32(a);
7601 }
7602
7603 (*depth)++;
7604 dp_netdev_recirculate(pmd, packets_);
7605 (*depth)--;
7606
7607 return;
7608 }
7609
7610 COVERAGE_ADD(datapath_drop_recirc_error,
7611 dp_packet_batch_size(packets_));
7612 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7613 break;
7614
7615 case OVS_ACTION_ATTR_CT: {
7616 const struct nlattr *b;
7617 bool force = false;
7618 bool commit = false;
7619 unsigned int left;
7620 uint16_t zone = 0;
7621 uint32_t tp_id = 0;
7622 const char *helper = NULL;
7623 const uint32_t *setmark = NULL;
7624 const struct ovs_key_ct_labels *setlabel = NULL;
7625 struct nat_action_info_t nat_action_info;
7626 struct nat_action_info_t *nat_action_info_ref = NULL;
7627 bool nat_config = false;
7628
7629 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7630 nl_attr_get_size(a)) {
7631 enum ovs_ct_attr sub_type = nl_attr_type(b);
7632
7633 switch(sub_type) {
7634 case OVS_CT_ATTR_FORCE_COMMIT:
7635 force = true;
7636 /* fall through. */
7637 case OVS_CT_ATTR_COMMIT:
7638 commit = true;
7639 break;
7640 case OVS_CT_ATTR_ZONE:
7641 zone = nl_attr_get_u16(b);
7642 break;
7643 case OVS_CT_ATTR_HELPER:
7644 helper = nl_attr_get_string(b);
7645 break;
7646 case OVS_CT_ATTR_MARK:
7647 setmark = nl_attr_get(b);
7648 break;
7649 case OVS_CT_ATTR_LABELS:
7650 setlabel = nl_attr_get(b);
7651 break;
7652 case OVS_CT_ATTR_EVENTMASK:
7653 /* Silently ignored, as userspace datapath does not generate
7654 * netlink events. */
7655 break;
7656 case OVS_CT_ATTR_TIMEOUT:
7657 if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
7658 VLOG_WARN("Invalid Timeout Policy ID: %s.",
7659 nl_attr_get_string(b));
7660 tp_id = DEFAULT_TP_ID;
7661 }
7662 break;
7663 case OVS_CT_ATTR_NAT: {
7664 const struct nlattr *b_nest;
7665 unsigned int left_nest;
7666 bool ip_min_specified = false;
7667 bool proto_num_min_specified = false;
7668 bool ip_max_specified = false;
7669 bool proto_num_max_specified = false;
7670 memset(&nat_action_info, 0, sizeof nat_action_info);
7671 nat_action_info_ref = &nat_action_info;
7672
7673 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7674 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7675
7676 switch (sub_type_nest) {
7677 case OVS_NAT_ATTR_SRC:
7678 case OVS_NAT_ATTR_DST:
7679 nat_config = true;
7680 nat_action_info.nat_action |=
7681 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7682 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7683 break;
7684 case OVS_NAT_ATTR_IP_MIN:
7685 memcpy(&nat_action_info.min_addr,
7686 nl_attr_get(b_nest),
7687 nl_attr_get_size(b_nest));
7688 ip_min_specified = true;
7689 break;
7690 case OVS_NAT_ATTR_IP_MAX:
7691 memcpy(&nat_action_info.max_addr,
7692 nl_attr_get(b_nest),
7693 nl_attr_get_size(b_nest));
7694 ip_max_specified = true;
7695 break;
7696 case OVS_NAT_ATTR_PROTO_MIN:
7697 nat_action_info.min_port =
7698 nl_attr_get_u16(b_nest);
7699 proto_num_min_specified = true;
7700 break;
7701 case OVS_NAT_ATTR_PROTO_MAX:
7702 nat_action_info.max_port =
7703 nl_attr_get_u16(b_nest);
7704 proto_num_max_specified = true;
7705 break;
7706 case OVS_NAT_ATTR_PERSISTENT:
7707 case OVS_NAT_ATTR_PROTO_HASH:
7708 case OVS_NAT_ATTR_PROTO_RANDOM:
7709 break;
7710 case OVS_NAT_ATTR_UNSPEC:
7711 case __OVS_NAT_ATTR_MAX:
7712 OVS_NOT_REACHED();
7713 }
7714 }
7715
7716 if (ip_min_specified && !ip_max_specified) {
7717 nat_action_info.max_addr = nat_action_info.min_addr;
7718 }
7719 if (proto_num_min_specified && !proto_num_max_specified) {
7720 nat_action_info.max_port = nat_action_info.min_port;
7721 }
7722 if (proto_num_min_specified || proto_num_max_specified) {
7723 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7724 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7725 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7726 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7727 }
7728 }
7729 break;
7730 }
7731 case OVS_CT_ATTR_UNSPEC:
7732 case __OVS_CT_ATTR_MAX:
7733 OVS_NOT_REACHED();
7734 }
7735 }
7736
7737 /* We won't be able to function properly in this case, hence
7738 * complain loudly. */
7739 if (nat_config && !commit) {
7740 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7741 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7742 }
7743
7744 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7745 commit, zone, setmark, setlabel, aux->flow->tp_src,
7746 aux->flow->tp_dst, helper, nat_action_info_ref,
7747 pmd->ctx.now / 1000, tp_id);
7748 break;
7749 }
7750
7751 case OVS_ACTION_ATTR_METER:
7752 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7753 pmd->ctx.now);
7754 break;
7755
7756 case OVS_ACTION_ATTR_PUSH_VLAN:
7757 case OVS_ACTION_ATTR_POP_VLAN:
7758 case OVS_ACTION_ATTR_PUSH_MPLS:
7759 case OVS_ACTION_ATTR_POP_MPLS:
7760 case OVS_ACTION_ATTR_SET:
7761 case OVS_ACTION_ATTR_SET_MASKED:
7762 case OVS_ACTION_ATTR_SAMPLE:
7763 case OVS_ACTION_ATTR_HASH:
7764 case OVS_ACTION_ATTR_UNSPEC:
7765 case OVS_ACTION_ATTR_TRUNC:
7766 case OVS_ACTION_ATTR_PUSH_ETH:
7767 case OVS_ACTION_ATTR_POP_ETH:
7768 case OVS_ACTION_ATTR_CLONE:
7769 case OVS_ACTION_ATTR_PUSH_NSH:
7770 case OVS_ACTION_ATTR_POP_NSH:
7771 case OVS_ACTION_ATTR_CT_CLEAR:
7772 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7773 case OVS_ACTION_ATTR_DROP:
7774 case __OVS_ACTION_ATTR_MAX:
7775 OVS_NOT_REACHED();
7776 }
7777
7778 dp_packet_delete_batch(packets_, should_steal);
7779 }
7780
7781 static void
7782 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7783 struct dp_packet_batch *packets,
7784 bool should_steal, const struct flow *flow,
7785 const struct nlattr *actions, size_t actions_len)
7786 {
7787 struct dp_netdev_execute_aux aux = { pmd, flow };
7788
7789 odp_execute_actions(&aux, packets, should_steal, actions,
7790 actions_len, dp_execute_cb);
7791 }
7792
7793 struct dp_netdev_ct_dump {
7794 struct ct_dpif_dump_state up;
7795 struct conntrack_dump dump;
7796 struct conntrack *ct;
7797 struct dp_netdev *dp;
7798 };
7799
7800 static int
7801 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
7802 const uint16_t *pzone, int *ptot_bkts)
7803 {
7804 struct dp_netdev *dp = get_dp_netdev(dpif);
7805 struct dp_netdev_ct_dump *dump;
7806
7807 dump = xzalloc(sizeof *dump);
7808 dump->dp = dp;
7809 dump->ct = dp->conntrack;
7810
7811 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
7812
7813 *dump_ = &dump->up;
7814
7815 return 0;
7816 }
7817
7818 static int
7819 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7820 struct ct_dpif_dump_state *dump_,
7821 struct ct_dpif_entry *entry)
7822 {
7823 struct dp_netdev_ct_dump *dump;
7824
7825 INIT_CONTAINER(dump, dump_, up);
7826
7827 return conntrack_dump_next(&dump->dump, entry);
7828 }
7829
7830 static int
7831 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7832 struct ct_dpif_dump_state *dump_)
7833 {
7834 struct dp_netdev_ct_dump *dump;
7835 int err;
7836
7837 INIT_CONTAINER(dump, dump_, up);
7838
7839 err = conntrack_dump_done(&dump->dump);
7840
7841 free(dump);
7842
7843 return err;
7844 }
7845
7846 static int
7847 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7848 const struct ct_dpif_tuple *tuple)
7849 {
7850 struct dp_netdev *dp = get_dp_netdev(dpif);
7851
7852 if (tuple) {
7853 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
7854 }
7855 return conntrack_flush(dp->conntrack, zone);
7856 }
7857
7858 static int
7859 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7860 {
7861 struct dp_netdev *dp = get_dp_netdev(dpif);
7862
7863 return conntrack_set_maxconns(dp->conntrack, maxconns);
7864 }
7865
7866 static int
7867 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7868 {
7869 struct dp_netdev *dp = get_dp_netdev(dpif);
7870
7871 return conntrack_get_maxconns(dp->conntrack, maxconns);
7872 }
7873
7874 static int
7875 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7876 {
7877 struct dp_netdev *dp = get_dp_netdev(dpif);
7878
7879 return conntrack_get_nconns(dp->conntrack, nconns);
7880 }
7881
7882 static int
7883 dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
7884 {
7885 struct dp_netdev *dp = get_dp_netdev(dpif);
7886
7887 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
7888 }
7889
7890 static int
7891 dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
7892 {
7893 struct dp_netdev *dp = get_dp_netdev(dpif);
7894 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
7895 return 0;
7896 }
7897
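/* Applies connection tracking zone limits: 'default_limits', if nonnull,
 * updates the default zone, and each entry in 'zone_limits' updates its own
 * zone.  Stops at the first error. */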
7898 static int
7899 dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
7900 const uint32_t *default_limits,
7901 const struct ovs_list *zone_limits)
7902 {
7903 int err = 0;
7904 struct dp_netdev *dp = get_dp_netdev(dpif);
7905 if (default_limits) {
7906 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
7907 if (err != 0) {
7908 return err;
7909 }
7910 }
7911
7912 struct ct_dpif_zone_limit *zone_limit;
7913 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7914 err = zone_limit_update(dp->conntrack, zone_limit->zone,
7915 zone_limit->limit);
7916 if (err != 0) {
7917 break;
7918 }
7919 }
7920 return err;
7921 }
7922
7923 static int
7924 dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
7925 uint32_t *default_limit,
7926 const struct ovs_list *zone_limits_request,
7927 struct ovs_list *zone_limits_reply)
7928 {
7929 struct dp_netdev *dp = get_dp_netdev(dpif);
7930 struct conntrack_zone_limit czl;
7931
7932 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
7933 if (czl.zone == DEFAULT_ZONE) {
7934 *default_limit = czl.limit;
7935 } else {
7936 return EINVAL;
7937 }
7938
7939 if (!ovs_list_is_empty(zone_limits_request)) {
7940 struct ct_dpif_zone_limit *zone_limit;
7941 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
7942 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
7943 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
7944 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
7945 czl.limit, czl.count);
7946 } else {
7947 return EINVAL;
7948 }
7949 }
7950 } else {
7951 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
7952 czl = zone_limit_get(dp->conntrack, z);
7953 if (czl.zone == z) {
7954 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
7955 czl.count);
7956 }
7957 }
7958 }
7959
7960 return 0;
7961 }
7962
7963 static int
7964 dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
7965 const struct ovs_list *zone_limits)
7966 {
7967 int err = 0;
7968 struct dp_netdev *dp = get_dp_netdev(dpif);
7969 struct ct_dpif_zone_limit *zone_limit;
7970 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7971 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
7972 if (err != 0) {
7973 break;
7974 }
7975 }
7976
7977 return err;
7978 }
7979
7980 static int
7981 dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
7982 const struct ct_dpif_timeout_policy *dpif_tp)
7983 {
7984 struct timeout_policy tp;
7985 struct dp_netdev *dp;
7986
7987 dp = get_dp_netdev(dpif);
7988 memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
7989 return timeout_policy_update(dp->conntrack, &tp);
7990 }
7991
7992 static int
7993 dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
7994 struct ct_dpif_timeout_policy *dpif_tp)
7995 {
7996 struct timeout_policy *tp;
7997 struct dp_netdev *dp;
7998 int err = 0;
7999
8000 dp = get_dp_netdev(dpif);
8001 tp = timeout_policy_get(dp->conntrack, tp_id);
8002 if (!tp) {
8003 return ENOENT;
8004 }
8005 memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
8006 return err;
8007 }
8008
8009 static int
8010 dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
8011 uint32_t tp_id)
8012 {
8013 struct dp_netdev *dp;
8014 int err = 0;
8015
8016 dp = get_dp_netdev(dpif);
8017 err = timeout_policy_delete(dp->conntrack, tp_id);
8018 return err;
8019 }
8020
8021 static int
8022 dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
8023 uint32_t tp_id,
8024 uint16_t dl_type OVS_UNUSED,
8025 uint8_t nw_proto OVS_UNUSED,
8026 char **tp_name, bool *is_generic)
8027 {
8028 struct ds ds = DS_EMPTY_INITIALIZER;
8029
8030 ds_put_format(&ds, "%"PRIu32, tp_id);
8031 *tp_name = ds_steal_cstr(&ds);
8032 *is_generic = true;
8033 return 0;
8034 }
8035
8036 static int
8037 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8038 {
8039 struct dp_netdev *dp = get_dp_netdev(dpif);
8040 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
8041 }
8042
8043 static int
8044 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8045 {
8046 struct dp_netdev *dp = get_dp_netdev(dpif);
8047 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
8048 }
8049
8050 static int
8051 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8052 {
8053 struct dp_netdev *dp = get_dp_netdev(dpif);
8054 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
8055 }
8056
8057 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8058 * diverge. */
8059 static int
8060 dpif_netdev_ipf_get_status(struct dpif *dpif,
8061 struct dpif_ipf_status *dpif_ipf_status)
8062 {
8063 struct dp_netdev *dp = get_dp_netdev(dpif);
8064 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
8065 (struct ipf_status *) dpif_ipf_status);
8066 return 0;
8067 }
8068
8069 static int
8070 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8071 struct ipf_dump_ctx **ipf_dump_ctx)
8072 {
8073 return ipf_dump_start(ipf_dump_ctx);
8074 }
8075
8076 static int
8077 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8078 {
8079 struct dp_netdev *dp = get_dp_netdev(dpif);
8080 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
8081 dump);
8082 }
8083
8084 static int
8085 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8086 {
8087 return ipf_dump_done(ipf_dump_ctx);
8089 }
8090
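/* Adds or replaces tx bond 'bond_id' in the datapath, copying the
 * bucket-to-member mapping from 'slave_map', and installs the new mapping in
 * every PMD thread. */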
8091 static int
8092 dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8093 odp_port_t *slave_map)
8094 {
8095 struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8096 struct dp_netdev *dp = get_dp_netdev(dpif);
8097 struct dp_netdev_pmd_thread *pmd;
8098
8099 /* Prepare new bond mapping. */
8100 new_tx->bond_id = bond_id;
8101 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8102 new_tx->slave_buckets[bucket].slave_id = slave_map[bucket];
8103 }
8104
8105 ovs_mutex_lock(&dp->bond_mutex);
8106 /* Check whether the bond already exists. */
8107 struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8108 if (old_tx) {
8109 cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8110 hash_bond_id(bond_id));
8111 ovsrcu_postpone(free, old_tx);
8112 } else {
8113 cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8114 }
8115 ovs_mutex_unlock(&dp->bond_mutex);
8116
8117 /* Update all PMDs with new bond mapping. */
8118 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8119 dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8120 }
8121 return 0;
8122 }
8123
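/* Removes tx bond 'bond_id' from the datapath and from all PMD threads.
 * Returns ENOENT if the bond does not exist. */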
8124 static int
8125 dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8126 {
8127 struct dp_netdev *dp = get_dp_netdev(dpif);
8128 struct dp_netdev_pmd_thread *pmd;
8129 struct tx_bond *tx;
8130
8131 ovs_mutex_lock(&dp->bond_mutex);
8132 /* Check whether the bond exists. */
8133 tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8134 if (tx) {
8135 cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8136 ovsrcu_postpone(free, tx);
8137 } else {
8138 /* Bond is not present. */
8139 ovs_mutex_unlock(&dp->bond_mutex);
8140 return ENOENT;
8141 }
8142 ovs_mutex_unlock(&dp->bond_mutex);
8143
8144 /* Remove the bond map in all pmds. */
8145 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8146 dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8147 }
8148 return 0;
8149 }
8150
8151 static int
8152 dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8153 uint64_t *n_bytes)
8154 {
8155 struct dp_netdev *dp = get_dp_netdev(dpif);
8156 struct dp_netdev_pmd_thread *pmd;
8157
8158 if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8159 return ENOENT;
8160 }
8161
8162 /* Search the bond in all PMDs. */
8163 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8164 struct tx_bond *pmd_bond_entry
8165 = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8166
8167 if (!pmd_bond_entry) {
8168 continue;
8169 }
8170
8171 /* Read bond stats. */
8172 for (int i = 0; i < BOND_BUCKETS; i++) {
8173 uint64_t pmd_n_bytes;
8174
8175 atomic_read_relaxed(&pmd_bond_entry->slave_buckets[i].n_bytes,
8176 &pmd_n_bytes);
8177 n_bytes[i] += pmd_n_bytes;
8178 }
8179 }
8180 return 0;
8181 }
8182
8183 const struct dpif_class dpif_netdev_class = {
8184 "netdev",
8185 true, /* cleanup_required */
8186 dpif_netdev_init,
8187 dpif_netdev_enumerate,
8188 dpif_netdev_port_open_type,
8189 dpif_netdev_open,
8190 dpif_netdev_close,
8191 dpif_netdev_destroy,
8192 dpif_netdev_run,
8193 dpif_netdev_wait,
8194 dpif_netdev_get_stats,
8195 NULL, /* set_features */
8196 dpif_netdev_port_add,
8197 dpif_netdev_port_del,
8198 dpif_netdev_port_set_config,
8199 dpif_netdev_port_query_by_number,
8200 dpif_netdev_port_query_by_name,
8201 NULL, /* port_get_pid */
8202 dpif_netdev_port_dump_start,
8203 dpif_netdev_port_dump_next,
8204 dpif_netdev_port_dump_done,
8205 dpif_netdev_port_poll,
8206 dpif_netdev_port_poll_wait,
8207 dpif_netdev_flow_flush,
8208 dpif_netdev_flow_dump_create,
8209 dpif_netdev_flow_dump_destroy,
8210 dpif_netdev_flow_dump_thread_create,
8211 dpif_netdev_flow_dump_thread_destroy,
8212 dpif_netdev_flow_dump_next,
8213 dpif_netdev_operate,
8214 NULL, /* recv_set */
8215 NULL, /* handlers_set */
8216 dpif_netdev_set_config,
8217 dpif_netdev_queue_to_priority,
8218 NULL, /* recv */
8219 NULL, /* recv_wait */
8220 NULL, /* recv_purge */
8221 dpif_netdev_register_dp_purge_cb,
8222 dpif_netdev_register_upcall_cb,
8223 dpif_netdev_enable_upcall,
8224 dpif_netdev_disable_upcall,
8225 dpif_netdev_get_datapath_version,
8226 dpif_netdev_ct_dump_start,
8227 dpif_netdev_ct_dump_next,
8228 dpif_netdev_ct_dump_done,
8229 dpif_netdev_ct_flush,
8230 dpif_netdev_ct_set_maxconns,
8231 dpif_netdev_ct_get_maxconns,
8232 dpif_netdev_ct_get_nconns,
8233 dpif_netdev_ct_set_tcp_seq_chk,
8234 dpif_netdev_ct_get_tcp_seq_chk,
8235 dpif_netdev_ct_set_limits,
8236 dpif_netdev_ct_get_limits,
8237 dpif_netdev_ct_del_limits,
8238 dpif_netdev_ct_set_timeout_policy,
8239 dpif_netdev_ct_get_timeout_policy,
8240 dpif_netdev_ct_del_timeout_policy,
8241 NULL, /* ct_timeout_policy_dump_start */
8242 NULL, /* ct_timeout_policy_dump_next */
8243 NULL, /* ct_timeout_policy_dump_done */
8244 dpif_netdev_ct_get_timeout_policy_name,
8245 dpif_netdev_ipf_set_enabled,
8246 dpif_netdev_ipf_set_min_frag,
8247 dpif_netdev_ipf_set_max_nfrags,
8248 dpif_netdev_ipf_get_status,
8249 dpif_netdev_ipf_dump_start,
8250 dpif_netdev_ipf_dump_next,
8251 dpif_netdev_ipf_dump_done,
8252 dpif_netdev_meter_get_features,
8253 dpif_netdev_meter_set,
8254 dpif_netdev_meter_get,
8255 dpif_netdev_meter_del,
8256 dpif_netdev_bond_add,
8257 dpif_netdev_bond_del,
8258 dpif_netdev_bond_stats_get,
8259 };
8260
8261 static void
8262 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8263 const char *argv[], void *aux OVS_UNUSED)
8264 {
8265 struct dp_netdev_port *port;
8266 struct dp_netdev *dp;
8267 odp_port_t port_no;
8268
8269 ovs_mutex_lock(&dp_netdev_mutex);
8270 dp = shash_find_data(&dp_netdevs, argv[1]);
8271 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8272 ovs_mutex_unlock(&dp_netdev_mutex);
8273 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8274 return;
8275 }
8276 ovs_refcount_ref(&dp->ref_cnt);
8277 ovs_mutex_unlock(&dp_netdev_mutex);
8278
8279 ovs_mutex_lock(&dp->port_mutex);
8280 if (get_port_by_name(dp, argv[2], &port)) {
8281 unixctl_command_reply_error(conn, "unknown port");
8282 goto exit;
8283 }
8284
8285 port_no = u32_to_odp(atoi(argv[3]));
8286 if (!port_no || port_no == ODPP_NONE) {
8287 unixctl_command_reply_error(conn, "bad port number");
8288 goto exit;
8289 }
8290 if (dp_netdev_lookup_port(dp, port_no)) {
8291 unixctl_command_reply_error(conn, "port number already in use");
8292 goto exit;
8293 }
8294
8295 /* Remove port. */
8296 hmap_remove(&dp->ports, &port->node);
8297 reconfigure_datapath(dp);
8298
8299 /* Reinsert with new port number. */
8300 port->port_no = port_no;
8301 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
8302 reconfigure_datapath(dp);
8303
8304 seq_change(dp->port_seq);
8305 unixctl_command_reply(conn, NULL);
8306
8307 exit:
8308 ovs_mutex_unlock(&dp->port_mutex);
8309 dp_netdev_unref(dp);
8310 }
8311
8312 static void
8313 dpif_dummy_register__(const char *type)
8314 {
8315 struct dpif_class *class;
8316
8317 class = xmalloc(sizeof *class);
8318 *class = dpif_netdev_class;
8319 class->type = xstrdup(type);
8320 dp_register_provider(class);
8321 }
8322
8323 static void
8324 dpif_dummy_override(const char *type)
8325 {
8326 int error;
8327
8328 /*
8329 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
8330 * a userland-only build. It's useful for the testsuite.
8331 */
8332 error = dp_unregister_provider(type);
8333 if (error == 0 || error == EAFNOSUPPORT) {
8334 dpif_dummy_register__(type);
8335 }
8336 }
8337
8338 void
8339 dpif_dummy_register(enum dummy_level level)
8340 {
8341 if (level == DUMMY_OVERRIDE_ALL) {
8342 struct sset types;
8343 const char *type;
8344
8345 sset_init(&types);
8346 dp_enumerate_types(&types);
8347 SSET_FOR_EACH (type, &types) {
8348 dpif_dummy_override(type);
8349 }
8350 sset_destroy(&types);
8351 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
8352 dpif_dummy_override("system");
8353 }
8354
8355 dpif_dummy_register__("dummy");
8356
8357 unixctl_command_register("dpif-dummy/change-port-number",
8358 "dp port new-number",
8359 3, 3, dpif_dummy_change_port_number, NULL);
8360 }
8361 \f
8362 /* Datapath Classifier. */
8363
8364 static void
8365 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
8366 {
8367 cmap_destroy(&subtable->rules);
8368 ovsrcu_postpone(free, subtable->mf_masks);
8369 ovsrcu_postpone(free, subtable);
8370 }
8371
8372 /* Initializes 'cls' as a classifier that initially contains no classification
8373 * rules. */
8374 static void
8375 dpcls_init(struct dpcls *cls)
8376 {
8377 cmap_init(&cls->subtables_map);
8378 pvector_init(&cls->subtables);
8379 }
8380
8381 static void
8382 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
8383 {
8384 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
8385 pvector_remove(&cls->subtables, subtable);
8386 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
8387 subtable->mask.hash);
8388 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
8389 }
8390
8391 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
8392 * caller's responsibility.
8393 * May only be called after all the readers have been terminated. */
8394 static void
8395 dpcls_destroy(struct dpcls *cls)
8396 {
8397 if (cls) {
8398 struct dpcls_subtable *subtable;
8399
8400 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
8401 ovs_assert(cmap_count(&subtable->rules) == 0);
8402 dpcls_destroy_subtable(cls, subtable);
8403 }
8404 cmap_destroy(&cls->subtables_map);
8405 pvector_destroy(&cls->subtables);
8406 }
8407 }
8408
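/* Creates a new subtable of 'cls' for 'mask'.  The per-unit bit counts and
 * the block masks are precomputed here so that lookups do not have to derive
 * them, and the most specialized lookup function available for this mask
 * shape is selected, falling back to the generic one. */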
8409 static struct dpcls_subtable *
8410 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8411 {
8412 struct dpcls_subtable *subtable;
8413
8414 /* Need to add one. */
8415 subtable = xmalloc(sizeof *subtable
8416 - sizeof subtable->mask.mf + mask->len);
8417 cmap_init(&subtable->rules);
8418 subtable->hit_cnt = 0;
8419 netdev_flow_key_clone(&subtable->mask, mask);
8420
8421 /* The count of bits in the mask defines the space required for masks.
8422 * Then netdev_flow_key_gen_masks() is called to create the appropriate
8423 * masks, avoiding the cost of doing those calculations at runtime. */
8424 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
8425 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
8426 subtable->mf_bits_set_unit0 = unit0;
8427 subtable->mf_bits_set_unit1 = unit1;
8428 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
8429 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
8430
8431 /* Probe for a specialized generic lookup function. */
8432 subtable->lookup_func = dpcls_subtable_generic_probe(unit0, unit1);
8433
8434 /* If not set, assign generic lookup. Generic works for any miniflow. */
8435 if (!subtable->lookup_func) {
8436 subtable->lookup_func = dpcls_subtable_lookup_generic;
8437 }
8438
8439 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
8440 /* Add the new subtable at the end of the pvector (with no hits yet) */
8441 pvector_insert(&cls->subtables, subtable, 0);
8442 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
8443 cmap_count(&cls->subtables_map), subtable, cls->in_port);
8444 pvector_publish(&cls->subtables);
8445
8446 return subtable;
8447 }
8448
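/* Returns the subtable of 'cls' whose mask equals 'mask', creating it if it
 * does not exist yet. */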
8449 static inline struct dpcls_subtable *
8450 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8451 {
8452 struct dpcls_subtable *subtable;
8453
8454 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
8455 &cls->subtables_map) {
8456 if (netdev_flow_key_equal(&subtable->mask, mask)) {
8457 return subtable;
8458 }
8459 }
8460 return dpcls_create_subtable(cls, mask);
8461 }
8462
8463
8464 /* Periodically sort the dpcls subtable vectors according to hit counts */
8465 static void
8466 dpcls_sort_subtable_vector(struct dpcls *cls)
8467 {
8468 struct pvector *pvec = &cls->subtables;
8469 struct dpcls_subtable *subtable;
8470
8471 PVECTOR_FOR_EACH (subtable, pvec) {
8472 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8473 subtable->hit_cnt = 0;
8474 }
8475 pvector_publish(pvec);
8476 }
8477
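/* Periodic per-PMD housekeeping: accounts idle vs. busy cycles for the PMD
 * auto load balancer, stores the per-rxq processing cycles used for rxq
 * rebalancing, and, when the optimization interval expires, re-sorts the
 * dpcls subtable vectors by hit count under 'flow_mutex'. */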
8478 static inline void
8479 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8480 struct polled_queue *poll_list, int poll_cnt)
8481 {
8482 struct dpcls *cls;
8483 uint64_t tot_idle = 0, tot_proc = 0;
8484 unsigned int pmd_load = 0;
8485
8486 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
8487 uint64_t curr_tsc;
8488 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8489 if (pmd_alb->is_enabled && !pmd->isolated
8490 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8491 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8492 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8493 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8494 {
8495 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8496 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8497 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8498 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8499
8500 if (tot_proc) {
8501 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8502 }
8503
8504 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8505 atomic_count_inc(&pmd->pmd_overloaded);
8506 } else {
8507 atomic_count_set(&pmd->pmd_overloaded, 0);
8508 }
8509 }
8510
8511 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8512 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8513 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8514 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8515
8516 /* Get the cycles that were used to process each queue and store them. */
8517 for (unsigned i = 0; i < poll_cnt; i++) {
8518 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8519 RXQ_CYCLES_PROC_CURR);
8520 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8521 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8522 0);
8523 }
8524 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8525 if (pmd->intrvl_tsc_prev) {
8526 /* There is a prev timestamp, store a new intrvl cycle count. */
8527 atomic_store_relaxed(&pmd->intrvl_cycles,
8528 curr_tsc - pmd->intrvl_tsc_prev);
8529 }
8530 pmd->intrvl_tsc_prev = curr_tsc;
8531 /* Start new measuring interval */
8532 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
8533 }
8534
8535 if (pmd->ctx.now > pmd->next_optimization) {
8536 /* Try to obtain the flow lock to block out revalidator threads.
8537 * If not possible, just try next time. */
8538 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8539 /* Optimize each classifier */
8540 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8541 dpcls_sort_subtable_vector(cls);
8542 }
8543 ovs_mutex_unlock(&pmd->flow_mutex);
8544 /* Start new measuring interval */
8545 pmd->next_optimization = pmd->ctx.now
8546 + DPCLS_OPTIMIZATION_INTERVAL;
8547 }
8548 }
8549 }
8550
8551 /* Insert 'rule' into 'cls'. */
8552 static void
8553 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8554 const struct netdev_flow_key *mask)
8555 {
8556 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8557
8558 /* Refer to subtable's mask, also for later removal. */
8559 rule->mask = &subtable->mask;
8560 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8561 }
8562
8563 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
8564 static void
8565 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8566 {
8567 struct dpcls_subtable *subtable;
8568
8569 ovs_assert(rule->mask);
8570
8571 /* Get subtable from reference in rule->mask. */
8572 INIT_CONTAINER(subtable, rule->mask, mask);
8573 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8574 == 0) {
8575 /* Delete empty subtable. */
8576 dpcls_destroy_subtable(cls, subtable);
8577 pvector_publish(&cls->subtables);
8578 }
8579 }
8580
8581 /* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
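/* For example, with iter = 0b10110 and count = 3, the generated masks are
 * 0x1, 0x3 and 0xF: each mask has 1-bits only below the corresponding set
 * bit of 'iter', which lets the lookup code locate a block's value in the
 * miniflow by masking a flowmap with it and counting the remaining 1-bits. */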
8582 static inline void
8583 netdev_flow_key_gen_mask_unit(uint64_t iter,
8584 const uint64_t count,
8585 uint64_t *mf_masks)
8586 {
8587 int i;
8588 for (i = 0; i < count; i++) {
8589 uint64_t lowest_bit = (iter & -iter);
8590 iter &= ~lowest_bit;
8591 mf_masks[i] = (lowest_bit - 1);
8592 }
8593 /* Checks that count has covered all bits in the iter bitmap. */
8594 ovs_assert(iter == 0);
8595 }
8596
8597 /* Generates a mask for each block in the miniflow, based on the bits set.
8598 * This allows packets to be masked with the precomputed array instead of
8599 * recalculating the masks at runtime.
8600 * @param tbl The netdev_flow_key to generate the mf_masks for
8601 * @param mf_masks Pointer to a u64 array of at least
8602 * mf_bits_u0 + mf_bits_u1 elements
8603 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
8604 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow */
8605 void
8606 netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8607 uint64_t *mf_masks,
8608 const uint32_t mf_bits_u0,
8609 const uint32_t mf_bits_u1)
8610 {
8611 uint64_t iter_u0 = tbl->mf.map.bits[0];
8612 uint64_t iter_u1 = tbl->mf.map.bits[1];
8613
8614 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8615 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8616 }
8617
8618 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit
8619 * in the rule's mask the values in 'rule' and 'target' are the same. */
8620 bool
8621 dpcls_rule_matches_key(const struct dpcls_rule *rule,
8622 const struct netdev_flow_key *target)
8623 {
8624 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8625 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
8626 uint64_t value;
8627
8628 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8629 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
8630 return false;
8631 }
8632 }
8633 return true;
8634 }
8635
8636 /* For each miniflow in 'keys' performs a classifier lookup writing the result
8637 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
8638 * NULL it is skipped.
8639 *
8640 * This function is optimized for use in the userspace datapath and therefore
8641 * does not implement a lot of features available in the standard
8642 * classifier_lookup() function. Specifically, it does not implement
8643 * priorities, instead returning any rule which matches the flow.
8644 *
8645 * Returns true if all miniflows found a corresponding rule. */
8646 static bool
8647 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8648 struct dpcls_rule **rules, const size_t cnt,
8649 int *num_lookups_p)
8650 {
8651 /* The received 'cnt' miniflows are the search-keys that will be processed
8652 * to find a matching entry in the available subtables.
8653 * The number of bits in the keys map must be at least NETDEV_MAX_BURST. */
8654 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8655 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8656
8657 struct dpcls_subtable *subtable;
8658 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8659
8660 if (cnt != MAP_BITS) {
8661 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8662 }
8663 memset(rules, 0, cnt * sizeof *rules);
8664
8665 int lookups_match = 0, subtable_pos = 1;
8666 uint32_t found_map;
8667
8668 /* The Datapath classifier - aka dpcls - is composed of subtables.
8669 * Subtables are dynamically created as needed when new rules are inserted.
8670 * Each subtable collects rules with matches on a specific subset of packet
8671 * fields as defined by the subtable's mask. We proceed to process every
8672 * search-key against each subtable, but when a match is found for a
8673 * search-key, the search for that key can stop because the rules are
8674 * non-overlapping. */
8675 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8676 /* Call the subtable specific lookup function. */
8677 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8678
8679 /* Count the number of subtables searched for this packet match. This
8680 * estimates the "spread" of subtables looked at per matched packet. */
8681 uint32_t pkts_matched = count_1bits(found_map);
8682 lookups_match += pkts_matched * subtable_pos;
8683
8684 /* Clear the keys that found a match, and return early if all packets have been matched. */
8685 keys_map &= ~found_map;
8686 if (!keys_map) {
8687 if (num_lookups_p) {
8688 *num_lookups_p = lookups_match;
8689 }
8690 return true;
8691 }
8692 subtable_pos++;
8693 }
8694
8695 if (num_lookups_p) {
8696 *num_lookups_p = lookups_match;
8697 }
8698 return false;
8699 }