git.proxmox.com Git - mirror_ovs.git/blob - lib/dpif-netdev.c
dpif-netdev: Implement function pointers/subtable
1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19
20 #include <ctype.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <inttypes.h>
24 #include <net/if.h>
25 #include <sys/types.h>
26 #include <netinet/in.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/ioctl.h>
31 #include <sys/socket.h>
32 #include <sys/stat.h>
33 #include <unistd.h>
34
35 #include "bitmap.h"
36 #include "cmap.h"
37 #include "conntrack.h"
38 #include "coverage.h"
39 #include "ct-dpif.h"
40 #include "csum.h"
41 #include "dp-packet.h"
42 #include "dpif.h"
43 #include "dpif-netdev-perf.h"
44 #include "dpif-provider.h"
45 #include "dummy.h"
46 #include "fat-rwlock.h"
47 #include "flow.h"
48 #include "hmapx.h"
49 #include "id-pool.h"
50 #include "ipf.h"
51 #include "netdev.h"
52 #include "netdev-offload.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
55 #include "netlink.h"
56 #include "odp-execute.h"
57 #include "odp-util.h"
58 #include "openvswitch/dynamic-string.h"
59 #include "openvswitch/list.h"
60 #include "openvswitch/match.h"
61 #include "openvswitch/ofp-parse.h"
62 #include "openvswitch/ofp-print.h"
63 #include "openvswitch/ofpbuf.h"
64 #include "openvswitch/shash.h"
65 #include "openvswitch/vlog.h"
66 #include "ovs-numa.h"
67 #include "ovs-rcu.h"
68 #include "packets.h"
69 #include "openvswitch/poll-loop.h"
70 #include "pvector.h"
71 #include "random.h"
72 #include "seq.h"
73 #include "smap.h"
74 #include "sset.h"
75 #include "timeval.h"
76 #include "tnl-neigh-cache.h"
77 #include "tnl-ports.h"
78 #include "unixctl.h"
79 #include "util.h"
80 #include "uuid.h"
81
82 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
83
84 /* Auto Load Balancing Defaults */
85 #define ALB_ACCEPTABLE_IMPROVEMENT 25
86 #define ALB_PMD_LOAD_THRESHOLD 95
87 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
88 #define MIN_TO_MSEC 60000
89
90 #define FLOW_DUMP_MAX_BATCH 50
91 /* Use per-thread recirc_depth to prevent recirculation loops. */
92 #define MAX_RECIRC_DEPTH 6
93 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
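/* Illustrative sketch of how the per-thread depth bounds recirculation; the
 * surrounding context is simplified and not taken verbatim from this file:
 *
 *     uint32_t *depth = recirc_depth_get();
 *
 *     if (*depth < MAX_RECIRC_DEPTH) {
 *         (*depth)++;
 *         dp_netdev_recirculate(pmd, &recirc_pkts);
 *         (*depth)--;
 *     } else {
 *         dp_packet_delete_batch(&recirc_pkts, should_steal);
 *     }
 *
 * Once the depth limit is reached the packets are dropped rather than
 * recirculated again, which is what breaks a potential loop.
 */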
94
95 /* Use instant packet send by default. */
96 #define DEFAULT_TX_FLUSH_INTERVAL 0
97
98 /* Configuration parameters. */
99 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
100 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
101 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
102 enum { N_METER_LOCKS = 64 };    /* Number of locks protecting the meters. */
103
104 /* Protects against changes to 'dp_netdevs'. */
105 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
106
107 /* Contains all 'struct dp_netdev's. */
108 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
109 = SHASH_INITIALIZER(&dp_netdevs);
110
111 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
112
113 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
114 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
115 | CS_SRC_NAT | CS_DST_NAT)
116 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
117
118 static struct odp_support dp_netdev_support = {
119 .max_vlan_headers = SIZE_MAX,
120 .max_mpls_depth = SIZE_MAX,
121 .recirc = true,
122 .ct_state = true,
123 .ct_zone = true,
124 .ct_mark = true,
125 .ct_label = true,
126 .ct_state_nat = true,
127 .ct_orig_tuple = true,
128 .ct_orig_tuple6 = true,
129 };
130
131 /* Stores a miniflow with inline values */
132
133 struct netdev_flow_key {
134 uint32_t hash; /* Hash function differs for different users. */
135 uint32_t len; /* Length of the following miniflow (incl. map). */
136 struct miniflow mf;
137 uint64_t buf[FLOW_MAX_PACKET_U64S];
138 };
139
140 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
141 *
142 * Exact match cache for frequently used flows
143 *
144 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
145 * search its entries for a miniflow that matches exactly the miniflow of the
146 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
147 *
148 * A cache entry holds a reference to its 'dp_netdev_flow'.
149 *
150 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
151 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
152 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
153 * value is the index of a cache entry where the miniflow could be.
154 *
155 *
156 * Signature match cache (SMC)
157 *
158  * This cache stores a 16-bit signature for each flow without storing keys, and
159  * stores the corresponding 16-bit flow_table index of the 'dp_netdev_flow'.
160  * Each flow thus occupies 32 bits, which is much more memory efficient than EMC.
161  * SMC uses a set-associative design in which each bucket contains
162  * SMC_ENTRY_PER_BUCKET entries.
163  * Since a 16-bit flow_table index is used, flows that cannot be indexed by a
164  * 16-bit value (beyond the first 2^16 dp_netdev_flow entries) will miss in SMC.
165 *
166 *
167 * Thread-safety
168 * =============
169 *
170 * Each pmd_thread has its own private exact match cache.
171 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
172 */
173
174 #define EM_FLOW_HASH_SHIFT 13
175 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
176 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
177 #define EM_FLOW_HASH_SEGS 2
178
179 /* SMC uses a set-associative design. A bucket contains a set of entries that
180 * a flow item can occupy. For now, it uses one hash function rather than two
181 * as for the EMC design. */
182 #define SMC_ENTRY_PER_BUCKET 4
183 #define SMC_ENTRIES (1u << 20)
184 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
185 #define SMC_MASK (SMC_BUCKET_CNT - 1)
186
187 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
188 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
189 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
190 DEFAULT_EM_FLOW_INSERT_INV_PROB)
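/* A minimal sketch of how 'emc_insert_min' gates EMC insertions, assuming an
 * emc_insert() helper like the one defined further down in the full file:
 *
 *     uint32_t min = pmd->ctx.emc_insert_min;
 *
 *     if (min && random_uint32() <= min) {
 *         emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
 *     }
 *
 * With the default of UINT32_MAX / 100, roughly one in a hundred eligible
 * packets triggers an insertion, which limits EMC thrashing when there are
 * many short-lived flows.
 */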
191
192 struct emc_entry {
193 struct dp_netdev_flow *flow;
194 struct netdev_flow_key key; /* key.hash used for emc hash value. */
195 };
196
197 struct emc_cache {
198 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
199 int sweep_idx; /* For emc_cache_slow_sweep(). */
200 };
201
202 struct smc_bucket {
203 uint16_t sig[SMC_ENTRY_PER_BUCKET];
204 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
205 };
206
207 /* Signature match cache, as distinct from the EMC cache. */
208 struct smc_cache {
209 struct smc_bucket buckets[SMC_BUCKET_CNT];
210 };
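/* Illustrative sketch of the SMC addressing scheme described above.  The
 * exact derivation of the 16-bit signature from the hash is an assumption
 * here, not taken from this file:
 *
 *     uint32_t bucket_idx = key->hash & SMC_MASK;
 *     uint16_t sig = key->hash >> 16;
 *     struct smc_bucket *b = &smc_cache->buckets[bucket_idx];
 *
 * A lookup then compares 'sig' against b->sig[0..SMC_ENTRY_PER_BUCKET-1];
 * a match yields b->flow_idx[i], a 16-bit index into the pmd's flow_table
 * (UINT16_MAX marks an empty slot).
 */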
211
212 struct dfc_cache {
213 struct emc_cache emc_cache;
214 struct smc_cache smc_cache;
215 };
216
217 /* Iterate through every entry in the exact match cache that might contain a
218  * miniflow with hash 'HASH'. */
219 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
220 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
221 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
222 i__ < EM_FLOW_HASH_SEGS; \
223 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
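/* A minimal usage sketch for the macro above, probing both candidate EMC
 * positions for a key.  keys_match() is a hypothetical stand-in for the full
 * miniflow comparison the real code performs:
 *
 *     struct emc_entry *entry;
 *
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, entry, key->hash) {
 *         if (entry->key.hash == key->hash
 *             && emc_entry_alive(entry)
 *             && keys_match(&entry->key, key)) {
 *             return entry->flow;
 *         }
 *     }
 *     return NULL;
 */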
224 \f
225 /* Simple non-wildcarding single-priority classifier. */
226
227 /* Time in microseconds between successive optimizations of the dpcls
228 * subtable vector */
229 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
230
231 /* Time in microseconds of the interval over which the rxq processing cycles
232  * used for rxq to pmd assignment are measured and stored. */
233 #define PMD_RXQ_INTERVAL_LEN 10000000LL
234
235 /* Number of intervals for which cycles are stored
236 * and used during rxq to pmd assignment. */
237 #define PMD_RXQ_INTERVAL_MAX 6
238
239 struct dpcls {
240 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
241 odp_port_t in_port;
242 struct cmap subtables_map;
243 struct pvector subtables;
244 };
245
246 /* A rule to be inserted to the classifier. */
247 struct dpcls_rule {
248 struct cmap_node cmap_node; /* Within struct dpcls_subtable 'rules'. */
249 struct netdev_flow_key *mask; /* Subtable's mask. */
250 struct netdev_flow_key flow; /* Matching key. */
251 /* 'flow' must be the last field, additional space is allocated here. */
252 };
253
254 /* Data structure to keep packet order till fastpath processing. */
255 struct dp_packet_flow_map {
256 struct dp_packet *packet;
257 struct dp_netdev_flow *flow;
258 uint16_t tcp_flags;
259 };
260
261 static void dpcls_init(struct dpcls *);
262 static void dpcls_destroy(struct dpcls *);
263 static void dpcls_sort_subtable_vector(struct dpcls *);
264 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
265 const struct netdev_flow_key *mask);
266 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
267 static bool dpcls_lookup(struct dpcls *cls,
268 const struct netdev_flow_key *keys[],
269 struct dpcls_rule **rules, size_t cnt,
270 int *num_lookups_p);
271 static bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
272 const struct netdev_flow_key *target);
273 /* Set of supported meter flags */
274 #define DP_SUPPORTED_METER_FLAGS_MASK \
275 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
276
277 /* Set of supported meter band types */
278 #define DP_SUPPORTED_METER_BAND_TYPES \
279 ( 1 << OFPMBT13_DROP )
280
281 struct dp_meter_band {
282 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
283 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
284 uint64_t packet_count;
285 uint64_t byte_count;
286 };
287
288 struct dp_meter {
289 uint16_t flags;
290 uint16_t n_bands;
291 uint32_t max_delta_t;
292 uint64_t used;
293 uint64_t packet_count;
294 uint64_t byte_count;
295 struct dp_meter_band bands[];
296 };
297
298 struct pmd_auto_lb {
299 bool auto_lb_requested; /* Auto load balancing requested by user. */
300 bool is_enabled; /* Current status of Auto load balancing. */
301 uint64_t rebalance_intvl;
302 uint64_t rebalance_poll_timer;
303 };
304
305 /* Datapath based on the network device interface from netdev.h.
306 *
307 *
308 * Thread-safety
309 * =============
310 *
311 * Some members, marked 'const', are immutable. Accessing other members
312 * requires synchronization, as noted in more detail below.
313 *
314 * Acquisition order is, from outermost to innermost:
315 *
316 * dp_netdev_mutex (global)
317 * port_mutex
318 * non_pmd_mutex
319 */
320 struct dp_netdev {
321 const struct dpif_class *const class;
322 const char *const name;
323 struct dpif *dpif;
324 struct ovs_refcount ref_cnt;
325 atomic_flag destroyed;
326
327 /* Ports.
328 *
329 * Any lookup into 'ports' or any access to the dp_netdev_ports found
330 * through 'ports' requires taking 'port_mutex'. */
331 struct ovs_mutex port_mutex;
332 struct hmap ports;
333 struct seq *port_seq; /* Incremented whenever a port changes. */
334
335 /* The time that a packet can wait in output batch for sending. */
336 atomic_uint32_t tx_flush_interval;
337
338 /* Meters. */
339 struct ovs_mutex meter_locks[N_METER_LOCKS];
340 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
341
342     /* Probability of EMC insertions is proportional to 'emc_insert_min'. */
343 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
344 /* Enable collection of PMD performance metrics. */
345 atomic_bool pmd_perf_metrics;
346 /* Enable the SMC cache from ovsdb config */
347 atomic_bool smc_enable_db;
348
349 /* Protects access to ofproto-dpif-upcall interface during revalidator
350 * thread synchronization. */
351 struct fat_rwlock upcall_rwlock;
352 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
353 void *upcall_aux;
354
355     /* Callback function for notifying the purging of dp flows (during
356      * pmd thread reset or deletion). */
357 dp_purge_callback *dp_purge_cb;
358 void *dp_purge_aux;
359
360 /* Stores all 'struct dp_netdev_pmd_thread's. */
361 struct cmap poll_threads;
362 /* id pool for per thread static_tx_qid. */
363 struct id_pool *tx_qid_pool;
364 struct ovs_mutex tx_qid_pool_mutex;
365 /* Use measured cycles for rxq to pmd assignment. */
366 bool pmd_rxq_assign_cyc;
367
368 /* Protects the access of the 'struct dp_netdev_pmd_thread'
369 * instance for non-pmd thread. */
370 struct ovs_mutex non_pmd_mutex;
371
372 /* Each pmd thread will store its pointer to
373 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
374 ovsthread_key_t per_pmd_key;
375
376 struct seq *reconfigure_seq;
377 uint64_t last_reconfigure_seq;
378
379 /* Cpu mask for pin of pmd threads. */
380 char *pmd_cmask;
381
382 uint64_t last_tnl_conf_seq;
383
384 struct conntrack *conntrack;
385 struct pmd_auto_lb pmd_alb;
386 };
387
388 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
389 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
390 {
391 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
392 }
393
394 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
395 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
396 {
397 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
398 }
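/* A minimal sketch of the intended locking pattern around meter access; the
 * body of the critical section is illustrative:
 *
 *     meter_lock(dp, meter_id);
 *     struct dp_meter *meter = dp->meters[meter_id];
 *     if (meter) {
 *         ... read or update meter->bands[], meter->used, counters ...
 *     }
 *     meter_unlock(dp, meter_id);
 *
 * Note that different meter ids can map to the same lock
 * (meter_id % N_METER_LOCKS), so critical sections should stay short.
 */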
399
400
401 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
402 odp_port_t)
403 OVS_REQUIRES(dp->port_mutex);
404
405 enum rxq_cycles_counter_type {
406 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
407 processing packets during the current
408 interval. */
409 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
410 during rxq to pmd assignment. */
411 RXQ_N_CYCLES
412 };
413
414 enum {
415 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
416 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
417 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
418 };
419
420 struct dp_flow_offload_item {
421 struct dp_netdev_pmd_thread *pmd;
422 struct dp_netdev_flow *flow;
423 int op;
424 struct match match;
425 struct nlattr *actions;
426 size_t actions_len;
427
428 struct ovs_list node;
429 };
430
431 struct dp_flow_offload {
432 struct ovs_mutex mutex;
433 struct ovs_list list;
434 pthread_cond_t cond;
435 };
436
437 static struct dp_flow_offload dp_flow_offload = {
438 .mutex = OVS_MUTEX_INITIALIZER,
439 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
440 };
441
442 static struct ovsthread_once offload_thread_once
443 = OVSTHREAD_ONCE_INITIALIZER;
444
445 #define XPS_TIMEOUT 500000LL /* In microseconds. */
446
447 /* Contained by struct dp_netdev_port's 'rxqs' member. */
448 struct dp_netdev_rxq {
449 struct dp_netdev_port *port;
450 struct netdev_rxq *rx;
451 unsigned core_id; /* Core to which this queue should be
452 pinned. OVS_CORE_UNSPEC if the
453 queue doesn't need to be pinned to a
454 particular core. */
455 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
456 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
457 bool is_vhost; /* Is rxq of a vhost port. */
458
459 /* Counters of cycles spent successfully polling and processing pkts. */
460 atomic_ullong cycles[RXQ_N_CYCLES];
461 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
462 sum them to yield the cycles used for an rxq. */
463 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
464 };
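/* Illustrative sketch of how the stored intervals are folded into a single
 * cycle count for an rxq, as consumed by the rxq-to-pmd assignment logic
 * (the accessor matches a declaration further below; context simplified):
 *
 *     uint64_t total = 0;
 *
 *     for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
 *         total += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
 *     }
 */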
465
466 /* A port in a netdev-based datapath. */
467 struct dp_netdev_port {
468 odp_port_t port_no;
469 bool dynamic_txqs; /* If true XPS will be used. */
470 bool need_reconfigure; /* True if we should reconfigure netdev. */
471 struct netdev *netdev;
472 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
473 struct netdev_saved_flags *sf;
474 struct dp_netdev_rxq *rxqs;
475 unsigned n_rxq; /* Number of elements in 'rxqs' */
476 unsigned *txq_used; /* Number of threads that use each tx queue. */
477 struct ovs_mutex txq_used_mutex;
478 bool emc_enabled; /* If true EMC will be used. */
479 char *type; /* Port type as requested by user. */
480 char *rxq_affinity_list; /* Requested affinity of rx queues. */
481 };
482
483 /* Contained by struct dp_netdev_flow's 'stats' member. */
484 struct dp_netdev_flow_stats {
485 atomic_llong used; /* Last used time, in monotonic msecs. */
486 atomic_ullong packet_count; /* Number of packets matched. */
487 atomic_ullong byte_count; /* Number of bytes matched. */
488 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
489 };
490
491 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
492 *
493 *
494 * Thread-safety
495 * =============
496 *
497 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
498 * its pmd thread's classifier. The text below calls this classifier 'cls'.
499 *
500 * Motivation
501 * ----------
502 *
503 * The thread safety rules described here for "struct dp_netdev_flow" are
504 * motivated by two goals:
505 *
506 * - Prevent threads that read members of "struct dp_netdev_flow" from
507 * reading bad data due to changes by some thread concurrently modifying
508 * those members.
509 *
510 * - Prevent two threads making changes to members of a given "struct
511 * dp_netdev_flow" from interfering with each other.
512 *
513 *
514 * Rules
515 * -----
516 *
517 * A flow 'flow' may be accessed without a risk of being freed during an RCU
518 * grace period. Code that needs to hold onto a flow for a while
519 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
520 *
521 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
522 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
523 * from modification.
524 *
525 * Some members, marked 'const', are immutable. Accessing other members
526 * requires synchronization, as noted in more detail below.
527 */
528 struct dp_netdev_flow {
529 const struct flow flow; /* Unmasked flow that created this entry. */
530     /* Hash table node, indexed by the unmasked flow. */
531 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
532 /* 'flow_table'. */
533 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
534 const ovs_u128 ufid; /* Unique flow identifier. */
535 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
536 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
537 /* flow. */
538
539 /* Number of references.
540 * The classifier owns one reference.
541 * Any thread trying to keep a rule from being freed should hold its own
542 * reference. */
543 struct ovs_refcount ref_cnt;
544
545 bool dead;
546 uint32_t mark; /* Unique flow mark assigned to a flow */
547
548 /* Statistics. */
549 struct dp_netdev_flow_stats stats;
550
551 /* Actions. */
552 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
553
554 /* While processing a group of input packets, the datapath uses the next
555 * member to store a pointer to the output batch for the flow. It is
556 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
557 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
558 struct packet_batch_per_flow *batch;
559
560 /* Packet classification. */
561 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
562 /* 'cr' must be the last member. */
563 };
564
565 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
566 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
567 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
568 struct flow *, bool);
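/* A minimal sketch of the reference rules documented above, for code that
 * must keep a flow beyond the current RCU grace period (error handling
 * elided):
 *
 *     if (dp_netdev_flow_ref(flow)) {
 *         ... use 'flow'; it cannot be freed, but it may already have been
 *             removed from the classifier and its members may still change ...
 *         dp_netdev_flow_unref(flow);
 *     }
 *
 * Within a single RCU grace period no extra reference is needed.
 */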
569
570 /* A set of datapath actions within a "struct dp_netdev_flow".
571 *
572 *
573 * Thread-safety
574 * =============
575 *
576 * A struct dp_netdev_actions 'actions' is protected with RCU. */
577 struct dp_netdev_actions {
578 /* These members are immutable: they do not change during the struct's
579 * lifetime. */
580 unsigned int size; /* Size of 'actions', in bytes. */
581 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
582 };
583
584 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
585 size_t);
586 struct dp_netdev_actions *dp_netdev_flow_get_actions(
587 const struct dp_netdev_flow *);
588 static void dp_netdev_actions_free(struct dp_netdev_actions *);
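/* Illustrative sketch: reading a flow's actions under RCU protection using
 * the accessor declared above; the usage constraints are a restatement of
 * the thread-safety comment, not new rules:
 *
 *     struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
 *
 *     ... use actions->actions and actions->size only until the current
 *         thread quiesces; do not cache the pointer across quiescent states ...
 */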
589
590 struct polled_queue {
591 struct dp_netdev_rxq *rxq;
592 odp_port_t port_no;
593 bool emc_enabled;
594 bool rxq_enabled;
595 uint64_t change_seq;
596 };
597
598 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
599 struct rxq_poll {
600 struct dp_netdev_rxq *rxq;
601 struct hmap_node node;
602 };
603
604 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
605 * 'tnl_port_cache' or 'tx_ports'. */
606 struct tx_port {
607 struct dp_netdev_port *port;
608 int qid;
609 long long last_used;
610 struct hmap_node node;
611 long long flush_time;
612 struct dp_packet_batch output_pkts;
613 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
614 };
615
616 /* A set of properties for the current processing loop that is not directly
617 * associated with the pmd thread itself, but with the packets being
618 * processed or the short-term system configuration (for example, time).
619 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
620 struct dp_netdev_pmd_thread_ctx {
621 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
622 long long now;
623 /* RX queue from which last packet was received. */
624 struct dp_netdev_rxq *last_rxq;
625 /* EMC insertion probability context for the current processing cycle. */
626 uint32_t emc_insert_min;
627 };
628
629 /* PMD: Poll mode drivers.  A PMD accesses devices via polling to eliminate
630  * the performance overhead of interrupt processing.  Therefore netdev can
631  * not implement rx-wait for these devices.  dpif-netdev needs to poll
632  * these devices to check their receive buffers.  A pmd thread polls the
633  * devices assigned to it.
634  *
635  * DPDK uses PMDs to access NICs.
636  *
637  * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
638  * I/O of all non-pmd threads.  No actual thread is created for that
639  * instance.
640  *
641  * Each struct has its own flow cache and a classifier per managed ingress
642  * port.  For packets received on an ingress port, a lookup is done in the
643  * corresponding PMD thread's flow cache and, in case of a miss, in the
644  * corresponding classifier of that port.  In either case the packets are
645  * executed with the actions found.
646  */
647 struct dp_netdev_pmd_thread {
648 struct dp_netdev *dp;
649 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
650 struct cmap_node node; /* In 'dp->poll_threads'. */
651
652 /* Per thread exact-match cache. Note, the instance for cpu core
653      * NON_PMD_CORE_ID can be accessed by multiple threads, and thus
654      * needs to be protected by 'non_pmd_mutex'.  Every other instance
655 * will only be accessed by its own pmd thread. */
656 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
657
658 /* Flow-Table and classifiers
659 *
660 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
661 * changes to 'classifiers' must be made while still holding the
662 * 'flow_mutex'.
663 */
664 struct ovs_mutex flow_mutex;
665 struct cmap flow_table OVS_GUARDED; /* Flow table. */
666
667 /* One classifier per in_port polled by the pmd */
668 struct cmap classifiers;
669 /* Periodically sort subtable vectors according to hit frequencies */
670 long long int next_optimization;
671 /* End of the next time interval for which processing cycles
672 are stored for each polled rxq. */
673 long long int rxq_next_cycle_store;
674
675 /* Last interval timestamp. */
676 uint64_t intrvl_tsc_prev;
677 /* Last interval cycles. */
678 atomic_ullong intrvl_cycles;
679
680 /* Current context of the PMD thread. */
681 struct dp_netdev_pmd_thread_ctx ctx;
682
683 struct seq *reload_seq;
684 uint64_t last_reload_seq;
685
686 /* These are atomic variables used as a synchronization and configuration
687 * points for thread reload/exit.
688 *
689 * 'reload' atomic is the main one and it's used as a memory
690 * synchronization point for all other knobs and data.
691 *
692 * For a thread that requests PMD reload:
693 *
694 * * All changes that should be visible to the PMD thread must be made
695 * before setting the 'reload'. These changes could use any memory
696 * ordering model including 'relaxed'.
697 * * Setting the 'reload' atomic should occur in the same thread where
698      *    all other PMD configuration options are updated.
699 * * Setting the 'reload' atomic should be done with 'release' memory
700 * ordering model or stricter. This will guarantee that all previous
701 * changes (including non-atomic and 'relaxed') will be visible to
702 * the PMD thread.
703      *  * To check that the reload is done, the thread should poll the 'reload'
704      *    atomic until it becomes 'false'.  Polling should be done with 'acquire'
705      *    memory ordering model or stricter.  This ensures that the PMD thread
706      *    has completed the reload process.
707 *
708 * For the PMD thread:
709 *
710 * * PMD thread should read 'reload' atomic with 'acquire' memory
711 * ordering model or stricter. This will guarantee that all changes
712 * made before setting the 'reload' in the requesting thread will be
713 * visible to the PMD thread.
714 * * All other configuration data could be read with any memory
715 * ordering model (including non-atomic and 'relaxed') but *only after*
716 * reading the 'reload' atomic set to 'true'.
717      *  * When the PMD reload is done, the PMD should (optionally) set all the
718      *    below knobs except the 'reload' to their default ('false') values and
719      *    (mandatory), as the last step, set 'reload' to 'false' using the
720      *    'release' memory ordering model or stricter.  This informs the
721      *    requesting thread that the PMD has completed a reload cycle.
722 */
723 atomic_bool reload; /* Do we need to reload ports? */
724 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
725 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
726 atomic_bool exit; /* For terminating the pmd thread. */
727
728 pthread_t thread;
729 unsigned core_id; /* CPU core id of this pmd thread. */
730 int numa_id; /* numa node id of this pmd thread. */
731 bool isolated;
732
733 /* Queue id used by this pmd thread to send packets on all netdevs if
734      * XPS is disabled for that netdev.  All static_tx_qid's are unique and less
735 * than 'cmap_count(dp->poll_threads)'. */
736 uint32_t static_tx_qid;
737
738 /* Number of filled output batches. */
739 int n_output_batches;
740
741 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
742 /* List of rx queues to poll. */
743 struct hmap poll_list OVS_GUARDED;
744 /* Map of 'tx_port's used for transmission. Written by the main thread,
745 * read by the pmd thread. */
746 struct hmap tx_ports OVS_GUARDED;
747
748 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
749 * ports (that support push_tunnel/pop_tunnel), the other contains ports
750 * with at least one txq (that support send). A port can be in both.
751 *
752 * There are two separate maps to make sure that we don't try to execute
753 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
754 *
755 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
756      * threads, and thus need to be protected by 'non_pmd_mutex'.  Every
757 * other instance will only be accessed by its own pmd thread. */
758 struct hmap tnl_port_cache;
759 struct hmap send_port_cache;
760
761 /* Keep track of detailed PMD performance statistics. */
762 struct pmd_perf_stats perf_stats;
763
764 /* Stats from previous iteration used by automatic pmd
765 * load balance logic. */
766 uint64_t prev_stats[PMD_N_STATS];
767 atomic_count pmd_overloaded;
768
769 /* Set to true if the pmd thread needs to be reloaded. */
770 bool need_reload;
771 };
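/* A high-level sketch of the per-packet lookup cascade described in the
 * comment above (EMC, then SMC when enabled, then the per-in_port
 * classifier, then upcall).  The real code batches packets at every stage;
 * smc_lookup() and the classifier selection below are named or paraphrased
 * hypothetically:
 *
 *     flow = emc_lookup(&pmd->flow_cache.emc_cache, key);
 *     if (!flow && smc_enabled) {
 *         flow = smc_lookup(&pmd->flow_cache.smc_cache, key);
 *     }
 *     if (!flow) {
 *         const struct netdev_flow_key *keys[1] = { key };
 *         struct dpcls_rule *rule = NULL;
 *         struct dpcls *cls = ... classifier for the packet's in_port ...;
 *
 *         if (dpcls_lookup(cls, keys, &rule, 1, NULL)) {
 *             ... 'rule' belongs to the matching dp_netdev_flow ...
 *         } else {
 *             ... miss: invoke 'upcall_cb' and install the resulting flow ...
 *         }
 *     }
 */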
772
773 /* Interface to netdev-based datapath. */
774 struct dpif_netdev {
775 struct dpif dpif;
776 struct dp_netdev *dp;
777 uint64_t last_port_seq;
778 };
779
780 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
781 struct dp_netdev_port **portp)
782 OVS_REQUIRES(dp->port_mutex);
783 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
784 struct dp_netdev_port **portp)
785 OVS_REQUIRES(dp->port_mutex);
786 static void dp_netdev_free(struct dp_netdev *)
787 OVS_REQUIRES(dp_netdev_mutex);
788 static int do_add_port(struct dp_netdev *dp, const char *devname,
789 const char *type, odp_port_t port_no)
790 OVS_REQUIRES(dp->port_mutex);
791 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
792 OVS_REQUIRES(dp->port_mutex);
793 static int dpif_netdev_open(const struct dpif_class *, const char *name,
794 bool create, struct dpif **);
795 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
796 struct dp_packet_batch *,
797 bool should_steal,
798 const struct flow *flow,
799 const struct nlattr *actions,
800 size_t actions_len);
801 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
802 struct dp_packet_batch *, odp_port_t port_no);
803 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
804 struct dp_packet_batch *);
805
806 static void dp_netdev_disable_upcall(struct dp_netdev *);
807 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
808 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
809 struct dp_netdev *dp, unsigned core_id,
810 int numa_id);
811 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
812 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
813 OVS_REQUIRES(dp->port_mutex);
814
815 static void *pmd_thread_main(void *);
816 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
817 unsigned core_id);
818 static struct dp_netdev_pmd_thread *
819 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
820 static void dp_netdev_del_pmd(struct dp_netdev *dp,
821 struct dp_netdev_pmd_thread *pmd);
822 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
823 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
824 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
825 struct dp_netdev_port *port)
826 OVS_REQUIRES(pmd->port_mutex);
827 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
828 struct tx_port *tx)
829 OVS_REQUIRES(pmd->port_mutex);
830 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
831 struct dp_netdev_rxq *rxq)
832 OVS_REQUIRES(pmd->port_mutex);
833 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
834 struct rxq_poll *poll)
835 OVS_REQUIRES(pmd->port_mutex);
836 static int
837 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
838 bool force);
839
840 static void reconfigure_datapath(struct dp_netdev *dp)
841 OVS_REQUIRES(dp->port_mutex);
842 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
843 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
844 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
845 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
846 OVS_REQUIRES(pmd->port_mutex);
847 static inline void
848 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
849 struct polled_queue *poll_list, int poll_cnt);
850 static void
851 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
852 enum rxq_cycles_counter_type type,
853 unsigned long long cycles);
854 static uint64_t
855 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
856 enum rxq_cycles_counter_type type);
857 static void
858 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
859 unsigned long long cycles);
860 static uint64_t
861 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
862 static void
863 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
864 bool purge);
865 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
866 struct tx_port *tx);
867
868 static inline bool emc_entry_alive(struct emc_entry *ce);
869 static void emc_clear_entry(struct emc_entry *ce);
870 static void smc_clear_entry(struct smc_bucket *b, int idx);
871
872 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
873 static inline bool
874 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
875 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
876 struct dp_netdev_flow *flow);
877
878 static void
879 emc_cache_init(struct emc_cache *flow_cache)
880 {
881 int i;
882
883 flow_cache->sweep_idx = 0;
884 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
885 flow_cache->entries[i].flow = NULL;
886 flow_cache->entries[i].key.hash = 0;
887 flow_cache->entries[i].key.len = sizeof(struct miniflow);
888 flowmap_init(&flow_cache->entries[i].key.mf.map);
889 }
890 }
891
892 static void
893 smc_cache_init(struct smc_cache *smc_cache)
894 {
895 int i, j;
896 for (i = 0; i < SMC_BUCKET_CNT; i++) {
897 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
898 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
899 }
900 }
901 }
902
903 static void
904 dfc_cache_init(struct dfc_cache *flow_cache)
905 {
906 emc_cache_init(&flow_cache->emc_cache);
907 smc_cache_init(&flow_cache->smc_cache);
908 }
909
910 static void
911 emc_cache_uninit(struct emc_cache *flow_cache)
912 {
913 int i;
914
915 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
916 emc_clear_entry(&flow_cache->entries[i]);
917 }
918 }
919
920 static void
921 smc_cache_uninit(struct smc_cache *smc)
922 {
923 int i, j;
924
925 for (i = 0; i < SMC_BUCKET_CNT; i++) {
926 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
927 smc_clear_entry(&(smc->buckets[i]), j);
928 }
929 }
930 }
931
932 static void
933 dfc_cache_uninit(struct dfc_cache *flow_cache)
934 {
935 smc_cache_uninit(&flow_cache->smc_cache);
936 emc_cache_uninit(&flow_cache->emc_cache);
937 }
938
939 /* Check and clear dead flow references slowly (one entry at each
940 * invocation). */
941 static void
942 emc_cache_slow_sweep(struct emc_cache *flow_cache)
943 {
944 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
945
946 if (!emc_entry_alive(entry)) {
947 emc_clear_entry(entry);
948 }
949 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
950 }
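/* A minimal sketch of how the slow sweep is meant to be driven, e.g. from
 * the pmd main loop every so many iterations; the counter and threshold are
 * illustrative, not taken from this file:
 *
 *     if (lc++ > 1024) {
 *         lc = 0;
 *         emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
 *     }
 *
 * Sweeping one entry per call keeps the cost per iteration negligible while
 * still reclaiming entries whose flows have died.
 */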
951
952 /* Updates the time in the PMD thread's context and should be called in three cases:
953 *
954 * 1. PMD structure initialization:
955 * - dp_netdev_configure_pmd()
956 *
957 * 2. Before processing of the new packet batch:
958 * - dpif_netdev_execute()
959 * - dp_netdev_process_rxq_port()
960 *
961 * 3. At least once per polling iteration in main polling threads if no
962  *    packets were received in the current iteration:
963 * - dpif_netdev_run()
964 * - pmd_thread_main()
965 *
966 * 'pmd->ctx.now' should be used without update in all other cases if possible.
967 */
968 static inline void
969 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
970 {
971 pmd->ctx.now = time_usec();
972 }
973
974 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
975 bool
976 dpif_is_netdev(const struct dpif *dpif)
977 {
978 return dpif->dpif_class->open == dpif_netdev_open;
979 }
980
981 static struct dpif_netdev *
982 dpif_netdev_cast(const struct dpif *dpif)
983 {
984 ovs_assert(dpif_is_netdev(dpif));
985 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
986 }
987
988 static struct dp_netdev *
989 get_dp_netdev(const struct dpif *dpif)
990 {
991 return dpif_netdev_cast(dpif)->dp;
992 }
993 \f
994 enum pmd_info_type {
995 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
996 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
997 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
998 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
999 };
1000
1001 static void
1002 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1003 {
1004 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
1005 ? "main thread" : "pmd thread");
1006 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1007 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1008 }
1009 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1010 ds_put_format(reply, " core_id %u", pmd->core_id);
1011 }
1012 ds_put_cstr(reply, ":\n");
1013 }
1014
1015 static void
1016 pmd_info_show_stats(struct ds *reply,
1017 struct dp_netdev_pmd_thread *pmd)
1018 {
1019 uint64_t stats[PMD_N_STATS];
1020 uint64_t total_cycles, total_packets;
1021 double passes_per_pkt = 0;
1022 double lookups_per_hit = 0;
1023 double packets_per_batch = 0;
1024
1025 pmd_perf_read_counters(&pmd->perf_stats, stats);
1026 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1027 + stats[PMD_CYCLES_ITER_BUSY];
1028 total_packets = stats[PMD_STAT_RECV];
1029
1030 format_pmd_thread(reply, pmd);
1031
1032 if (total_packets > 0) {
1033 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1034 / (double) total_packets;
1035 }
1036 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1037 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1038 / (double) stats[PMD_STAT_MASKED_HIT];
1039 }
1040 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1041 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1042 / (double) stats[PMD_STAT_SENT_BATCHES];
1043 }
1044
1045 ds_put_format(reply,
1046 " packets received: %"PRIu64"\n"
1047 " packet recirculations: %"PRIu64"\n"
1048 " avg. datapath passes per packet: %.02f\n"
1049 " emc hits: %"PRIu64"\n"
1050 " smc hits: %"PRIu64"\n"
1051 " megaflow hits: %"PRIu64"\n"
1052 " avg. subtable lookups per megaflow hit: %.02f\n"
1053 " miss with success upcall: %"PRIu64"\n"
1054 " miss with failed upcall: %"PRIu64"\n"
1055 " avg. packets per output batch: %.02f\n",
1056 total_packets, stats[PMD_STAT_RECIRC],
1057 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1058 stats[PMD_STAT_SMC_HIT],
1059 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1060 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1061 packets_per_batch);
1062
1063 if (total_cycles == 0) {
1064 return;
1065 }
1066
1067 ds_put_format(reply,
1068 " idle cycles: %"PRIu64" (%.02f%%)\n"
1069 " processing cycles: %"PRIu64" (%.02f%%)\n",
1070 stats[PMD_CYCLES_ITER_IDLE],
1071 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1072 stats[PMD_CYCLES_ITER_BUSY],
1073 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1074
1075 if (total_packets == 0) {
1076 return;
1077 }
1078
1079 ds_put_format(reply,
1080 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1081 total_cycles / (double) total_packets,
1082 total_cycles, total_packets);
1083
1084 ds_put_format(reply,
1085 " avg processing cycles per packet: "
1086 "%.02f (%"PRIu64"/%"PRIu64")\n",
1087 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1088 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1089 }
1090
1091 static void
1092 pmd_info_show_perf(struct ds *reply,
1093 struct dp_netdev_pmd_thread *pmd,
1094 struct pmd_perf_params *par)
1095 {
1096 if (pmd->core_id != NON_PMD_CORE_ID) {
1097 char *time_str =
1098 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1099 long long now = time_msec();
1100 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1101
1102 ds_put_cstr(reply, "\n");
1103 ds_put_format(reply, "Time: %s\n", time_str);
1104 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1105 ds_put_cstr(reply, "\n");
1106 format_pmd_thread(reply, pmd);
1107 ds_put_cstr(reply, "\n");
1108 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1109 if (pmd_perf_metrics_enabled(pmd)) {
1110 /* Prevent parallel clearing of perf metrics. */
1111 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1112 if (par->histograms) {
1113 ds_put_cstr(reply, "\n");
1114 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1115 }
1116 if (par->iter_hist_len > 0) {
1117 ds_put_cstr(reply, "\n");
1118 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1119 par->iter_hist_len);
1120 }
1121 if (par->ms_hist_len > 0) {
1122 ds_put_cstr(reply, "\n");
1123 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1124 par->ms_hist_len);
1125 }
1126 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1127 }
1128 free(time_str);
1129 }
1130 }
1131
1132 static int
1133 compare_poll_list(const void *a_, const void *b_)
1134 {
1135 const struct rxq_poll *a = a_;
1136 const struct rxq_poll *b = b_;
1137
1138 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1139 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1140
1141 int cmp = strcmp(namea, nameb);
1142 if (!cmp) {
1143 return netdev_rxq_get_queue_id(a->rxq->rx)
1144 - netdev_rxq_get_queue_id(b->rxq->rx);
1145 } else {
1146 return cmp;
1147 }
1148 }
1149
1150 static void
1151 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1152 size_t *n)
1153 OVS_REQUIRES(pmd->port_mutex)
1154 {
1155 struct rxq_poll *ret, *poll;
1156 size_t i;
1157
1158 *n = hmap_count(&pmd->poll_list);
1159 if (!*n) {
1160 ret = NULL;
1161 } else {
1162 ret = xcalloc(*n, sizeof *ret);
1163 i = 0;
1164 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1165 ret[i] = *poll;
1166 i++;
1167 }
1168 ovs_assert(i == *n);
1169 qsort(ret, *n, sizeof *ret, compare_poll_list);
1170 }
1171
1172 *list = ret;
1173 }
1174
1175 static void
1176 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1177 {
1178 if (pmd->core_id != NON_PMD_CORE_ID) {
1179 struct rxq_poll *list;
1180 size_t n_rxq;
1181 uint64_t total_cycles = 0;
1182
1183 ds_put_format(reply,
1184 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1185 pmd->numa_id, pmd->core_id, (pmd->isolated)
1186 ? "true" : "false");
1187
1188 ovs_mutex_lock(&pmd->port_mutex);
1189 sorted_poll_list(pmd, &list, &n_rxq);
1190
1191 /* Get the total pmd cycles for an interval. */
1192 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1193 /* Estimate the cycles to cover all intervals. */
1194 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1195
1196 for (int i = 0; i < n_rxq; i++) {
1197 struct dp_netdev_rxq *rxq = list[i].rxq;
1198 const char *name = netdev_rxq_get_name(rxq->rx);
1199 uint64_t proc_cycles = 0;
1200
1201 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1202 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1203 }
1204 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1205 netdev_rxq_get_queue_id(list[i].rxq->rx));
1206 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1207 ? "(enabled) " : "(disabled)");
1208 ds_put_format(reply, " pmd usage: ");
1209 if (total_cycles) {
1210 ds_put_format(reply, "%2"PRIu64"",
1211 proc_cycles * 100 / total_cycles);
1212 ds_put_cstr(reply, " %");
1213 } else {
1214 ds_put_format(reply, "%s", "NOT AVAIL");
1215 }
1216 ds_put_cstr(reply, "\n");
1217 }
1218 ovs_mutex_unlock(&pmd->port_mutex);
1219 free(list);
1220 }
1221 }
1222
1223 static int
1224 compare_poll_thread_list(const void *a_, const void *b_)
1225 {
1226 const struct dp_netdev_pmd_thread *a, *b;
1227
1228 a = *(struct dp_netdev_pmd_thread **)a_;
1229 b = *(struct dp_netdev_pmd_thread **)b_;
1230
1231 if (a->core_id < b->core_id) {
1232 return -1;
1233 }
1234 if (a->core_id > b->core_id) {
1235 return 1;
1236 }
1237 return 0;
1238 }
1239
1240 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1241 * this list, as long as we do not go to quiescent state. */
1242 static void
1243 sorted_poll_thread_list(struct dp_netdev *dp,
1244 struct dp_netdev_pmd_thread ***list,
1245 size_t *n)
1246 {
1247 struct dp_netdev_pmd_thread *pmd;
1248 struct dp_netdev_pmd_thread **pmd_list;
1249 size_t k = 0, n_pmds;
1250
1251 n_pmds = cmap_count(&dp->poll_threads);
1252 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1253
1254 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1255 if (k >= n_pmds) {
1256 break;
1257 }
1258 pmd_list[k++] = pmd;
1259 }
1260
1261 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1262
1263 *list = pmd_list;
1264 *n = k;
1265 }
1266
1267 static void
1268 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1269 const char *argv[], void *aux OVS_UNUSED)
1270 {
1271 struct ds reply = DS_EMPTY_INITIALIZER;
1272 struct dp_netdev *dp = NULL;
1273
1274 ovs_mutex_lock(&dp_netdev_mutex);
1275
1276 if (argc == 2) {
1277 dp = shash_find_data(&dp_netdevs, argv[1]);
1278 } else if (shash_count(&dp_netdevs) == 1) {
1279 /* There's only one datapath */
1280 dp = shash_first(&dp_netdevs)->data;
1281 }
1282
1283 if (!dp) {
1284 ovs_mutex_unlock(&dp_netdev_mutex);
1285 unixctl_command_reply_error(conn,
1286 "please specify an existing datapath");
1287 return;
1288 }
1289
1290 dp_netdev_request_reconfigure(dp);
1291 ovs_mutex_unlock(&dp_netdev_mutex);
1292 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1293 unixctl_command_reply(conn, ds_cstr(&reply));
1294 ds_destroy(&reply);
1295 }
1296
1297 static void
1298 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1299 void *aux)
1300 {
1301 struct ds reply = DS_EMPTY_INITIALIZER;
1302 struct dp_netdev_pmd_thread **pmd_list;
1303 struct dp_netdev *dp = NULL;
1304 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1305 unsigned int core_id;
1306 bool filter_on_pmd = false;
1307 size_t n;
1308
1309 ovs_mutex_lock(&dp_netdev_mutex);
1310
1311 while (argc > 1) {
1312 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1313 if (str_to_uint(argv[2], 10, &core_id)) {
1314 filter_on_pmd = true;
1315 }
1316 argc -= 2;
1317 argv += 2;
1318 } else {
1319 dp = shash_find_data(&dp_netdevs, argv[1]);
1320 argc -= 1;
1321 argv += 1;
1322 }
1323 }
1324
1325 if (!dp) {
1326 if (shash_count(&dp_netdevs) == 1) {
1327 /* There's only one datapath */
1328 dp = shash_first(&dp_netdevs)->data;
1329 } else {
1330 ovs_mutex_unlock(&dp_netdev_mutex);
1331 unixctl_command_reply_error(conn,
1332 "please specify an existing datapath");
1333 return;
1334 }
1335 }
1336
1337 sorted_poll_thread_list(dp, &pmd_list, &n);
1338 for (size_t i = 0; i < n; i++) {
1339 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1340 if (!pmd) {
1341 break;
1342 }
1343 if (filter_on_pmd && pmd->core_id != core_id) {
1344 continue;
1345 }
1346 if (type == PMD_INFO_SHOW_RXQ) {
1347 pmd_info_show_rxq(&reply, pmd);
1348 } else if (type == PMD_INFO_CLEAR_STATS) {
1349 pmd_perf_stats_clear(&pmd->perf_stats);
1350 } else if (type == PMD_INFO_SHOW_STATS) {
1351 pmd_info_show_stats(&reply, pmd);
1352 } else if (type == PMD_INFO_PERF_SHOW) {
1353 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1354 }
1355 }
1356 free(pmd_list);
1357
1358 ovs_mutex_unlock(&dp_netdev_mutex);
1359
1360 unixctl_command_reply(conn, ds_cstr(&reply));
1361 ds_destroy(&reply);
1362 }
1363
1364 static void
1365 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1366 const char *argv[],
1367 void *aux OVS_UNUSED)
1368 {
1369 struct pmd_perf_params par;
1370 long int it_hist = 0, ms_hist = 0;
1371 par.histograms = true;
1372
1373 while (argc > 1) {
1374 if (!strcmp(argv[1], "-nh")) {
1375 par.histograms = false;
1376 argc -= 1;
1377 argv += 1;
1378 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1379 it_hist = strtol(argv[2], NULL, 10);
1380 if (it_hist < 0) {
1381 it_hist = 0;
1382 } else if (it_hist > HISTORY_LEN) {
1383 it_hist = HISTORY_LEN;
1384 }
1385 argc -= 2;
1386 argv += 2;
1387 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1388 ms_hist = strtol(argv[2], NULL, 10);
1389 if (ms_hist < 0) {
1390 ms_hist = 0;
1391 } else if (ms_hist > HISTORY_LEN) {
1392 ms_hist = HISTORY_LEN;
1393 }
1394 argc -= 2;
1395 argv += 2;
1396 } else {
1397 break;
1398 }
1399 }
1400 par.iter_hist_len = it_hist;
1401 par.ms_hist_len = ms_hist;
1402 par.command_type = PMD_INFO_PERF_SHOW;
1403 dpif_netdev_pmd_info(conn, argc, argv, &par);
1404 }
1405 \f
1406 static int
1407 dpif_netdev_init(void)
1408 {
1409 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1410 clear_aux = PMD_INFO_CLEAR_STATS,
1411 poll_aux = PMD_INFO_SHOW_RXQ;
1412
1413 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1414 0, 3, dpif_netdev_pmd_info,
1415 (void *)&show_aux);
1416 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1417 0, 3, dpif_netdev_pmd_info,
1418 (void *)&clear_aux);
1419 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1420 0, 3, dpif_netdev_pmd_info,
1421 (void *)&poll_aux);
1422 unixctl_command_register("dpif-netdev/pmd-perf-show",
1423 "[-nh] [-it iter-history-len]"
1424 " [-ms ms-history-len]"
1425 " [-pmd core] [dp]",
1426 0, 8, pmd_perf_show_cmd,
1427 NULL);
1428 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1429 0, 1, dpif_netdev_pmd_rebalance,
1430 NULL);
1431 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1432 "on|off [-b before] [-a after] [-e|-ne] "
1433 "[-us usec] [-q qlen]",
1434 0, 10, pmd_perf_log_set_cmd,
1435 NULL);
1436 return 0;
1437 }
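/* Example invocations of the commands registered above, via ovs-appctl:
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
 *     ovs-appctl dpif-netdev/pmd-perf-show -it 10 -ms 5
 *     ovs-appctl dpif-netdev/pmd-stats-clear
 *
 * The optional trailing [dp] argument selects a datapath when more than one
 * exists; with a single datapath it can be omitted.
 */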
1438
1439 static int
1440 dpif_netdev_enumerate(struct sset *all_dps,
1441 const struct dpif_class *dpif_class)
1442 {
1443 struct shash_node *node;
1444
1445 ovs_mutex_lock(&dp_netdev_mutex);
1446 SHASH_FOR_EACH(node, &dp_netdevs) {
1447 struct dp_netdev *dp = node->data;
1448 if (dpif_class != dp->class) {
1449 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1450 * If the class doesn't match, skip this dpif. */
1451 continue;
1452 }
1453 sset_add(all_dps, node->name);
1454 }
1455 ovs_mutex_unlock(&dp_netdev_mutex);
1456
1457 return 0;
1458 }
1459
1460 static bool
1461 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1462 {
1463 return class != &dpif_netdev_class;
1464 }
1465
1466 static const char *
1467 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1468 {
1469 return strcmp(type, "internal") ? type
1470 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1471 : "tap";
1472 }
1473
1474 static struct dpif *
1475 create_dpif_netdev(struct dp_netdev *dp)
1476 {
1477 uint16_t netflow_id = hash_string(dp->name, 0);
1478 struct dpif_netdev *dpif;
1479
1480 ovs_refcount_ref(&dp->ref_cnt);
1481
1482 dpif = xmalloc(sizeof *dpif);
1483 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1484 dpif->dp = dp;
1485 dpif->last_port_seq = seq_read(dp->port_seq);
1486
1487 return &dpif->dpif;
1488 }
1489
1490 /* Choose an unused, non-zero port number and return it on success.
1491 * Return ODPP_NONE on failure. */
1492 static odp_port_t
1493 choose_port(struct dp_netdev *dp, const char *name)
1494 OVS_REQUIRES(dp->port_mutex)
1495 {
1496 uint32_t port_no;
1497
1498 if (dp->class != &dpif_netdev_class) {
1499 const char *p;
1500 int start_no = 0;
1501
1502 /* If the port name begins with "br", start the number search at
1503 * 100 to make writing tests easier. */
1504 if (!strncmp(name, "br", 2)) {
1505 start_no = 100;
1506 }
1507
1508 /* If the port name contains a number, try to assign that port number.
1509 * This can make writing unit tests easier because port numbers are
1510 * predictable. */
1511 for (p = name; *p != '\0'; p++) {
1512 if (isdigit((unsigned char) *p)) {
1513 port_no = start_no + strtol(p, NULL, 10);
1514 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1515 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1516 return u32_to_odp(port_no);
1517 }
1518 break;
1519 }
1520 }
1521 }
1522
1523 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1524 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1525 return u32_to_odp(port_no);
1526 }
1527 }
1528
1529 return ODPP_NONE;
1530 }
1531
1532 static int
1533 create_dp_netdev(const char *name, const struct dpif_class *class,
1534 struct dp_netdev **dpp)
1535 OVS_REQUIRES(dp_netdev_mutex)
1536 {
1537 struct dp_netdev *dp;
1538 int error;
1539
1540 dp = xzalloc(sizeof *dp);
1541 shash_add(&dp_netdevs, name, dp);
1542
1543 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1544 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1545 ovs_refcount_init(&dp->ref_cnt);
1546 atomic_flag_clear(&dp->destroyed);
1547
1548 ovs_mutex_init(&dp->port_mutex);
1549 hmap_init(&dp->ports);
1550 dp->port_seq = seq_create();
1551 fat_rwlock_init(&dp->upcall_rwlock);
1552
1553 dp->reconfigure_seq = seq_create();
1554 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1555
1556 for (int i = 0; i < N_METER_LOCKS; ++i) {
1557 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1558 }
1559
1560 /* Disable upcalls by default. */
1561 dp_netdev_disable_upcall(dp);
1562 dp->upcall_aux = NULL;
1563 dp->upcall_cb = NULL;
1564
1565 dp->conntrack = conntrack_init();
1566
1567 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1568 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1569
1570 cmap_init(&dp->poll_threads);
1571 dp->pmd_rxq_assign_cyc = true;
1572
1573 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1574 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1575 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1576
1577 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1578 ovsthread_key_create(&dp->per_pmd_key, NULL);
1579
1580 ovs_mutex_lock(&dp->port_mutex);
1581 /* non-PMD will be created before all other threads and will
1582 * allocate static_tx_qid = 0. */
1583 dp_netdev_set_nonpmd(dp);
1584
1585 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1586 "internal"),
1587 ODPP_LOCAL);
1588 ovs_mutex_unlock(&dp->port_mutex);
1589 if (error) {
1590 dp_netdev_free(dp);
1591 return error;
1592 }
1593
1594 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1595 *dpp = dp;
1596 return 0;
1597 }
1598
1599 static void
1600 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1601 {
1602 seq_change(dp->reconfigure_seq);
1603 }
1604
1605 static bool
1606 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1607 {
1608 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1609 }
1610
1611 static int
1612 dpif_netdev_open(const struct dpif_class *class, const char *name,
1613 bool create, struct dpif **dpifp)
1614 {
1615 struct dp_netdev *dp;
1616 int error;
1617
1618 ovs_mutex_lock(&dp_netdev_mutex);
1619 dp = shash_find_data(&dp_netdevs, name);
1620 if (!dp) {
1621 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1622 } else {
1623 error = (dp->class != class ? EINVAL
1624 : create ? EEXIST
1625 : 0);
1626 }
1627 if (!error) {
1628 *dpifp = create_dpif_netdev(dp);
1629 dp->dpif = *dpifp;
1630 }
1631 ovs_mutex_unlock(&dp_netdev_mutex);
1632
1633 return error;
1634 }
1635
1636 static void
1637 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1638 OVS_NO_THREAD_SAFETY_ANALYSIS
1639 {
1640 /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1641 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1642
1643 /* Before freeing a lock we should release it */
1644 fat_rwlock_unlock(&dp->upcall_rwlock);
1645 fat_rwlock_destroy(&dp->upcall_rwlock);
1646 }
1647
1648 static void
1649 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1650 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1651 {
1652 if (dp->meters[meter_id]) {
1653 free(dp->meters[meter_id]);
1654 dp->meters[meter_id] = NULL;
1655 }
1656 }
1657
1658 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1659 * through the 'dp_netdevs' shash while freeing 'dp'. */
1660 static void
1661 dp_netdev_free(struct dp_netdev *dp)
1662 OVS_REQUIRES(dp_netdev_mutex)
1663 {
1664 struct dp_netdev_port *port, *next;
1665
1666 shash_find_and_delete(&dp_netdevs, dp->name);
1667
1668 ovs_mutex_lock(&dp->port_mutex);
1669 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1670 do_del_port(dp, port);
1671 }
1672 ovs_mutex_unlock(&dp->port_mutex);
1673
1674 dp_netdev_destroy_all_pmds(dp, true);
1675 cmap_destroy(&dp->poll_threads);
1676
1677 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1678 id_pool_destroy(dp->tx_qid_pool);
1679
1680 ovs_mutex_destroy(&dp->non_pmd_mutex);
1681 ovsthread_key_delete(dp->per_pmd_key);
1682
1683 conntrack_destroy(dp->conntrack);
1684
1685
1686 seq_destroy(dp->reconfigure_seq);
1687
1688 seq_destroy(dp->port_seq);
1689 hmap_destroy(&dp->ports);
1690 ovs_mutex_destroy(&dp->port_mutex);
1691
1692 /* Upcalls must be disabled at this point */
1693 dp_netdev_destroy_upcall_lock(dp);
1694
1695 int i;
1696
1697 for (i = 0; i < MAX_METERS; ++i) {
1698 meter_lock(dp, i);
1699 dp_delete_meter(dp, i);
1700 meter_unlock(dp, i);
1701 }
1702 for (i = 0; i < N_METER_LOCKS; ++i) {
1703 ovs_mutex_destroy(&dp->meter_locks[i]);
1704 }
1705
1706 free(dp->pmd_cmask);
1707 free(CONST_CAST(char *, dp->name));
1708 free(dp);
1709 }
1710
1711 static void
1712 dp_netdev_unref(struct dp_netdev *dp)
1713 {
1714 if (dp) {
1715 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1716 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1717 ovs_mutex_lock(&dp_netdev_mutex);
1718 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1719 dp_netdev_free(dp);
1720 }
1721 ovs_mutex_unlock(&dp_netdev_mutex);
1722 }
1723 }
1724
1725 static void
1726 dpif_netdev_close(struct dpif *dpif)
1727 {
1728 struct dp_netdev *dp = get_dp_netdev(dpif);
1729
1730 dp_netdev_unref(dp);
1731 free(dpif);
1732 }
1733
1734 static int
1735 dpif_netdev_destroy(struct dpif *dpif)
1736 {
1737 struct dp_netdev *dp = get_dp_netdev(dpif);
1738
1739 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1740 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1741 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1742 OVS_NOT_REACHED();
1743 }
1744 }
1745
1746 return 0;
1747 }
1748
1749 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1750 * load/store semantics. While the increment is not atomic, the load and
1751 * store operations are, making it impossible to read inconsistent values.
1752 *
1753 * This is used to update thread-local stats counters. */
1754 static void
1755 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1756 {
1757 unsigned long long tmp;
1758
1759 atomic_read_relaxed(var, &tmp);
1760 tmp += n;
1761 atomic_store_relaxed(var, tmp);
1762 }
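
/* Usage sketch (illustrative; 'n_hits' and 'batch_size' are hypothetical
 * names): only the owning thread updates the counter, while any thread may
 * read it with atomic_read_relaxed() and sees a possibly stale, but never
 * torn, value.
 *
 *     static atomic_ullong n_hits = ATOMIC_VAR_INIT(0);
 *
 *     non_atomic_ullong_add(&n_hits, batch_size);     owner thread only
 *
 *     unsigned long long snapshot;
 *     atomic_read_relaxed(&n_hits, &snapshot);        any thread
 */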
1763
1764 static int
1765 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1766 {
1767 struct dp_netdev *dp = get_dp_netdev(dpif);
1768 struct dp_netdev_pmd_thread *pmd;
1769 uint64_t pmd_stats[PMD_N_STATS];
1770
1771 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1772 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1773 stats->n_flows += cmap_count(&pmd->flow_table);
1774 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1775 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1776 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1777 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1778 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1779 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1780 }
1781 stats->n_masks = UINT32_MAX;
1782 stats->n_mask_hit = UINT64_MAX;
1783
1784 return 0;
1785 }
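
/* Worked example with hypothetical numbers: two PMD threads reporting
 * (exact, SMC, masked) hits of (100, 20, 30) and (50, 0, 10) plus misses of
 * 5 and 1 aggregate to n_hit = 210 and n_missed = 6.  n_masks and n_mask_hit
 * are reported as "unknown" (UINT32_MAX and UINT64_MAX) because the
 * userspace datapath does not expose per-mask counters here. */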
1786
1787 static void
1788 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1789 {
1790 if (pmd->core_id == NON_PMD_CORE_ID) {
1791 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1792 ovs_mutex_lock(&pmd->port_mutex);
1793 pmd_load_cached_ports(pmd);
1794 ovs_mutex_unlock(&pmd->port_mutex);
1795 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1796 return;
1797 }
1798
1799 seq_change(pmd->reload_seq);
1800 atomic_store_explicit(&pmd->reload, true, memory_order_release);
1801 }
1802
1803 static uint32_t
1804 hash_port_no(odp_port_t port_no)
1805 {
1806 return hash_int(odp_to_u32(port_no), 0);
1807 }
1808
1809 static int
1810 port_create(const char *devname, const char *type,
1811 odp_port_t port_no, struct dp_netdev_port **portp)
1812 {
1813 struct netdev_saved_flags *sf;
1814 struct dp_netdev_port *port;
1815 enum netdev_flags flags;
1816 struct netdev *netdev;
1817 int error;
1818
1819 *portp = NULL;
1820
1821 /* Open and validate network device. */
1822 error = netdev_open(devname, type, &netdev);
1823 if (error) {
1824 return error;
1825 }
1826 /* XXX reject non-Ethernet devices */
1827
1828 netdev_get_flags(netdev, &flags);
1829 if (flags & NETDEV_LOOPBACK) {
1830 VLOG_ERR("%s: cannot add a loopback device", devname);
1831 error = EINVAL;
1832 goto out;
1833 }
1834
1835 error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1836 if (error) {
1837 VLOG_ERR("%s: cannot set promisc flag", devname);
1838 goto out;
1839 }
1840
1841 port = xzalloc(sizeof *port);
1842 port->port_no = port_no;
1843 port->netdev = netdev;
1844 port->type = xstrdup(type);
1845 port->sf = sf;
1846 port->emc_enabled = true;
1847 port->need_reconfigure = true;
1848 ovs_mutex_init(&port->txq_used_mutex);
1849
1850 *portp = port;
1851
1852 return 0;
1853
1854 out:
1855 netdev_close(netdev);
1856 return error;
1857 }
1858
1859 static int
1860 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1861 odp_port_t port_no)
1862 OVS_REQUIRES(dp->port_mutex)
1863 {
1864 struct dp_netdev_port *port;
1865 int error;
1866
1867 /* Reject devices already in 'dp'. */
1868 if (!get_port_by_name(dp, devname, &port)) {
1869 return EEXIST;
1870 }
1871
1872 error = port_create(devname, type, port_no, &port);
1873 if (error) {
1874 return error;
1875 }
1876
1877 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1878 seq_change(dp->port_seq);
1879
1880 reconfigure_datapath(dp);
1881
1882 return 0;
1883 }
1884
1885 static int
1886 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1887 odp_port_t *port_nop)
1888 {
1889 struct dp_netdev *dp = get_dp_netdev(dpif);
1890 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1891 const char *dpif_port;
1892 odp_port_t port_no;
1893 int error;
1894
1895 ovs_mutex_lock(&dp->port_mutex);
1896 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1897 if (*port_nop != ODPP_NONE) {
1898 port_no = *port_nop;
1899 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1900 } else {
1901 port_no = choose_port(dp, dpif_port);
1902 error = port_no == ODPP_NONE ? EFBIG : 0;
1903 }
1904 if (!error) {
1905 *port_nop = port_no;
1906 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1907 }
1908 ovs_mutex_unlock(&dp->port_mutex);
1909
1910 return error;
1911 }
1912
1913 static int
1914 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1915 {
1916 struct dp_netdev *dp = get_dp_netdev(dpif);
1917 int error;
1918
1919 ovs_mutex_lock(&dp->port_mutex);
1920 if (port_no == ODPP_LOCAL) {
1921 error = EINVAL;
1922 } else {
1923 struct dp_netdev_port *port;
1924
1925 error = get_port_by_number(dp, port_no, &port);
1926 if (!error) {
1927 do_del_port(dp, port);
1928 }
1929 }
1930 ovs_mutex_unlock(&dp->port_mutex);
1931
1932 return error;
1933 }
1934
1935 static bool
1936 is_valid_port_number(odp_port_t port_no)
1937 {
1938 return port_no != ODPP_NONE;
1939 }
1940
1941 static struct dp_netdev_port *
1942 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1943 OVS_REQUIRES(dp->port_mutex)
1944 {
1945 struct dp_netdev_port *port;
1946
1947 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1948 if (port->port_no == port_no) {
1949 return port;
1950 }
1951 }
1952 return NULL;
1953 }
1954
1955 static int
1956 get_port_by_number(struct dp_netdev *dp,
1957 odp_port_t port_no, struct dp_netdev_port **portp)
1958 OVS_REQUIRES(dp->port_mutex)
1959 {
1960 if (!is_valid_port_number(port_no)) {
1961 *portp = NULL;
1962 return EINVAL;
1963 } else {
1964 *portp = dp_netdev_lookup_port(dp, port_no);
1965 return *portp ? 0 : ENODEV;
1966 }
1967 }
1968
1969 static void
1970 port_destroy(struct dp_netdev_port *port)
1971 {
1972 if (!port) {
1973 return;
1974 }
1975
1976 netdev_close(port->netdev);
1977 netdev_restore_flags(port->sf);
1978
1979 for (unsigned i = 0; i < port->n_rxq; i++) {
1980 netdev_rxq_close(port->rxqs[i].rx);
1981 }
1982 ovs_mutex_destroy(&port->txq_used_mutex);
1983 free(port->rxq_affinity_list);
1984 free(port->txq_used);
1985 free(port->rxqs);
1986 free(port->type);
1987 free(port);
1988 }
1989
1990 static int
1991 get_port_by_name(struct dp_netdev *dp,
1992 const char *devname, struct dp_netdev_port **portp)
1993 OVS_REQUIRES(dp->port_mutex)
1994 {
1995 struct dp_netdev_port *port;
1996
1997 HMAP_FOR_EACH (port, node, &dp->ports) {
1998 if (!strcmp(netdev_get_name(port->netdev), devname)) {
1999 *portp = port;
2000 return 0;
2001 }
2002 }
2003
2004 /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2005 * non-existent port. */
2006 return ENODEV;
2007 }
2008
2009 /* Returns 'true' if 'dp' has a port backed by a pmd netdev. */
2010 static bool
2011 has_pmd_port(struct dp_netdev *dp)
2012 OVS_REQUIRES(dp->port_mutex)
2013 {
2014 struct dp_netdev_port *port;
2015
2016 HMAP_FOR_EACH (port, node, &dp->ports) {
2017 if (netdev_is_pmd(port->netdev)) {
2018 return true;
2019 }
2020 }
2021
2022 return false;
2023 }
2024
2025 static void
2026 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2027 OVS_REQUIRES(dp->port_mutex)
2028 {
2029 hmap_remove(&dp->ports, &port->node);
2030 seq_change(dp->port_seq);
2031
2032 reconfigure_datapath(dp);
2033
2034 port_destroy(port);
2035 }
2036
2037 static void
2038 answer_port_query(const struct dp_netdev_port *port,
2039 struct dpif_port *dpif_port)
2040 {
2041 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2042 dpif_port->type = xstrdup(port->type);
2043 dpif_port->port_no = port->port_no;
2044 }
2045
2046 static int
2047 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2048 struct dpif_port *dpif_port)
2049 {
2050 struct dp_netdev *dp = get_dp_netdev(dpif);
2051 struct dp_netdev_port *port;
2052 int error;
2053
2054 ovs_mutex_lock(&dp->port_mutex);
2055 error = get_port_by_number(dp, port_no, &port);
2056 if (!error && dpif_port) {
2057 answer_port_query(port, dpif_port);
2058 }
2059 ovs_mutex_unlock(&dp->port_mutex);
2060
2061 return error;
2062 }
2063
2064 static int
2065 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2066 struct dpif_port *dpif_port)
2067 {
2068 struct dp_netdev *dp = get_dp_netdev(dpif);
2069 struct dp_netdev_port *port;
2070 int error;
2071
2072 ovs_mutex_lock(&dp->port_mutex);
2073 error = get_port_by_name(dp, devname, &port);
2074 if (!error && dpif_port) {
2075 answer_port_query(port, dpif_port);
2076 }
2077 ovs_mutex_unlock(&dp->port_mutex);
2078
2079 return error;
2080 }
2081
2082 static void
2083 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2084 {
2085 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2086 free(flow);
2087 }
2088
2089 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2090 {
2091 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2092 ovsrcu_postpone(dp_netdev_flow_free, flow);
2093 }
2094 }
2095
2096 static uint32_t
2097 dp_netdev_flow_hash(const ovs_u128 *ufid)
2098 {
2099 return ufid->u32[0];
2100 }
2101
2102 static inline struct dpcls *
2103 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2104 odp_port_t in_port)
2105 {
2106 struct dpcls *cls;
2107 uint32_t hash = hash_port_no(in_port);
2108 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2109 if (cls->in_port == in_port) {
2110 /* Port classifier exists already */
2111 return cls;
2112 }
2113 }
2114 return NULL;
2115 }
2116
2117 static inline struct dpcls *
2118 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2119 odp_port_t in_port)
2120 OVS_REQUIRES(pmd->flow_mutex)
2121 {
2122 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2123 uint32_t hash = hash_port_no(in_port);
2124
2125 if (!cls) {
2126 /* Create new classifier for in_port */
2127 cls = xmalloc(sizeof(*cls));
2128 dpcls_init(cls);
2129 cls->in_port = in_port;
2130 cmap_insert(&pmd->classifiers, &cls->node, hash);
2131 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2132 }
2133 return cls;
2134 }
2135
2136 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2137 #define INVALID_FLOW_MARK (UINT32_MAX)
2138
2139 struct megaflow_to_mark_data {
2140 const struct cmap_node node;
2141 ovs_u128 mega_ufid;
2142 uint32_t mark;
2143 };
2144
2145 struct flow_mark {
2146 struct cmap megaflow_to_mark;
2147 struct cmap mark_to_flow;
2148 struct id_pool *pool;
2149 };
2150
2151 static struct flow_mark flow_mark = {
2152 .megaflow_to_mark = CMAP_INITIALIZER,
2153 .mark_to_flow = CMAP_INITIALIZER,
2154 };
2155
2156 static uint32_t
2157 flow_mark_alloc(void)
2158 {
2159 uint32_t mark;
2160
2161 if (!flow_mark.pool) {
2162 /* Not initialized yet; do it here. */
2163 flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
2164 }
2165
2166 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2167 return mark;
2168 }
2169
2170 return INVALID_FLOW_MARK;
2171 }
2172
2173 static void
2174 flow_mark_free(uint32_t mark)
2175 {
2176 id_pool_free_id(flow_mark.pool, mark);
2177 }
2178
2179 /* Associates a megaflow with a mark (a 1:1 mapping). */
2180 static void
2181 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2182 {
2183 size_t hash = dp_netdev_flow_hash(mega_ufid);
2184 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2185
2186 data->mega_ufid = *mega_ufid;
2187 data->mark = mark;
2188
2189 cmap_insert(&flow_mark.megaflow_to_mark,
2190 CONST_CAST(struct cmap_node *, &data->node), hash);
2191 }
2192
2193 /* Disassociates a megaflow from its mark. */
2194 static void
2195 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2196 {
2197 size_t hash = dp_netdev_flow_hash(mega_ufid);
2198 struct megaflow_to_mark_data *data;
2199
2200 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2201 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2202 cmap_remove(&flow_mark.megaflow_to_mark,
2203 CONST_CAST(struct cmap_node *, &data->node), hash);
2204 ovsrcu_postpone(free, data);
2205 return;
2206 }
2207 }
2208
2209 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2210 UUID_ARGS((struct uuid *)mega_ufid));
2211 }
2212
2213 static inline uint32_t
2214 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2215 {
2216 size_t hash = dp_netdev_flow_hash(mega_ufid);
2217 struct megaflow_to_mark_data *data;
2218
2219 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2220 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2221 return data->mark;
2222 }
2223 }
2224
2225 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2226 UUID_ARGS((struct uuid *)mega_ufid));
2227 return INVALID_FLOW_MARK;
2228 }
2229
2230 /* Associates a mark with a flow (a 1:N mapping). */
2231 static void
2232 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2233 {
2234 dp_netdev_flow_ref(flow);
2235
2236 cmap_insert(&flow_mark.mark_to_flow,
2237 CONST_CAST(struct cmap_node *, &flow->mark_node),
2238 hash_int(mark, 0));
2239 flow->mark = mark;
2240
2241 VLOG_DBG("Associated dp_netdev flow %p with mark %u\n", flow, mark);
2242 }
2243
2244 static bool
2245 flow_mark_has_no_ref(uint32_t mark)
2246 {
2247 struct dp_netdev_flow *flow;
2248
2249 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2250 &flow_mark.mark_to_flow) {
2251 if (flow->mark == mark) {
2252 return false;
2253 }
2254 }
2255
2256 return true;
2257 }
2258
2259 static int
2260 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2261 struct dp_netdev_flow *flow)
2262 {
2263 int ret = 0;
2264 uint32_t mark = flow->mark;
2265 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2266 &flow->mark_node);
2267
2268 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2269 flow->mark = INVALID_FLOW_MARK;
2270
2271 /*
2272 * If no flow references the mark any more, remove the flow
2273 * from hardware and free the mark.
2274 */
2275 if (flow_mark_has_no_ref(mark)) {
2276 struct dp_netdev_port *port;
2277 odp_port_t in_port = flow->flow.in_port.odp_port;
2278
2279 ovs_mutex_lock(&pmd->dp->port_mutex);
2280 port = dp_netdev_lookup_port(pmd->dp, in_port);
2281 if (port) {
2282 ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
2283 }
2284 ovs_mutex_unlock(&pmd->dp->port_mutex);
2285
2286 flow_mark_free(mark);
2287 VLOG_DBG("Freed flow mark %u\n", mark);
2288
2289 megaflow_to_mark_disassociate(&flow->mega_ufid);
2290 }
2291 dp_netdev_flow_unref(flow);
2292
2293 return ret;
2294 }
2295
2296 static void
2297 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2298 {
2299 struct dp_netdev_flow *flow;
2300
2301 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2302 if (flow->pmd_id == pmd->core_id) {
2303 queue_netdev_flow_del(pmd, flow);
2304 }
2305 }
2306 }
2307
2308 static struct dp_netdev_flow *
2309 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2310 const uint32_t mark)
2311 {
2312 struct dp_netdev_flow *flow;
2313
2314 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2315 &flow_mark.mark_to_flow) {
2316 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2317 flow->dead == false) {
2318 return flow;
2319 }
2320 }
2321
2322 return NULL;
2323 }
2324
2325 static struct dp_flow_offload_item *
2326 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2327 struct dp_netdev_flow *flow,
2328 int op)
2329 {
2330 struct dp_flow_offload_item *offload;
2331
2332 offload = xzalloc(sizeof(*offload));
2333 offload->pmd = pmd;
2334 offload->flow = flow;
2335 offload->op = op;
2336
2337 dp_netdev_flow_ref(flow);
2338 dp_netdev_pmd_try_ref(pmd);
2339
2340 return offload;
2341 }
2342
2343 static void
2344 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2345 {
2346 dp_netdev_pmd_unref(offload->pmd);
2347 dp_netdev_flow_unref(offload->flow);
2348
2349 free(offload->actions);
2350 free(offload);
2351 }
2352
2353 static void
2354 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2355 {
2356 ovs_mutex_lock(&dp_flow_offload.mutex);
2357 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2358 xpthread_cond_signal(&dp_flow_offload.cond);
2359 ovs_mutex_unlock(&dp_flow_offload.mutex);
2360 }
2361
2362 static int
2363 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2364 {
2365 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2366 }
2367
2368 /*
2369 * There are two flow offload operations here: addition and modification.
2370 *
2371 * For flow addition, this function does:
2372 * - allocate a new flow mark id
2373 * - perform hardware flow offload
2374 * - associate the flow mark with flow and mega flow
2375 *
2376 * For flow modification, both the flow mark and the associations are still
2377 * valid, so only the second step (hardware flow offload) is needed.
2378 */
2379 static int
2380 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2381 {
2382 struct dp_netdev_port *port;
2383 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2384 struct dp_netdev_flow *flow = offload->flow;
2385 odp_port_t in_port = flow->flow.in_port.odp_port;
2386 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2387 struct offload_info info;
2388 uint32_t mark;
2389 int ret;
2390
2391 if (flow->dead) {
2392 return -1;
2393 }
2394
2395 if (modification) {
2396 mark = flow->mark;
2397 ovs_assert(mark != INVALID_FLOW_MARK);
2398 } else {
2399 /*
2400 * If a mega flow has already been offloaded (from other PMD
2401 * instances), do not offload it again.
2402 */
2403 mark = megaflow_to_mark_find(&flow->mega_ufid);
2404 if (mark != INVALID_FLOW_MARK) {
2405 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2406 if (flow->mark != INVALID_FLOW_MARK) {
2407 ovs_assert(flow->mark == mark);
2408 } else {
2409 mark_to_flow_associate(mark, flow);
2410 }
2411 return 0;
2412 }
2413
2414 mark = flow_mark_alloc();
2415 if (mark == INVALID_FLOW_MARK) {
2416 VLOG_ERR("Failed to allocate flow mark!\n");
2417 }
2418 }
2419 info.flow_mark = mark;
2420
2421 ovs_mutex_lock(&pmd->dp->port_mutex);
2422 port = dp_netdev_lookup_port(pmd->dp, in_port);
2423 if (!port || netdev_vport_is_vport_class(port->netdev->netdev_class)) {
2424 ovs_mutex_unlock(&pmd->dp->port_mutex);
2425 goto err_free;
2426 }
2427 ret = netdev_flow_put(port->netdev, &offload->match,
2428 CONST_CAST(struct nlattr *, offload->actions),
2429 offload->actions_len, &flow->mega_ufid, &info,
2430 NULL);
2431 ovs_mutex_unlock(&pmd->dp->port_mutex);
2432
2433 if (ret) {
2434 goto err_free;
2435 }
2436
2437 if (!modification) {
2438 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2439 mark_to_flow_associate(mark, flow);
2440 }
2441 return 0;
2442
2443 err_free:
2444 if (!modification) {
2445 flow_mark_free(mark);
2446 } else {
2447 mark_to_flow_disassociate(pmd, flow);
2448 }
2449 return -1;
2450 }
2451
2452 static void *
2453 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2454 {
2455 struct dp_flow_offload_item *offload;
2456 struct ovs_list *list;
2457 const char *op;
2458 int ret;
2459
2460 for (;;) {
2461 ovs_mutex_lock(&dp_flow_offload.mutex);
2462 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2463 ovsrcu_quiesce_start();
2464 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2465 &dp_flow_offload.mutex);
2466 ovsrcu_quiesce_end();
2467 }
2468 list = ovs_list_pop_front(&dp_flow_offload.list);
2469 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2470 ovs_mutex_unlock(&dp_flow_offload.mutex);
2471
2472 switch (offload->op) {
2473 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2474 op = "add";
2475 ret = dp_netdev_flow_offload_put(offload);
2476 break;
2477 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2478 op = "modify";
2479 ret = dp_netdev_flow_offload_put(offload);
2480 break;
2481 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2482 op = "delete";
2483 ret = dp_netdev_flow_offload_del(offload);
2484 break;
2485 default:
2486 OVS_NOT_REACHED();
2487 }
2488
2489 VLOG_DBG("%s to %s netdev flow\n",
2490 ret == 0 ? "succeeded" : "failed", op);
2491 dp_netdev_free_flow_offload(offload);
2492 }
2493
2494 return NULL;
2495 }
2496
2497 static void
2498 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2499 struct dp_netdev_flow *flow)
2500 {
2501 struct dp_flow_offload_item *offload;
2502
2503 if (ovsthread_once_start(&offload_thread_once)) {
2504 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2505 ovs_thread_create("dp_netdev_flow_offload",
2506 dp_netdev_flow_offload_main, NULL);
2507 ovsthread_once_done(&offload_thread_once);
2508 }
2509
2510 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2511 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2512 dp_netdev_append_flow_offload(offload);
2513 }
2514
2515 static void
2516 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2517 struct dp_netdev_flow *flow, struct match *match,
2518 const struct nlattr *actions, size_t actions_len)
2519 {
2520 struct dp_flow_offload_item *offload;
2521 int op;
2522
2523 if (!netdev_is_flow_api_enabled()) {
2524 return;
2525 }
2526
2527 if (ovsthread_once_start(&offload_thread_once)) {
2528 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2529 ovs_thread_create("dp_netdev_flow_offload",
2530 dp_netdev_flow_offload_main, NULL);
2531 ovsthread_once_done(&offload_thread_once);
2532 }
2533
2534 if (flow->mark != INVALID_FLOW_MARK) {
2535 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2536 } else {
2537 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2538 }
2539 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2540 offload->match = *match;
2541 offload->actions = xmalloc(actions_len);
2542 memcpy(offload->actions, actions, actions_len);
2543 offload->actions_len = actions_len;
2544
2545 dp_netdev_append_flow_offload(offload);
2546 }
2547
2548 static void
2549 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2550 struct dp_netdev_flow *flow)
2551 OVS_REQUIRES(pmd->flow_mutex)
2552 {
2553 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2554 struct dpcls *cls;
2555 odp_port_t in_port = flow->flow.in_port.odp_port;
2556
2557 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2558 ovs_assert(cls != NULL);
2559 dpcls_remove(cls, &flow->cr);
2560 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2561 if (flow->mark != INVALID_FLOW_MARK) {
2562 queue_netdev_flow_del(pmd, flow);
2563 }
2564 flow->dead = true;
2565
2566 dp_netdev_flow_unref(flow);
2567 }
2568
2569 static void
2570 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2571 {
2572 struct dp_netdev_flow *netdev_flow;
2573
2574 ovs_mutex_lock(&pmd->flow_mutex);
2575 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2576 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2577 }
2578 ovs_mutex_unlock(&pmd->flow_mutex);
2579 }
2580
2581 static int
2582 dpif_netdev_flow_flush(struct dpif *dpif)
2583 {
2584 struct dp_netdev *dp = get_dp_netdev(dpif);
2585 struct dp_netdev_pmd_thread *pmd;
2586
2587 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2588 dp_netdev_pmd_flow_flush(pmd);
2589 }
2590
2591 return 0;
2592 }
2593
2594 struct dp_netdev_port_state {
2595 struct hmap_position position;
2596 char *name;
2597 };
2598
2599 static int
2600 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2601 {
2602 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2603 return 0;
2604 }
2605
2606 static int
2607 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2608 struct dpif_port *dpif_port)
2609 {
2610 struct dp_netdev_port_state *state = state_;
2611 struct dp_netdev *dp = get_dp_netdev(dpif);
2612 struct hmap_node *node;
2613 int retval;
2614
2615 ovs_mutex_lock(&dp->port_mutex);
2616 node = hmap_at_position(&dp->ports, &state->position);
2617 if (node) {
2618 struct dp_netdev_port *port;
2619
2620 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2621
2622 free(state->name);
2623 state->name = xstrdup(netdev_get_name(port->netdev));
2624 dpif_port->name = state->name;
2625 dpif_port->type = port->type;
2626 dpif_port->port_no = port->port_no;
2627
2628 retval = 0;
2629 } else {
2630 retval = EOF;
2631 }
2632 ovs_mutex_unlock(&dp->port_mutex);
2633
2634 return retval;
2635 }
2636
2637 static int
2638 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2639 {
2640 struct dp_netdev_port_state *state = state_;
2641 free(state->name);
2642 free(state);
2643 return 0;
2644 }
2645
2646 static int
2647 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2648 {
2649 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2650 uint64_t new_port_seq;
2651 int error;
2652
2653 new_port_seq = seq_read(dpif->dp->port_seq);
2654 if (dpif->last_port_seq != new_port_seq) {
2655 dpif->last_port_seq = new_port_seq;
2656 error = ENOBUFS;
2657 } else {
2658 error = EAGAIN;
2659 }
2660
2661 return error;
2662 }
2663
2664 static void
2665 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2666 {
2667 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2668
2669 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2670 }
2671
2672 static struct dp_netdev_flow *
2673 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2674 {
2675 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2676 }
2677
2678 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2679 {
2680 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2681 }
2682
2683 /* netdev_flow_key utilities.
2684 *
2685 * netdev_flow_key is basically a miniflow. We use these functions
2686 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2687 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2688 *
2689 * - Since we are dealing exclusively with miniflows created by
2690 * miniflow_extract(), if the map is different the miniflow is different.
2691 * Therefore we can be faster by comparing the map and the miniflow in a
2692 * single memcmp().
2693 * - These functions can be inlined by the compiler. */
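
/* Simplified sketch of the layout these helpers rely on (field comments are
 * paraphrased; FLOW_MAX_PACKET_U64S bounds the inline value storage):
 *
 *     struct netdev_flow_key {
 *         uint32_t hash;                        flow or mask hash
 *         uint32_t len;                         bytes of 'mf' plus its values
 *         struct miniflow mf;                   flowmap, values follow inline
 *         uint64_t buf[FLOW_MAX_PACKET_U64S];   storage for those values
 *     };
 *
 * Because the packed values follow 'mf' contiguously, comparing or copying
 * 'len' bytes starting at 'mf' covers both the map and the values. */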
2694
2695 /* Given the number of bits set in miniflow's maps, returns the size of the
2696 * 'netdev_flow_key.mf' */
2697 static inline size_t
2698 netdev_flow_key_size(size_t flow_u64s)
2699 {
2700 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2701 }
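
/* Worked example (assuming MINIFLOW_VALUES_SIZE() counts 8-byte values): a
 * miniflow with 5 bits set in its maps packs 5 uint64_t values, so
 * netdev_flow_key_size(5) is sizeof(struct miniflow) + 5 * 8 bytes. */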
2702
2703 static inline bool
2704 netdev_flow_key_equal(const struct netdev_flow_key *a,
2705 const struct netdev_flow_key *b)
2706 {
2707 /* 'b->len' may not be set yet. */
2708 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2709 }
2710
2711 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2712 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2713 * generated by miniflow_extract. */
2714 static inline bool
2715 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2716 const struct miniflow *mf)
2717 {
2718 return !memcmp(&key->mf, mf, key->len);
2719 }
2720
2721 static inline void
2722 netdev_flow_key_clone(struct netdev_flow_key *dst,
2723 const struct netdev_flow_key *src)
2724 {
2725 memcpy(dst, src,
2726 offsetof(struct netdev_flow_key, mf) + src->len);
2727 }
2728
2729 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2730 static inline void
2731 netdev_flow_mask_init(struct netdev_flow_key *mask,
2732 const struct match *match)
2733 {
2734 uint64_t *dst = miniflow_values(&mask->mf);
2735 struct flowmap fmap;
2736 uint32_t hash = 0;
2737 size_t idx;
2738
2739 /* Only check masks that make sense for the flow. */
2740 flow_wc_map(&match->flow, &fmap);
2741 flowmap_init(&mask->mf.map);
2742
2743 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2744 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2745
2746 if (mask_u64) {
2747 flowmap_set(&mask->mf.map, idx, 1);
2748 *dst++ = mask_u64;
2749 hash = hash_add64(hash, mask_u64);
2750 }
2751 }
2752
2753 map_t map;
2754
2755 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2756 hash = hash_add64(hash, map);
2757 }
2758
2759 size_t n = dst - miniflow_get_values(&mask->mf);
2760
2761 mask->hash = hash_finish(hash, n * 8);
2762 mask->len = netdev_flow_key_size(n);
2763 }
2764
2765 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2766 static inline void
2767 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2768 const struct flow *flow,
2769 const struct netdev_flow_key *mask)
2770 {
2771 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2772 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2773 uint32_t hash = 0;
2774 uint64_t value;
2775
2776 dst->len = mask->len;
2777 dst->mf = mask->mf; /* Copy maps. */
2778
2779 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2780 *dst_u64 = value & *mask_u64++;
2781 hash = hash_add64(hash, *dst_u64++);
2782 }
2783 dst->hash = hash_finish(hash,
2784 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2785 }
2786
2787 /* Iterate through netdev_flow_key TNL u64 values specified by 'FLOWMAP'. */
2788 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
2789 MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
2790
2791 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2792 * 'mask'. */
2793 static inline uint32_t
2794 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2795 const struct netdev_flow_key *mask)
2796 {
2797 const uint64_t *p = miniflow_get_values(&mask->mf);
2798 uint32_t hash = 0;
2799 uint64_t value;
2800
2801 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2802 hash = hash_add64(hash, value & *p++);
2803 }
2804
2805 return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2806 }
2807
2808 static inline bool
2809 emc_entry_alive(struct emc_entry *ce)
2810 {
2811 return ce->flow && !ce->flow->dead;
2812 }
2813
2814 static void
2815 emc_clear_entry(struct emc_entry *ce)
2816 {
2817 if (ce->flow) {
2818 dp_netdev_flow_unref(ce->flow);
2819 ce->flow = NULL;
2820 }
2821 }
2822
2823 static inline void
2824 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2825 const struct netdev_flow_key *key)
2826 {
2827 if (ce->flow != flow) {
2828 if (ce->flow) {
2829 dp_netdev_flow_unref(ce->flow);
2830 }
2831
2832 if (dp_netdev_flow_ref(flow)) {
2833 ce->flow = flow;
2834 } else {
2835 ce->flow = NULL;
2836 }
2837 }
2838 if (key) {
2839 netdev_flow_key_clone(&ce->key, key);
2840 }
2841 }
2842
2843 static inline void
2844 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2845 struct dp_netdev_flow *flow)
2846 {
2847 struct emc_entry *to_be_replaced = NULL;
2848 struct emc_entry *current_entry;
2849
2850 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2851 if (netdev_flow_key_equal(&current_entry->key, key)) {
2852 /* We found the entry with the 'mf' miniflow */
2853 emc_change_entry(current_entry, flow, NULL);
2854 return;
2855 }
2856
2857 /* Replacement policy: put the flow in an empty (not alive) entry, or,
2858 * failing that, in the probed entry with the lowest key hash. */
2859 if (!to_be_replaced
2860 || (emc_entry_alive(to_be_replaced)
2861 && !emc_entry_alive(current_entry))
2862 || current_entry->key.hash < to_be_replaced->key.hash) {
2863 to_be_replaced = current_entry;
2864 }
2865 }
2866 /* We didn't find the miniflow in the cache.
2867 * The 'to_be_replaced' entry is where the new flow will be stored */
2868
2869 emc_change_entry(to_be_replaced, flow, key);
2870 }
2871
2872 static inline void
2873 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2874 const struct netdev_flow_key *key,
2875 struct dp_netdev_flow *flow)
2876 {
2877 /* Insert an entry into the EMC based on probability value 'min'. By
2878 * default the value is UINT32_MAX / 100, which yields an insertion
2879 * probability of 1/100, i.e. 1%. */
2880
2881 uint32_t min = pmd->ctx.emc_insert_min;
2882
2883 if (min && random_uint32() <= min) {
2884 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2885 }
2886 }
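
/* Worked example: for a target insertion probability of 1/N the threshold is
 * UINT32_MAX / N, so a uniform random_uint32() lands at or below it roughly
 * once every N calls.  With the default N = 100:
 *
 *     uint32_t min = UINT32_MAX / 100;                   about 42949672
 *     bool insert = min && random_uint32() <= min;       true ~1% of the time
 *
 * Setting 'min' to zero disables EMC insertion entirely. */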
2887
2888 static inline struct dp_netdev_flow *
2889 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2890 {
2891 struct emc_entry *current_entry;
2892
2893 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2894 if (current_entry->key.hash == key->hash
2895 && emc_entry_alive(current_entry)
2896 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2897
2898 /* We found the entry with the 'key->mf' miniflow */
2899 return current_entry->flow;
2900 }
2901 }
2902
2903 return NULL;
2904 }
2905
2906 static inline const struct cmap_node *
2907 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2908 {
2909 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2910 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2911 uint16_t sig = hash >> 16;
2912 uint16_t index = UINT16_MAX;
2913
2914 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2915 if (bucket->sig[i] == sig) {
2916 index = bucket->flow_idx[i];
2917 break;
2918 }
2919 }
2920 if (index != UINT16_MAX) {
2921 return cmap_find_by_index(&pmd->flow_table, index);
2922 }
2923 return NULL;
2924 }
2925
2926 static void
2927 smc_clear_entry(struct smc_bucket *b, int idx)
2928 {
2929 b->flow_idx[idx] = UINT16_MAX;
2930 }
2931
2932 /* Insert the flow_table index into the SMC. Insertion may fail when 1) the
2933 * SMC is turned off, or 2) the flow_table index is larger than a uint16_t
2934 * can hold. If an SMC entry with the same signature already exists, its
2935 * index is updated. Otherwise, if an empty entry is available, that entry
2936 * is taken. If there is neither an empty entry nor one with a matching
2937 * signature, a random entry in the hashed bucket is evicted. */
2938 static inline void
2939 smc_insert(struct dp_netdev_pmd_thread *pmd,
2940 const struct netdev_flow_key *key,
2941 uint32_t hash)
2942 {
2943 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2944 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2945 uint16_t index;
2946 uint32_t cmap_index;
2947 bool smc_enable_db;
2948 int i;
2949
2950 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
2951 if (!smc_enable_db) {
2952 return;
2953 }
2954
2955 cmap_index = cmap_find_index(&pmd->flow_table, hash);
2956 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2957
2958 /* If the index is larger than the SMC can handle (uint16_t), don't
2959 * insert. */
2960 if (index == UINT16_MAX) {
2961 return;
2962 }
2963
2964 /* If an entry with same signature already exists, update the index */
2965 uint16_t sig = key->hash >> 16;
2966 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2967 if (bucket->sig[i] == sig) {
2968 bucket->flow_idx[i] = index;
2969 return;
2970 }
2971 }
2972 /* If there is an empty entry, occupy it. */
2973 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2974 if (bucket->flow_idx[i] == UINT16_MAX) {
2975 bucket->sig[i] = sig;
2976 bucket->flow_idx[i] = index;
2977 return;
2978 }
2979 }
2980 /* Otherwise, pick a random entry. */
2981 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2982 bucket->sig[i] = sig;
2983 bucket->flow_idx[i] = index;
2984 }
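
/* Illustrative breakdown of how a packet hash indexes the SMC (the hash
 * value is made up):
 *
 *     uint32_t hash = 0xdeadbeef;
 *     struct smc_bucket *b = &cache->buckets[hash & SMC_MASK];
 *     uint16_t sig = hash >> 16;                          0xdead
 *
 * A probe touches a single bucket and compares SMC_ENTRY_PER_BUCKET 16-bit
 * signatures; on a match the flow is fetched from 'flow_table' by the stored
 * index via cmap_find_by_index(). */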
2985
2986 static struct dp_netdev_flow *
2987 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2988 const struct netdev_flow_key *key,
2989 int *lookup_num_p)
2990 {
2991 struct dpcls *cls;
2992 struct dpcls_rule *rule;
2993 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2994 in_port.odp_port));
2995 struct dp_netdev_flow *netdev_flow = NULL;
2996
2997 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2998 if (OVS_LIKELY(cls)) {
2999 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3000 netdev_flow = dp_netdev_flow_cast(rule);
3001 }
3002 return netdev_flow;
3003 }
3004
3005 static struct dp_netdev_flow *
3006 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3007 const ovs_u128 *ufidp, const struct nlattr *key,
3008 size_t key_len)
3009 {
3010 struct dp_netdev_flow *netdev_flow;
3011 struct flow flow;
3012 ovs_u128 ufid;
3013
3014 /* If a UFID is not provided, determine one based on the key. */
3015 if (!ufidp && key && key_len
3016 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3017 dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
3018 ufidp = &ufid;
3019 }
3020
3021 if (ufidp) {
3022 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3023 &pmd->flow_table) {
3024 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3025 return netdev_flow;
3026 }
3027 }
3028 }
3029
3030 return NULL;
3031 }
3032
3033 static void
3034 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
3035 struct dpif_flow_stats *stats)
3036 {
3037 struct dp_netdev_flow *netdev_flow;
3038 unsigned long long n;
3039 long long used;
3040 uint16_t flags;
3041
3042 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3043
3044 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3045 stats->n_packets = n;
3046 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3047 stats->n_bytes = n;
3048 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3049 stats->used = used;
3050 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3051 stats->tcp_flags = flags;
3052 }
3053
3054 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3055 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3056 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3057 * protect them. */
3058 static void
3059 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
3060 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3061 struct dpif_flow *flow, bool terse)
3062 {
3063 if (terse) {
3064 memset(flow, 0, sizeof *flow);
3065 } else {
3066 struct flow_wildcards wc;
3067 struct dp_netdev_actions *actions;
3068 size_t offset;
3069 struct odp_flow_key_parms odp_parms = {
3070 .flow = &netdev_flow->flow,
3071 .mask = &wc.masks,
3072 .support = dp_netdev_support,
3073 };
3074
3075 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3076 /* in_port is exact matched, but we have left it out of the mask for
3077 * optimization reasons. Add in_port back to the mask. */
3078 wc.masks.in_port.odp_port = ODPP_NONE;
3079
3080 /* Key */
3081 offset = key_buf->size;
3082 flow->key = ofpbuf_tail(key_buf);
3083 odp_flow_key_from_flow(&odp_parms, key_buf);
3084 flow->key_len = key_buf->size - offset;
3085
3086 /* Mask */
3087 offset = mask_buf->size;
3088 flow->mask = ofpbuf_tail(mask_buf);
3089 odp_parms.key_buf = key_buf;
3090 odp_flow_key_from_mask(&odp_parms, mask_buf);
3091 flow->mask_len = mask_buf->size - offset;
3092
3093 /* Actions */
3094 actions = dp_netdev_flow_get_actions(netdev_flow);
3095 flow->actions = actions->actions;
3096 flow->actions_len = actions->size;
3097 }
3098
3099 flow->ufid = netdev_flow->ufid;
3100 flow->ufid_present = true;
3101 flow->pmd_id = netdev_flow->pmd_id;
3102 get_dpif_flow_stats(netdev_flow, &flow->stats);
3103
3104 flow->attrs.offloaded = false;
3105 flow->attrs.dp_layer = "ovs";
3106 }
3107
3108 static int
3109 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3110 const struct nlattr *mask_key,
3111 uint32_t mask_key_len, const struct flow *flow,
3112 struct flow_wildcards *wc, bool probe)
3113 {
3114 enum odp_key_fitness fitness;
3115
3116 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3117 if (fitness) {
3118 if (!probe) {
3119 /* This should not happen: it indicates that
3120 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3121 * disagree on the acceptable form of a mask. Log the problem
3122 * as an error, with enough details to enable debugging. */
3123 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3124
3125 if (!VLOG_DROP_ERR(&rl)) {
3126 struct ds s;
3127
3128 ds_init(&s);
3129 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3130 true);
3131 VLOG_ERR("internal error parsing flow mask %s (%s)",
3132 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3133 ds_destroy(&s);
3134 }
3135 }
3136
3137 return EINVAL;
3138 }
3139
3140 return 0;
3141 }
3142
3143 static int
3144 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3145 struct flow *flow, bool probe)
3146 {
3147 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3148 if (!probe) {
3149 /* This should not happen: it indicates that
3150 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3151 * the acceptable form of a flow. Log the problem as an error,
3152 * with enough details to enable debugging. */
3153 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3154
3155 if (!VLOG_DROP_ERR(&rl)) {
3156 struct ds s;
3157
3158 ds_init(&s);
3159 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3160 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3161 ds_destroy(&s);
3162 }
3163 }
3164
3165 return EINVAL;
3166 }
3167
3168 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3169 return EINVAL;
3170 }
3171
3172 return 0;
3173 }
3174
3175 static int
3176 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3177 {
3178 struct dp_netdev *dp = get_dp_netdev(dpif);
3179 struct dp_netdev_flow *netdev_flow;
3180 struct dp_netdev_pmd_thread *pmd;
3181 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3182 struct hmapx_node *node;
3183 int error = EINVAL;
3184
3185 if (get->pmd_id == PMD_ID_NULL) {
3186 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3187 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3188 dp_netdev_pmd_unref(pmd);
3189 }
3190 }
3191 } else {
3192 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3193 if (!pmd) {
3194 goto out;
3195 }
3196 hmapx_add(&to_find, pmd);
3197 }
3198
3199 if (!hmapx_count(&to_find)) {
3200 goto out;
3201 }
3202
3203 HMAPX_FOR_EACH (node, &to_find) {
3204 pmd = (struct dp_netdev_pmd_thread *) node->data;
3205 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3206 get->key_len);
3207 if (netdev_flow) {
3208 dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
3209 get->flow, false);
3210 error = 0;
3211 break;
3212 } else {
3213 error = ENOENT;
3214 }
3215 }
3216
3217 HMAPX_FOR_EACH (node, &to_find) {
3218 pmd = (struct dp_netdev_pmd_thread *) node->data;
3219 dp_netdev_pmd_unref(pmd);
3220 }
3221 out:
3222 hmapx_destroy(&to_find);
3223 return error;
3224 }
3225
3226 static void
3227 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3228 {
3229 struct flow masked_flow;
3230 size_t i;
3231
3232 for (i = 0; i < sizeof(struct flow); i++) {
3233 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3234 ((uint8_t *)&match->wc)[i];
3235 }
3236 dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
3237 }
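
/* Illustrative property of the masking above (field and values are
 * hypothetical): flows that differ only in wildcarded bits produce identical
 * masked bytes and therefore the same mega ufid, which is what lets them
 * share a single offloaded megaflow and flow mark.
 *
 *     ovs_u128 id_a, id_b;
 *     match.flow.nw_src = htonl(0x0a000001);
 *     dp_netdev_get_mega_ufid(&match, &id_a);
 *     match.flow.nw_src = htonl(0x0a000002);
 *     dp_netdev_get_mega_ufid(&match, &id_b);
 *
 * id_a equals id_b whenever match.wc.masks.nw_src is zero. */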
3238
3239 static struct dp_netdev_flow *
3240 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3241 struct match *match, const ovs_u128 *ufid,
3242 const struct nlattr *actions, size_t actions_len)
3243 OVS_REQUIRES(pmd->flow_mutex)
3244 {
3245 struct dp_netdev_flow *flow;
3246 struct netdev_flow_key mask;
3247 struct dpcls *cls;
3248
3249 /* Make sure in_port is exact matched before we read it. */
3250 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3251 odp_port_t in_port = match->flow.in_port.odp_port;
3252
3253 /* As we select the dpcls based on the port number, each netdev flow
3254 * belonging to the same dpcls will have the same odp_port value.
3255 * For performance reasons we wildcard odp_port here in the mask. In the
3256 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3257 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3258 * will not be part of the subtable mask.
3259 * This will speed up the hash computation during dpcls_lookup() because
3260 * there is one less call to hash_add64() in this case. */
3261 match->wc.masks.in_port.odp_port = 0;
3262 netdev_flow_mask_init(&mask, match);
3263 match->wc.masks.in_port.odp_port = ODPP_NONE;
3264
3265 /* Make sure wc does not have metadata. */
3266 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3267 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3268
3269 /* Do not allocate extra space. */
3270 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3271 memset(&flow->stats, 0, sizeof flow->stats);
3272 flow->dead = false;
3273 flow->batch = NULL;
3274 flow->mark = INVALID_FLOW_MARK;
3275 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3276 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3277 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3278 ovs_refcount_init(&flow->ref_cnt);
3279 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3280
3281 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3282 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3283
3284 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3285 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3286 dpcls_insert(cls, &flow->cr, &mask);
3287
3288 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3289 dp_netdev_flow_hash(&flow->ufid));
3290
3291 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3292
3293 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3294 struct ds ds = DS_EMPTY_INITIALIZER;
3295 struct ofpbuf key_buf, mask_buf;
3296 struct odp_flow_key_parms odp_parms = {
3297 .flow = &match->flow,
3298 .mask = &match->wc.masks,
3299 .support = dp_netdev_support,
3300 };
3301
3302 ofpbuf_init(&key_buf, 0);
3303 ofpbuf_init(&mask_buf, 0);
3304
3305 odp_flow_key_from_flow(&odp_parms, &key_buf);
3306 odp_parms.key_buf = &key_buf;
3307 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3308
3309 ds_put_cstr(&ds, "flow_add: ");
3310 odp_format_ufid(ufid, &ds);
3311 ds_put_cstr(&ds, " ");
3312 odp_flow_format(key_buf.data, key_buf.size,
3313 mask_buf.data, mask_buf.size,
3314 NULL, &ds, false);
3315 ds_put_cstr(&ds, ", actions:");
3316 format_odp_actions(&ds, actions, actions_len, NULL);
3317
3318 VLOG_DBG("%s", ds_cstr(&ds));
3319
3320 ofpbuf_uninit(&key_buf);
3321 ofpbuf_uninit(&mask_buf);
3322
3323 /* Add a printout of the actual match installed. */
3324 struct match m;
3325 ds_clear(&ds);
3326 ds_put_cstr(&ds, "flow match: ");
3327 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3328 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3329 memset(&m.tun_md, 0, sizeof m.tun_md);
3330 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3331
3332 VLOG_DBG("%s", ds_cstr(&ds));
3333
3334 ds_destroy(&ds);
3335 }
3336
3337 return flow;
3338 }
3339
3340 static int
3341 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3342 struct netdev_flow_key *key,
3343 struct match *match,
3344 ovs_u128 *ufid,
3345 const struct dpif_flow_put *put,
3346 struct dpif_flow_stats *stats)
3347 {
3348 struct dp_netdev_flow *netdev_flow;
3349 int error = 0;
3350
3351 if (stats) {
3352 memset(stats, 0, sizeof *stats);
3353 }
3354
3355 ovs_mutex_lock(&pmd->flow_mutex);
3356 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3357 if (!netdev_flow) {
3358 if (put->flags & DPIF_FP_CREATE) {
3359 if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
3360 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3361 put->actions_len);
3362 error = 0;
3363 } else {
3364 error = EFBIG;
3365 }
3366 } else {
3367 error = ENOENT;
3368 }
3369 } else {
3370 if (put->flags & DPIF_FP_MODIFY) {
3371 struct dp_netdev_actions *new_actions;
3372 struct dp_netdev_actions *old_actions;
3373
3374 new_actions = dp_netdev_actions_create(put->actions,
3375 put->actions_len);
3376
3377 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3378 ovsrcu_set(&netdev_flow->actions, new_actions);
3379
3380 queue_netdev_flow_put(pmd, netdev_flow, match,
3381 put->actions, put->actions_len);
3382
3383 if (stats) {
3384 get_dpif_flow_stats(netdev_flow, stats);
3385 }
3386 if (put->flags & DPIF_FP_ZERO_STATS) {
3387 /* XXX: The userspace datapath uses thread local statistics
3388 * (for flows), which should be updated only by the owning
3389 * thread. Since we cannot write on stats memory here,
3390 * we choose not to support this flag. Please note:
3391 * - This feature is currently used only by dpctl commands with
3392 * option --clear.
3393 * - Should the need arise, this operation can be implemented
3394 * by keeping a base value (to be updated here) for each
3395 * counter, and subtracting it before outputting the stats. */
3396 error = EOPNOTSUPP;
3397 }
3398
3399 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3400 } else if (put->flags & DPIF_FP_CREATE) {
3401 error = EEXIST;
3402 } else {
3403 /* Overlapping flow. */
3404 error = EINVAL;
3405 }
3406 }
3407 ovs_mutex_unlock(&pmd->flow_mutex);
3408 return error;
3409 }
3410
3411 static int
3412 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3413 {
3414 struct dp_netdev *dp = get_dp_netdev(dpif);
3415 struct netdev_flow_key key, mask;
3416 struct dp_netdev_pmd_thread *pmd;
3417 struct match match;
3418 ovs_u128 ufid;
3419 int error;
3420 bool probe = put->flags & DPIF_FP_PROBE;
3421
3422 if (put->stats) {
3423 memset(put->stats, 0, sizeof *put->stats);
3424 }
3425 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3426 probe);
3427 if (error) {
3428 return error;
3429 }
3430 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3431 put->mask, put->mask_len,
3432 &match.flow, &match.wc, probe);
3433 if (error) {
3434 return error;
3435 }
3436
3437 if (put->ufid) {
3438 ufid = *put->ufid;
3439 } else {
3440 dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
3441 }
3442
3443 /* The Netlink encoding of datapath flow keys cannot express
3444 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3445 * tag is interpreted as exact match on the fact that there is no
3446 * VLAN. Unless we refactor a lot of code that translates between
3447 * Netlink and struct flow representations, we have to do the same
3448 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3449 if (!match.wc.masks.vlans[0].tci) {
3450 match.wc.masks.vlans[0].tci = htons(0xffff);
3451 }
3452
3453 /* Must produce a netdev_flow_key for lookup.
3454 * Use the same method as employed to create the key when adding
3455 * the flow to the dpcls to make sure they match. */
3456 netdev_flow_mask_init(&mask, &match);
3457 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3458
3459 if (put->pmd_id == PMD_ID_NULL) {
3460 if (cmap_count(&dp->poll_threads) == 0) {
3461 return EINVAL;
3462 }
3463 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3464 struct dpif_flow_stats pmd_stats;
3465 int pmd_error;
3466
3467 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3468 &pmd_stats);
3469 if (pmd_error) {
3470 error = pmd_error;
3471 } else if (put->stats) {
3472 put->stats->n_packets += pmd_stats.n_packets;
3473 put->stats->n_bytes += pmd_stats.n_bytes;
3474 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3475 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3476 }
3477 }
3478 } else {
3479 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3480 if (!pmd) {
3481 return EINVAL;
3482 }
3483 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3484 dp_netdev_pmd_unref(pmd);
3485 }
3486
3487 return error;
3488 }
3489
3490 static int
3491 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3492 struct dpif_flow_stats *stats,
3493 const struct dpif_flow_del *del)
3494 {
3495 struct dp_netdev_flow *netdev_flow;
3496 int error = 0;
3497
3498 ovs_mutex_lock(&pmd->flow_mutex);
3499 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3500 del->key_len);
3501 if (netdev_flow) {
3502 if (stats) {
3503 get_dpif_flow_stats(netdev_flow, stats);
3504 }
3505 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3506 } else {
3507 error = ENOENT;
3508 }
3509 ovs_mutex_unlock(&pmd->flow_mutex);
3510
3511 return error;
3512 }
3513
3514 static int
3515 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3516 {
3517 struct dp_netdev *dp = get_dp_netdev(dpif);
3518 struct dp_netdev_pmd_thread *pmd;
3519 int error = 0;
3520
3521 if (del->stats) {
3522 memset(del->stats, 0, sizeof *del->stats);
3523 }
3524
3525 if (del->pmd_id == PMD_ID_NULL) {
3526 if (cmap_count(&dp->poll_threads) == 0) {
3527 return EINVAL;
3528 }
3529 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3530 struct dpif_flow_stats pmd_stats;
3531 int pmd_error;
3532
3533 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3534 if (pmd_error) {
3535 error = pmd_error;
3536 } else if (del->stats) {
3537 del->stats->n_packets += pmd_stats.n_packets;
3538 del->stats->n_bytes += pmd_stats.n_bytes;
3539 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3540 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3541 }
3542 }
3543 } else {
3544 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3545 if (!pmd) {
3546 return EINVAL;
3547 }
3548 error = flow_del_on_pmd(pmd, del->stats, del);
3549 dp_netdev_pmd_unref(pmd);
3550 }
3551
3552
3553 return error;
3554 }
3555
3556 struct dpif_netdev_flow_dump {
3557 struct dpif_flow_dump up;
3558 struct cmap_position poll_thread_pos;
3559 struct cmap_position flow_pos;
3560 struct dp_netdev_pmd_thread *cur_pmd;
3561 int status;
3562 struct ovs_mutex mutex;
3563 };
3564
3565 static struct dpif_netdev_flow_dump *
3566 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3567 {
3568 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3569 }
3570
3571 static struct dpif_flow_dump *
3572 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3573 struct dpif_flow_dump_types *types OVS_UNUSED)
3574 {
3575 struct dpif_netdev_flow_dump *dump;
3576
3577 dump = xzalloc(sizeof *dump);
3578 dpif_flow_dump_init(&dump->up, dpif_);
3579 dump->up.terse = terse;
3580 ovs_mutex_init(&dump->mutex);
3581
3582 return &dump->up;
3583 }
3584
3585 static int
3586 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3587 {
3588 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3589
3590 ovs_mutex_destroy(&dump->mutex);
3591 free(dump);
3592 return 0;
3593 }
3594
3595 struct dpif_netdev_flow_dump_thread {
3596 struct dpif_flow_dump_thread up;
3597 struct dpif_netdev_flow_dump *dump;
3598 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3599 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3600 };
3601
3602 static struct dpif_netdev_flow_dump_thread *
3603 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3604 {
3605 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3606 }
3607
3608 static struct dpif_flow_dump_thread *
3609 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3610 {
3611 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3612 struct dpif_netdev_flow_dump_thread *thread;
3613
3614 thread = xmalloc(sizeof *thread);
3615 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3616 thread->dump = dump;
3617 return &thread->up;
3618 }
3619
3620 static void
3621 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3622 {
3623 struct dpif_netdev_flow_dump_thread *thread
3624 = dpif_netdev_flow_dump_thread_cast(thread_);
3625
3626 free(thread);
3627 }
3628
3629 static int
3630 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3631 struct dpif_flow *flows, int max_flows)
3632 {
3633 struct dpif_netdev_flow_dump_thread *thread
3634 = dpif_netdev_flow_dump_thread_cast(thread_);
3635 struct dpif_netdev_flow_dump *dump = thread->dump;
3636 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3637 int n_flows = 0;
3638 int i;
3639
3640 ovs_mutex_lock(&dump->mutex);
3641 if (!dump->status) {
3642 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3643 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3644 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3645 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3646
3647             /* First call to dump_next() extracts the first pmd thread.
3648 * If there is no pmd thread, returns immediately. */
3649 if (!pmd) {
3650 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3651 if (!pmd) {
3652 ovs_mutex_unlock(&dump->mutex);
3653 return n_flows;
3654
3655 }
3656 }
3657
3658 do {
3659 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3660 struct cmap_node *node;
3661
3662 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3663 if (!node) {
3664 break;
3665 }
3666 netdev_flows[n_flows] = CONTAINER_OF(node,
3667 struct dp_netdev_flow,
3668 node);
3669 }
3670             /* When the current pmd thread is finished being dumped, move
3671              * on to the next one. */
3672 if (n_flows < flow_limit) {
3673 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3674 dp_netdev_pmd_unref(pmd);
3675 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3676 if (!pmd) {
3677 dump->status = EOF;
3678 break;
3679 }
3680 }
3681             /* Keep the reference for the next caller. */
3682 dump->cur_pmd = pmd;
3683
3684 /* If the current dump is empty, do not exit the loop, since the
3685              * remaining pmds could have flows to be dumped. Just dump again
3686 * on the new 'pmd'. */
3687 } while (!n_flows);
3688 }
3689 ovs_mutex_unlock(&dump->mutex);
3690
3691 for (i = 0; i < n_flows; i++) {
3692 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3693 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3694 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3695 struct dpif_flow *f = &flows[i];
3696 struct ofpbuf key, mask;
3697
3698 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3699 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3700 dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
3701 dump->up.terse);
3702 }
3703
3704 return n_flows;
3705 }
3706
3707 static int
3708 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3709 OVS_NO_THREAD_SAFETY_ANALYSIS
3710 {
3711 struct dp_netdev *dp = get_dp_netdev(dpif);
3712 struct dp_netdev_pmd_thread *pmd;
3713 struct dp_packet_batch pp;
3714
3715 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3716 dp_packet_size(execute->packet) > UINT16_MAX) {
3717 return EINVAL;
3718 }
3719
3720 /* Tries finding the 'pmd'. If NULL is returned, that means
3721 * the current thread is a non-pmd thread and should use
3722 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3723 pmd = ovsthread_getspecific(dp->per_pmd_key);
3724 if (!pmd) {
3725 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3726 if (!pmd) {
3727 return EBUSY;
3728 }
3729 }
3730
3731 if (execute->probe) {
3732         /* If this is part of a probe, drop the packet, since executing
3733          * the action may actually cause spurious packets to be sent into
3734 * the network. */
3735 if (pmd->core_id == NON_PMD_CORE_ID) {
3736 dp_netdev_pmd_unref(pmd);
3737 }
3738 return 0;
3739 }
3740
3741     /* If the current thread is a non-pmd thread, acquire
3742 * the 'non_pmd_mutex'. */
3743 if (pmd->core_id == NON_PMD_CORE_ID) {
3744 ovs_mutex_lock(&dp->non_pmd_mutex);
3745 }
3746
3747 /* Update current time in PMD context. We don't care about EMC insertion
3748 * probability, because we are on a slow path. */
3749 pmd_thread_ctx_time_update(pmd);
3750
3751 /* The action processing expects the RSS hash to be valid, because
3752 * it's always initialized at the beginning of datapath processing.
3753 * In this case, though, 'execute->packet' may not have gone through
3754 * the datapath at all, it may have been generated by the upper layer
3755 * (OpenFlow packet-out, BFD frame, ...). */
3756 if (!dp_packet_rss_valid(execute->packet)) {
3757 dp_packet_set_rss_hash(execute->packet,
3758 flow_hash_5tuple(execute->flow, 0));
3759 }
3760
3761 dp_packet_batch_init_packet(&pp, execute->packet);
3762 pp.do_not_steal = true;
3763 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3764 execute->actions, execute->actions_len);
3765 dp_netdev_pmd_flush_output_packets(pmd, true);
3766
3767 if (pmd->core_id == NON_PMD_CORE_ID) {
3768 ovs_mutex_unlock(&dp->non_pmd_mutex);
3769 dp_netdev_pmd_unref(pmd);
3770 }
3771
3772 return 0;
3773 }
3774
3775 static void
3776 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3777 enum dpif_offload_type offload_type OVS_UNUSED)
3778 {
3779 size_t i;
3780
3781 for (i = 0; i < n_ops; i++) {
3782 struct dpif_op *op = ops[i];
3783
3784 switch (op->type) {
3785 case DPIF_OP_FLOW_PUT:
3786 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3787 break;
3788
3789 case DPIF_OP_FLOW_DEL:
3790 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3791 break;
3792
3793 case DPIF_OP_EXECUTE:
3794 op->error = dpif_netdev_execute(dpif, &op->execute);
3795 break;
3796
3797 case DPIF_OP_FLOW_GET:
3798 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3799 break;
3800 }
3801 }
3802 }
3803
3804 /* Enable or disable PMD auto load balancing. */
3805 static void
3806 set_pmd_auto_lb(struct dp_netdev *dp)
3807 {
3808 unsigned int cnt = 0;
3809 struct dp_netdev_pmd_thread *pmd;
3810 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3811
3812 bool enable_alb = false;
3813 bool multi_rxq = false;
3814 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
3815
3816     /* Ensure that there are at least 2 non-isolated PMDs and
3817      * that one of them is polling more than one rxq. */
3818 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3819 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3820 continue;
3821 }
3822
3823 if (hmap_count(&pmd->poll_list) > 1) {
3824 multi_rxq = true;
3825 }
3826 if (cnt && multi_rxq) {
3827 enable_alb = true;
3828 break;
3829 }
3830 cnt++;
3831 }
3832
3833     /* Enable auto LB if it is requested and cycle-based assignment is enabled. */
3834 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
3835 pmd_alb->auto_lb_requested;
3836
3837 if (pmd_alb->is_enabled != enable_alb) {
3838 pmd_alb->is_enabled = enable_alb;
3839 if (pmd_alb->is_enabled) {
3840 VLOG_INFO("PMD auto load balance is enabled "
3841 "(with rebalance interval:%"PRIu64" msec)",
3842 pmd_alb->rebalance_intvl);
3843 } else {
3844 pmd_alb->rebalance_poll_timer = 0;
3845 VLOG_INFO("PMD auto load balance is disabled");
3846 }
3847 }
3848
3849 }
3850
3851 /* Applies datapath configuration from the database. Some of the changes are
3852 * actually applied in dpif_netdev_run(). */
3853 static int
3854 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3855 {
3856 struct dp_netdev *dp = get_dp_netdev(dpif);
3857 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3858 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3859 "cycles");
3860 unsigned long long insert_prob =
3861 smap_get_ullong(other_config, "emc-insert-inv-prob",
3862 DEFAULT_EM_FLOW_INSERT_INV_PROB);
3863 uint32_t insert_min, cur_min;
3864 uint32_t tx_flush_interval, cur_tx_flush_interval;
3865 uint64_t rebalance_intvl;
3866
3867 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3868 DEFAULT_TX_FLUSH_INTERVAL);
3869 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3870 if (tx_flush_interval != cur_tx_flush_interval) {
3871 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3872 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3873 tx_flush_interval);
3874 }
3875
3876 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3877 free(dp->pmd_cmask);
3878 dp->pmd_cmask = nullable_xstrdup(cmask);
3879 dp_netdev_request_reconfigure(dp);
3880 }
3881
3882 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
3883 if (insert_prob <= UINT32_MAX) {
3884 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
3885 } else {
3886 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
3887 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
3888 }
3889
3890 if (insert_min != cur_min) {
3891 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
3892 if (insert_min == 0) {
3893 VLOG_INFO("EMC insertion probability changed to zero");
3894 } else {
3895 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
3896 insert_prob, (100 / (float)insert_prob));
3897 }
3898 }
3899
3900 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
3901 bool cur_perf_enabled;
3902 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
3903 if (perf_enabled != cur_perf_enabled) {
3904 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
3905 if (perf_enabled) {
3906 VLOG_INFO("PMD performance metrics collection enabled");
3907 } else {
3908 VLOG_INFO("PMD performance metrics collection disabled");
3909 }
3910 }
3911
3912 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
3913 bool cur_smc;
3914 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
3915 if (smc_enable != cur_smc) {
3916 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
3917 if (smc_enable) {
3918 VLOG_INFO("SMC cache is enabled");
3919 } else {
3920 VLOG_INFO("SMC cache is disabled");
3921 }
3922 }
3923
3924 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
3925 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
3926 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
3927 "Defaulting to 'cycles'.");
3928 pmd_rxq_assign_cyc = true;
3929 pmd_rxq_assign = "cycles";
3930 }
3931 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
3932 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
3933 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
3934 pmd_rxq_assign);
3935 dp_netdev_request_reconfigure(dp);
3936 }
3937
3938 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3939 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
3940 false);
3941
3942 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
3943 ALB_PMD_REBALANCE_POLL_INTERVAL);
3944
3945     /* Input is in minutes; convert it to msec. */
3946 rebalance_intvl =
3947 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
3948
3949 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
3950 pmd_alb->rebalance_intvl = rebalance_intvl;
3951 }
3952
3953 set_pmd_auto_lb(dp);
3954 return 0;
3955 }
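/* A minimal usage sketch of how the keys handled above reach this function,
 * assuming a caller that already holds a valid 'dpif': build an smap of
 * other_config entries and hand it to the set_config hook, e.g.
 *
 *     struct smap cfg = SMAP_INITIALIZER(&cfg);
 *     smap_add(&cfg, "pmd-auto-lb", "true");
 *     smap_add(&cfg, "emc-insert-inv-prob", "20");
 *     dpif_netdev_set_config(dpif, &cfg);
 *     smap_destroy(&cfg);
 *
 * An "emc-insert-inv-prob" of 20 translates to roughly a 5% EMC insertion
 * probability (insert_min = UINT32_MAX / 20). In a running switch these
 * values normally arrive from the Open_vSwitch table's other_config column
 * via the dpif layer. */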
3956
3957 /* Parses affinity list and returns result in 'core_ids'. */
3958 static int
3959 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
3960 {
3961 unsigned i;
3962 char *list, *copy, *key, *value;
3963 int error = 0;
3964
3965 for (i = 0; i < n_rxq; i++) {
3966 core_ids[i] = OVS_CORE_UNSPEC;
3967 }
3968
3969 if (!affinity_list) {
3970 return 0;
3971 }
3972
3973 list = copy = xstrdup(affinity_list);
3974
3975 while (ofputil_parse_key_value(&list, &key, &value)) {
3976 int rxq_id, core_id;
3977
3978 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
3979 || !str_to_int(value, 0, &core_id) || core_id < 0) {
3980 error = EINVAL;
3981 break;
3982 }
3983
3984 if (rxq_id < n_rxq) {
3985 core_ids[rxq_id] = core_id;
3986 }
3987 }
3988
3989 free(copy);
3990 return error;
3991 }
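/* A small illustrative sketch, assuming the comma-separated "rxq:core" pair
 * syntax parsed above and a port with two rx queues:
 *
 *     unsigned core_ids[2];
 *     int error = parse_affinity_list("0:3,1:7", core_ids, 2);
 *
 * On success 'error' is 0, core_ids[0] == 3 and core_ids[1] == 7; any rx
 * queue not mentioned in the list is left as OVS_CORE_UNSPEC. */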
3992
3993 /* Parses 'affinity_list' and applies configuration if it is valid. */
3994 static int
3995 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
3996 const char *affinity_list)
3997 {
3998 unsigned *core_ids, i;
3999 int error = 0;
4000
4001 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4002 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4003 error = EINVAL;
4004 goto exit;
4005 }
4006
4007 for (i = 0; i < port->n_rxq; i++) {
4008 port->rxqs[i].core_id = core_ids[i];
4009 }
4010
4011 exit:
4012 free(core_ids);
4013 return error;
4014 }
4015
4016 /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4017  * of the given PMD thread. */
4018 static bool
4019 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4020 struct dp_netdev_port *port)
4021 OVS_EXCLUDED(pmd->port_mutex)
4022 {
4023 struct rxq_poll *poll;
4024 bool found = false;
4025
4026 ovs_mutex_lock(&pmd->port_mutex);
4027 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4028 if (port == poll->rxq->port) {
4029 found = true;
4030 break;
4031 }
4032 }
4033 ovs_mutex_unlock(&pmd->port_mutex);
4034 return found;
4035 }
4036
4037 /* Updates port configuration from the database. The changes are actually
4038 * applied in dpif_netdev_run(). */
4039 static int
4040 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4041 const struct smap *cfg)
4042 {
4043 struct dp_netdev *dp = get_dp_netdev(dpif);
4044 struct dp_netdev_port *port;
4045 int error = 0;
4046 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4047 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4048
4049 ovs_mutex_lock(&dp->port_mutex);
4050 error = get_port_by_number(dp, port_no, &port);
4051 if (error) {
4052 goto unlock;
4053 }
4054
4055 if (emc_enabled != port->emc_enabled) {
4056 struct dp_netdev_pmd_thread *pmd;
4057 struct ds ds = DS_EMPTY_INITIALIZER;
4058 uint32_t cur_min, insert_prob;
4059
4060 port->emc_enabled = emc_enabled;
4061         /* Mark for reload all the threads that poll this port and request
4062          * a reconfiguration for the actual reloading of threads. */
4063 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4064 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4065 pmd->need_reload = true;
4066 }
4067 }
4068 dp_netdev_request_reconfigure(dp);
4069
4070 ds_put_format(&ds, "%s: EMC has been %s.",
4071 netdev_get_name(port->netdev),
4072 (emc_enabled) ? "enabled" : "disabled");
4073 if (emc_enabled) {
4074 ds_put_cstr(&ds, " Current insertion probability is ");
4075 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4076 if (!cur_min) {
4077 ds_put_cstr(&ds, "zero.");
4078 } else {
4079 insert_prob = UINT32_MAX / cur_min;
4080 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4081 insert_prob, 100 / (float) insert_prob);
4082 }
4083 }
4084 VLOG_INFO("%s", ds_cstr(&ds));
4085 ds_destroy(&ds);
4086 }
4087
4088 /* Checking for RXq affinity changes. */
4089 if (!netdev_is_pmd(port->netdev)
4090 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4091 goto unlock;
4092 }
4093
4094 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4095 if (error) {
4096 goto unlock;
4097 }
4098 free(port->rxq_affinity_list);
4099 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4100
4101 dp_netdev_request_reconfigure(dp);
4102 unlock:
4103 ovs_mutex_unlock(&dp->port_mutex);
4104 return error;
4105 }
4106
4107 static int
4108 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4109 uint32_t queue_id, uint32_t *priority)
4110 {
4111 *priority = queue_id;
4112 return 0;
4113 }
4114
4115 \f
4116 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4117  * a copy of the 'size' bytes of the 'actions' input parameter. */
4118 struct dp_netdev_actions *
4119 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4120 {
4121 struct dp_netdev_actions *netdev_actions;
4122
4123 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4124 memcpy(netdev_actions->actions, actions, size);
4125 netdev_actions->size = size;
4126
4127 return netdev_actions;
4128 }
4129
4130 struct dp_netdev_actions *
4131 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4132 {
4133 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4134 }
4135
4136 static void
4137 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4138 {
4139 free(actions);
4140 }
4141 \f
4142 static void
4143 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4144 enum rxq_cycles_counter_type type,
4145 unsigned long long cycles)
4146 {
4147 atomic_store_relaxed(&rx->cycles[type], cycles);
4148 }
4149
4150 static void
4151 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4152 enum rxq_cycles_counter_type type,
4153 unsigned long long cycles)
4154 {
4155 non_atomic_ullong_add(&rx->cycles[type], cycles);
4156 }
4157
4158 static uint64_t
4159 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4160 enum rxq_cycles_counter_type type)
4161 {
4162 unsigned long long processing_cycles;
4163 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4164 return processing_cycles;
4165 }
4166
4167 static void
4168 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4169 unsigned long long cycles)
4170 {
4171 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4172 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4173 }
4174
4175 static uint64_t
4176 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4177 {
4178 unsigned long long processing_cycles;
4179 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4180 return processing_cycles;
4181 }
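/* A minimal sketch of how the interval ring above is typically consumed:
 * dp_netdev_rxq_set_intrvl_cycles() overwrites the oldest of the
 * PMD_RXQ_INTERVAL_MAX slots in turn, so summing the slots gives the recent
 * cycle history used for scheduling (see rxq_scheduling() and
 * get_dry_run_variance() below):
 *
 *     uint64_t cycle_hist = 0;
 *
 *     for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
 *         cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
 *     }
 *     dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST, cycle_hist);
 */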
4182
4183 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4184 static inline bool
4185 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4186 {
4187 bool pmd_perf_enabled;
4188 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4189 return pmd_perf_enabled;
4190 }
4191 #else
4192 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4193 * performance metrics are not available as locked access to 64 bit
4194 * integers would be prohibitively expensive. */
4195 static inline bool
4196 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4197 {
4198 return false;
4199 }
4200 #endif
4201
4202 static int
4203 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4204 struct tx_port *p)
4205 {
4206 int i;
4207 int tx_qid;
4208 int output_cnt;
4209 bool dynamic_txqs;
4210 struct cycle_timer timer;
4211 uint64_t cycles;
4212 uint32_t tx_flush_interval;
4213
4214 cycle_timer_start(&pmd->perf_stats, &timer);
4215
4216 dynamic_txqs = p->port->dynamic_txqs;
4217 if (dynamic_txqs) {
4218 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4219 } else {
4220 tx_qid = pmd->static_tx_qid;
4221 }
4222
4223 output_cnt = dp_packet_batch_size(&p->output_pkts);
4224 ovs_assert(output_cnt > 0);
4225
4226 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4227 dp_packet_batch_init(&p->output_pkts);
4228
4229 /* Update time of the next flush. */
4230 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4231 p->flush_time = pmd->ctx.now + tx_flush_interval;
4232
4233 ovs_assert(pmd->n_output_batches > 0);
4234 pmd->n_output_batches--;
4235
4236 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4237 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4238
4239 /* Distribute send cycles evenly among transmitted packets and assign to
4240 * their respective rx queues. */
4241 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4242 for (i = 0; i < output_cnt; i++) {
4243 if (p->output_pkts_rxqs[i]) {
4244 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4245 RXQ_CYCLES_PROC_CURR, cycles);
4246 }
4247 }
4248
4249 return output_cnt;
4250 }
4251
4252 static int
4253 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4254 bool force)
4255 {
4256 struct tx_port *p;
4257 int output_cnt = 0;
4258
4259 if (!pmd->n_output_batches) {
4260 return 0;
4261 }
4262
4263 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4264 if (!dp_packet_batch_is_empty(&p->output_pkts)
4265 && (force || pmd->ctx.now >= p->flush_time)) {
4266 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4267 }
4268 }
4269 return output_cnt;
4270 }
4271
4272 static int
4273 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4274 struct dp_netdev_rxq *rxq,
4275 odp_port_t port_no)
4276 {
4277 struct pmd_perf_stats *s = &pmd->perf_stats;
4278 struct dp_packet_batch batch;
4279 struct cycle_timer timer;
4280 int error;
4281 int batch_cnt = 0;
4282 int rem_qlen = 0, *qlen_p = NULL;
4283 uint64_t cycles;
4284
4285 /* Measure duration for polling and processing rx burst. */
4286 cycle_timer_start(&pmd->perf_stats, &timer);
4287
4288 pmd->ctx.last_rxq = rxq;
4289 dp_packet_batch_init(&batch);
4290
4291 /* Fetch the rx queue length only for vhostuser ports. */
4292 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4293 qlen_p = &rem_qlen;
4294 }
4295
4296 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4297 if (!error) {
4298 /* At least one packet received. */
4299 *recirc_depth_get() = 0;
4300 pmd_thread_ctx_time_update(pmd);
4301 batch_cnt = batch.count;
4302 if (pmd_perf_metrics_enabled(pmd)) {
4303 /* Update batch histogram. */
4304 s->current.batches++;
4305 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4306 /* Update the maximum vhost rx queue fill level. */
4307 if (rxq->is_vhost && rem_qlen >= 0) {
4308 uint32_t qfill = batch_cnt + rem_qlen;
4309 if (qfill > s->current.max_vhost_qfill) {
4310 s->current.max_vhost_qfill = qfill;
4311 }
4312 }
4313 }
4314 /* Process packet batch. */
4315 dp_netdev_input(pmd, &batch, port_no);
4316
4317 /* Assign processing cycles to rx queue. */
4318 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4319 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4320
4321 dp_netdev_pmd_flush_output_packets(pmd, false);
4322 } else {
4323 /* Discard cycles. */
4324 cycle_timer_stop(&pmd->perf_stats, &timer);
4325 if (error != EAGAIN && error != EOPNOTSUPP) {
4326 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4327
4328 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4329 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4330 }
4331 }
4332
4333 pmd->ctx.last_rxq = NULL;
4334
4335 return batch_cnt;
4336 }
4337
4338 static struct tx_port *
4339 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4340 {
4341 struct tx_port *tx;
4342
4343 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4344 if (tx->port->port_no == port_no) {
4345 return tx;
4346 }
4347 }
4348
4349 return NULL;
4350 }
4351
4352 static int
4353 port_reconfigure(struct dp_netdev_port *port)
4354 {
4355 struct netdev *netdev = port->netdev;
4356 int i, err;
4357
4358 /* Closes the existing 'rxq's. */
4359 for (i = 0; i < port->n_rxq; i++) {
4360 netdev_rxq_close(port->rxqs[i].rx);
4361 port->rxqs[i].rx = NULL;
4362 }
4363 unsigned last_nrxq = port->n_rxq;
4364 port->n_rxq = 0;
4365
4366 /* Allows 'netdev' to apply the pending configuration changes. */
4367 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4368 err = netdev_reconfigure(netdev);
4369 if (err && (err != EOPNOTSUPP)) {
4370 VLOG_ERR("Failed to set interface %s new configuration",
4371 netdev_get_name(netdev));
4372 return err;
4373 }
4374 }
4375 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4376 port->rxqs = xrealloc(port->rxqs,
4377 sizeof *port->rxqs * netdev_n_rxq(netdev));
4378 /* Realloc 'used' counters for tx queues. */
4379 free(port->txq_used);
4380 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4381
4382 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4383 bool new_queue = i >= last_nrxq;
4384 if (new_queue) {
4385 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4386 }
4387
4388 port->rxqs[i].port = port;
4389 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4390
4391 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4392 if (err) {
4393 return err;
4394 }
4395 port->n_rxq++;
4396 }
4397
4398 /* Parse affinity list to apply configuration for new queues. */
4399 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4400
4401     /* If reconfiguration was successful, mark it as such so we can use it. */
4402 port->need_reconfigure = false;
4403
4404 return 0;
4405 }
4406
4407 struct rr_numa_list {
4408 struct hmap numas; /* Contains 'struct rr_numa' */
4409 };
4410
4411 struct rr_numa {
4412 struct hmap_node node;
4413
4414 int numa_id;
4415
4416 /* Non isolated pmds on numa node 'numa_id' */
4417 struct dp_netdev_pmd_thread **pmds;
4418 int n_pmds;
4419
4420 int cur_index;
4421 bool idx_inc;
4422 };
4423
4424 static struct rr_numa *
4425 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4426 {
4427 struct rr_numa *numa;
4428
4429 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4430 if (numa->numa_id == numa_id) {
4431 return numa;
4432 }
4433 }
4434
4435 return NULL;
4436 }
4437
4438 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4439 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4440 * Returns NULL if 'rr' numa list is empty. */
4441 static struct rr_numa *
4442 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4443 {
4444 struct hmap_node *node = NULL;
4445
4446 if (numa) {
4447 node = hmap_next(&rr->numas, &numa->node);
4448 }
4449 if (!node) {
4450 node = hmap_first(&rr->numas);
4451 }
4452
4453 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4454 }
4455
4456 static void
4457 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4458 {
4459 struct dp_netdev_pmd_thread *pmd;
4460 struct rr_numa *numa;
4461
4462 hmap_init(&rr->numas);
4463
4464 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4465 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4466 continue;
4467 }
4468
4469 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4470 if (!numa) {
4471 numa = xzalloc(sizeof *numa);
4472 numa->numa_id = pmd->numa_id;
4473 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4474 }
4475 numa->n_pmds++;
4476 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4477 numa->pmds[numa->n_pmds - 1] = pmd;
4478         /* At least one pmd, so initialise cur_index and idx_inc. */
4479 numa->cur_index = 0;
4480 numa->idx_inc = true;
4481 }
4482 }
4483
4484 /*
4485 * Returns the next pmd from the numa node.
4486 *
4487 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4488 * either an up or down walk, switching between up/down when the first or last
4489 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4490 *
4491 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4492 * core reached. e.g. 1,2,3,1,2,3,1,2...
4493 */
4494 static struct dp_netdev_pmd_thread *
4495 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4496 {
4497 int numa_idx = numa->cur_index;
4498
4499 if (numa->idx_inc == true) {
4500 /* Incrementing through list of pmds. */
4501 if (numa->cur_index == numa->n_pmds-1) {
4502 /* Reached the last pmd. */
4503 if (updown) {
4504 numa->idx_inc = false;
4505 } else {
4506 numa->cur_index = 0;
4507 }
4508 } else {
4509 numa->cur_index++;
4510 }
4511 } else {
4512 /* Decrementing through list of pmds. */
4513 if (numa->cur_index == 0) {
4514 /* Reached the first pmd. */
4515 numa->idx_inc = true;
4516 } else {
4517 numa->cur_index--;
4518 }
4519 }
4520 return numa->pmds[numa_idx];
4521 }
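/* A short illustrative sketch, assuming a numa node whose 'pmds' array holds
 * three pmds at indices 0, 1 and 2, freshly populated by
 * rr_numa_list_populate():
 *
 *     successive rr_numa_get_pmd(numa, true)  calls: 0, 1, 2, 2, 1, 0, 0, ...
 *     successive rr_numa_get_pmd(numa, false) calls: 0, 1, 2, 0, 1, 2, ...
 *
 * i.e. 'updown' == true bounces between the ends of the pmd array, which
 * helps the cycle-based assignment spread the busiest queues across
 * different pmds. */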
4522
4523 static void
4524 rr_numa_list_destroy(struct rr_numa_list *rr)
4525 {
4526 struct rr_numa *numa;
4527
4528 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4529 free(numa->pmds);
4530 free(numa);
4531 }
4532 hmap_destroy(&rr->numas);
4533 }
4534
4535 /* Sort Rx Queues by the processing cycles they are consuming. */
4536 static int
4537 compare_rxq_cycles(const void *a, const void *b)
4538 {
4539 struct dp_netdev_rxq *qa;
4540 struct dp_netdev_rxq *qb;
4541 uint64_t cycles_qa, cycles_qb;
4542
4543 qa = *(struct dp_netdev_rxq **) a;
4544 qb = *(struct dp_netdev_rxq **) b;
4545
4546 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4547 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4548
4549 if (cycles_qa != cycles_qb) {
4550 return (cycles_qa < cycles_qb) ? 1 : -1;
4551 } else {
4552 /* Cycles are the same so tiebreak on port/queue id.
4553 * Tiebreaking (as opposed to return 0) ensures consistent
4554 * sort results across multiple OS's. */
4555 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4556 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4557 if (port_qa != port_qb) {
4558 return port_qa > port_qb ? 1 : -1;
4559 } else {
4560 return netdev_rxq_get_queue_id(qa->rx)
4561 - netdev_rxq_get_queue_id(qb->rx);
4562 }
4563 }
4564 }
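/* A brief usage sketch: because the comparator returns 1 when 'qa' consumed
 * fewer cycles than 'qb', qsort() produces a descending order, i.e. the
 * busiest rx queues come first:
 *
 *     qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
 *
 * after which rxqs[0] holds the queue with the largest RXQ_CYCLES_PROC_HIST
 * value (this is how rxq_scheduling() below uses it). */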
4565
4566 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4567 * queues and marks the pmds as isolated. Otherwise, assign non isolated
4568 * pmds to unpinned queues.
4569 *
4570 * The function doesn't touch the pmd threads, it just stores the assignment
4571 * in the 'pmd' member of each rxq. */
4572 static void
4573 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4574 {
4575 struct dp_netdev_port *port;
4576 struct rr_numa_list rr;
4577 struct rr_numa *non_local_numa = NULL;
4578 struct dp_netdev_rxq ** rxqs = NULL;
4579 int n_rxqs = 0;
4580 struct rr_numa *numa = NULL;
4581 int numa_id;
4582 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4583
4584 HMAP_FOR_EACH (port, node, &dp->ports) {
4585 if (!netdev_is_pmd(port->netdev)) {
4586 continue;
4587 }
4588
4589 for (int qid = 0; qid < port->n_rxq; qid++) {
4590 struct dp_netdev_rxq *q = &port->rxqs[qid];
4591
4592 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4593 struct dp_netdev_pmd_thread *pmd;
4594
4595 pmd = dp_netdev_get_pmd(dp, q->core_id);
4596 if (!pmd) {
4597 VLOG_WARN("There is no PMD thread on core %d. Queue "
4598 "%d on port \'%s\' will not be polled.",
4599 q->core_id, qid, netdev_get_name(port->netdev));
4600 } else {
4601 q->pmd = pmd;
4602 pmd->isolated = true;
4603 dp_netdev_pmd_unref(pmd);
4604 }
4605 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4606 uint64_t cycle_hist = 0;
4607
4608 if (n_rxqs == 0) {
4609 rxqs = xmalloc(sizeof *rxqs);
4610 } else {
4611 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4612 }
4613
4614 if (assign_cyc) {
4615 /* Sum the queue intervals and store the cycle history. */
4616 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4617 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4618 }
4619 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4620 cycle_hist);
4621 }
4622 /* Store the queue. */
4623 rxqs[n_rxqs++] = q;
4624 }
4625 }
4626 }
4627
4628 if (n_rxqs > 1 && assign_cyc) {
4629 /* Sort the queues in order of the processing cycles
4630 * they consumed during their last pmd interval. */
4631 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4632 }
4633
4634 rr_numa_list_populate(dp, &rr);
4635 /* Assign the sorted queues to pmds in round robin. */
4636 for (int i = 0; i < n_rxqs; i++) {
4637 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4638 numa = rr_numa_list_lookup(&rr, numa_id);
4639 if (!numa) {
4640 /* There are no pmds on the queue's local NUMA node.
4641 Round robin on the NUMA nodes that do have pmds. */
4642 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4643 if (!non_local_numa) {
4644 VLOG_ERR("There is no available (non-isolated) pmd "
4645 "thread for port \'%s\' queue %d. This queue "
4646 "will not be polled. Is pmd-cpu-mask set to "
4647 "zero? Or are all PMDs isolated to other "
4648 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4649 netdev_rxq_get_queue_id(rxqs[i]->rx));
4650 continue;
4651 }
4652 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
4653 VLOG_WARN("There's no available (non-isolated) pmd thread "
4654 "on numa node %d. Queue %d on port \'%s\' will "
4655 "be assigned to the pmd on core %d "
4656 "(numa node %d). Expect reduced performance.",
4657 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4658 netdev_rxq_get_name(rxqs[i]->rx),
4659 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4660 } else {
4661 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4662 if (assign_cyc) {
4663 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4664 "rx queue %d "
4665 "(measured processing cycles %"PRIu64").",
4666 rxqs[i]->pmd->core_id, numa_id,
4667 netdev_rxq_get_name(rxqs[i]->rx),
4668 netdev_rxq_get_queue_id(rxqs[i]->rx),
4669 dp_netdev_rxq_get_cycles(rxqs[i],
4670 RXQ_CYCLES_PROC_HIST));
4671 } else {
4672 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4673 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4674 netdev_rxq_get_name(rxqs[i]->rx),
4675 netdev_rxq_get_queue_id(rxqs[i]->rx));
4676 }
4677 }
4678 }
4679
4680 rr_numa_list_destroy(&rr);
4681 free(rxqs);
4682 }
4683
4684 static void
4685 reload_affected_pmds(struct dp_netdev *dp)
4686 {
4687 struct dp_netdev_pmd_thread *pmd;
4688
4689 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4690 if (pmd->need_reload) {
4691 flow_mark_flush(pmd);
4692 dp_netdev_reload_pmd__(pmd);
4693 }
4694 }
4695
4696 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4697 if (pmd->need_reload) {
4698 if (pmd->core_id != NON_PMD_CORE_ID) {
4699 bool reload;
4700
4701 do {
4702 atomic_read_explicit(&pmd->reload, &reload,
4703 memory_order_acquire);
4704 } while (reload);
4705 }
4706 pmd->need_reload = false;
4707 }
4708 }
4709 }
4710
4711 static void
4712 reconfigure_pmd_threads(struct dp_netdev *dp)
4713 OVS_REQUIRES(dp->port_mutex)
4714 {
4715 struct dp_netdev_pmd_thread *pmd;
4716 struct ovs_numa_dump *pmd_cores;
4717 struct ovs_numa_info_core *core;
4718 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4719 struct hmapx_node *node;
4720 bool changed = false;
4721 bool need_to_adjust_static_tx_qids = false;
4722
4723 /* The pmd threads should be started only if there's a pmd port in the
4724 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4725 * NR_PMD_THREADS per numa node. */
4726 if (!has_pmd_port(dp)) {
4727 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4728 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4729 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4730 } else {
4731 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4732 }
4733
4734     /* We need to adjust 'static_tx_qid's only if we're reducing the number
4735      * of PMD threads. Otherwise, new threads will allocate all the freed ids. */
4736 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4737 /* Adjustment is required to keep 'static_tx_qid's sequential and
4738 * avoid possible issues, for example, imbalanced tx queue usage
4739 * and unnecessary locking caused by remapping on netdev level. */
4740 need_to_adjust_static_tx_qids = true;
4741 }
4742
4743 /* Check for unwanted pmd threads */
4744 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4745 if (pmd->core_id == NON_PMD_CORE_ID) {
4746 continue;
4747 }
4748 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4749 pmd->core_id)) {
4750 hmapx_add(&to_delete, pmd);
4751 } else if (need_to_adjust_static_tx_qids) {
4752 atomic_store_relaxed(&pmd->reload_tx_qid, true);
4753 pmd->need_reload = true;
4754 }
4755 }
4756
4757 HMAPX_FOR_EACH (node, &to_delete) {
4758 pmd = (struct dp_netdev_pmd_thread *) node->data;
4759 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4760 pmd->numa_id, pmd->core_id);
4761 dp_netdev_del_pmd(dp, pmd);
4762 }
4763 changed = !hmapx_is_empty(&to_delete);
4764 hmapx_destroy(&to_delete);
4765
4766 if (need_to_adjust_static_tx_qids) {
4767 /* 'static_tx_qid's are not sequential now.
4768 * Reload remaining threads to fix this. */
4769 reload_affected_pmds(dp);
4770 }
4771
4772 /* Check for required new pmd threads */
4773 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4774 pmd = dp_netdev_get_pmd(dp, core->core_id);
4775 if (!pmd) {
4776 pmd = xzalloc(sizeof *pmd);
4777 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4778 pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
4779 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4780 pmd->numa_id, pmd->core_id);
4781 changed = true;
4782 } else {
4783 dp_netdev_pmd_unref(pmd);
4784 }
4785 }
4786
4787 if (changed) {
4788 struct ovs_numa_info_numa *numa;
4789
4790 /* Log the number of pmd threads per numa node. */
4791 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
4792 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
4793 numa->n_cores, numa->numa_id);
4794 }
4795 }
4796
4797 ovs_numa_dump_destroy(pmd_cores);
4798 }
4799
4800 static void
4801 pmd_remove_stale_ports(struct dp_netdev *dp,
4802 struct dp_netdev_pmd_thread *pmd)
4803 OVS_EXCLUDED(pmd->port_mutex)
4804 OVS_REQUIRES(dp->port_mutex)
4805 {
4806 struct rxq_poll *poll, *poll_next;
4807 struct tx_port *tx, *tx_next;
4808
4809 ovs_mutex_lock(&pmd->port_mutex);
4810 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4811 struct dp_netdev_port *port = poll->rxq->port;
4812
4813 if (port->need_reconfigure
4814 || !hmap_contains(&dp->ports, &port->node)) {
4815 dp_netdev_del_rxq_from_pmd(pmd, poll);
4816 }
4817 }
4818 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
4819 struct dp_netdev_port *port = tx->port;
4820
4821 if (port->need_reconfigure
4822 || !hmap_contains(&dp->ports, &port->node)) {
4823 dp_netdev_del_port_tx_from_pmd(pmd, tx);
4824 }
4825 }
4826 ovs_mutex_unlock(&pmd->port_mutex);
4827 }
4828
4829 /* Must be called each time a port is added/removed or the cmask changes.
4830 * This creates and destroys pmd threads, reconfigures ports, opens their
4831 * rxqs and assigns all rxqs/txqs to pmd threads. */
4832 static void
4833 reconfigure_datapath(struct dp_netdev *dp)
4834 OVS_REQUIRES(dp->port_mutex)
4835 {
4836 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
4837 struct dp_netdev_pmd_thread *pmd;
4838 struct dp_netdev_port *port;
4839 int wanted_txqs;
4840
4841 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
4842
4843 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
4844 * on the system and the user configuration. */
4845 reconfigure_pmd_threads(dp);
4846
4847 wanted_txqs = cmap_count(&dp->poll_threads);
4848
4849 /* The number of pmd threads might have changed, or a port can be new:
4850 * adjust the txqs. */
4851 HMAP_FOR_EACH (port, node, &dp->ports) {
4852 netdev_set_tx_multiq(port->netdev, wanted_txqs);
4853 }
4854
4855 /* Step 2: Remove from the pmd threads ports that have been removed or
4856 * need reconfiguration. */
4857
4858 /* Check for all the ports that need reconfiguration. We cache this in
4859 * 'port->need_reconfigure', because netdev_is_reconf_required() can
4860 * change at any time. */
4861 HMAP_FOR_EACH (port, node, &dp->ports) {
4862 if (netdev_is_reconf_required(port->netdev)) {
4863 port->need_reconfigure = true;
4864 }
4865 }
4866
4867 /* Remove from the pmd threads all the ports that have been deleted or
4868 * need reconfiguration. */
4869 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4870 pmd_remove_stale_ports(dp, pmd);
4871 }
4872
4873 /* Reload affected pmd threads. We must wait for the pmd threads before
4874 * reconfiguring the ports, because a port cannot be reconfigured while
4875 * it's being used. */
4876 reload_affected_pmds(dp);
4877
4878 /* Step 3: Reconfigure ports. */
4879
4880 /* We only reconfigure the ports that we determined above, because they're
4881 * not being used by any pmd thread at the moment. If a port fails to
4882 * reconfigure we remove it from the datapath. */
4883 struct dp_netdev_port *next_port;
4884 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
4885 int err;
4886
4887 if (!port->need_reconfigure) {
4888 continue;
4889 }
4890
4891 err = port_reconfigure(port);
4892 if (err) {
4893 hmap_remove(&dp->ports, &port->node);
4894 seq_change(dp->port_seq);
4895 port_destroy(port);
4896 } else {
4897 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
4898 }
4899 }
4900
4901 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
4902 * for now, we just update the 'pmd' pointer in each rxq to point to the
4903 * wanted thread according to the scheduling policy. */
4904
4905 /* Reset all the pmd threads to non isolated. */
4906 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4907 pmd->isolated = false;
4908 }
4909
4910 /* Reset all the queues to unassigned */
4911 HMAP_FOR_EACH (port, node, &dp->ports) {
4912 for (int i = 0; i < port->n_rxq; i++) {
4913 port->rxqs[i].pmd = NULL;
4914 }
4915 }
4916
4917 /* Add pinned queues and mark pmd threads isolated. */
4918 rxq_scheduling(dp, true);
4919
4920 /* Add non-pinned queues. */
4921 rxq_scheduling(dp, false);
4922
4923 /* Step 5: Remove queues not compliant with new scheduling. */
4924
4925 /* Count all the threads that will have at least one queue to poll. */
4926 HMAP_FOR_EACH (port, node, &dp->ports) {
4927 for (int qid = 0; qid < port->n_rxq; qid++) {
4928 struct dp_netdev_rxq *q = &port->rxqs[qid];
4929
4930 if (q->pmd) {
4931 hmapx_add(&busy_threads, q->pmd);
4932 }
4933 }
4934 }
4935
4936 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4937 struct rxq_poll *poll, *poll_next;
4938
4939 ovs_mutex_lock(&pmd->port_mutex);
4940 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
4941 if (poll->rxq->pmd != pmd) {
4942 dp_netdev_del_rxq_from_pmd(pmd, poll);
4943
4944 /* This pmd might sleep after this step if it has no rxq
4945 * remaining. Tell it to busy wait for new assignment if it
4946 * has at least one scheduled queue. */
4947 if (hmap_count(&pmd->poll_list) == 0 &&
4948 hmapx_contains(&busy_threads, pmd)) {
4949 atomic_store_relaxed(&pmd->wait_for_reload, true);
4950 }
4951 }
4952 }
4953 ovs_mutex_unlock(&pmd->port_mutex);
4954 }
4955
4956 hmapx_destroy(&busy_threads);
4957
4958 /* Reload affected pmd threads. We must wait for the pmd threads to remove
4959      * the old queues before re-adding them, otherwise a queue can be polled by
4960 * two threads at the same time. */
4961 reload_affected_pmds(dp);
4962
4963 /* Step 6: Add queues from scheduling, if they're not there already. */
4964 HMAP_FOR_EACH (port, node, &dp->ports) {
4965 if (!netdev_is_pmd(port->netdev)) {
4966 continue;
4967 }
4968
4969 for (int qid = 0; qid < port->n_rxq; qid++) {
4970 struct dp_netdev_rxq *q = &port->rxqs[qid];
4971
4972 if (q->pmd) {
4973 ovs_mutex_lock(&q->pmd->port_mutex);
4974 dp_netdev_add_rxq_to_pmd(q->pmd, q);
4975 ovs_mutex_unlock(&q->pmd->port_mutex);
4976 }
4977 }
4978 }
4979
4980 /* Add every port to the tx cache of every pmd thread, if it's not
4981 * there already and if this pmd has at least one rxq to poll. */
4982 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4983 ovs_mutex_lock(&pmd->port_mutex);
4984 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
4985 HMAP_FOR_EACH (port, node, &dp->ports) {
4986 dp_netdev_add_port_tx_to_pmd(pmd, port);
4987 }
4988 }
4989 ovs_mutex_unlock(&pmd->port_mutex);
4990 }
4991
4992 /* Reload affected pmd threads. */
4993 reload_affected_pmds(dp);
4994
4995 /* Check if PMD Auto LB is to be enabled */
4996 set_pmd_auto_lb(dp);
4997 }
4998
4999 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5000 static bool
5001 ports_require_restart(const struct dp_netdev *dp)
5002 OVS_REQUIRES(dp->port_mutex)
5003 {
5004 struct dp_netdev_port *port;
5005
5006 HMAP_FOR_EACH (port, node, &dp->ports) {
5007 if (netdev_is_reconf_required(port->netdev)) {
5008 return true;
5009 }
5010 }
5011
5012 return false;
5013 }
5014
5015 /* Calculates the variance of the values stored in array 'a'. 'n' is the
5016  * number of elements in the array to be considered when calculating the
5017  * variance. Usage example: data array 'a' contains the processing load of
5018  * each pmd and 'n' is the number of PMDs. It returns the variance in the
5019  * processing load of the PMDs. */
5020 static uint64_t
5021 variance(uint64_t a[], int n)
5022 {
5023 /* Compute mean (average of elements). */
5024 uint64_t sum = 0;
5025 uint64_t mean = 0;
5026 uint64_t sqDiff = 0;
5027
5028 if (!n) {
5029 return 0;
5030 }
5031
5032 for (int i = 0; i < n; i++) {
5033 sum += a[i];
5034 }
5035
5036 if (sum) {
5037 mean = sum / n;
5038
5039 /* Compute sum squared differences with mean. */
5040 for (int i = 0; i < n; i++) {
5041 sqDiff += (a[i] - mean)*(a[i] - mean);
5042 }
5043 }
5044 return (sqDiff ? (sqDiff / n) : 0);
5045 }
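/* A small worked example of the integer arithmetic above, assuming three pmd
 * loads:
 *
 *     uint64_t loads[] = {40, 60, 80};
 *     uint64_t var = variance(loads, 3);
 *
 * mean = 180 / 3 = 60, sqDiff = 400 + 0 + 400 = 800, so 'var' ends up as
 * 800 / 3 = 266. */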
5046
5047
5048 /* Returns the variance in the PMDs' usage as part of a dry run of rxq
5049  * assignment to PMDs. */
5050 static bool
5051 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5052 uint32_t num_pmds, uint64_t *predicted_variance)
5053 OVS_REQUIRES(dp->port_mutex)
5054 {
5055 struct dp_netdev_port *port;
5056 struct dp_netdev_pmd_thread *pmd;
5057 struct dp_netdev_rxq **rxqs = NULL;
5058 struct rr_numa *numa = NULL;
5059 struct rr_numa_list rr;
5060 int n_rxqs = 0;
5061 bool ret = false;
5062 uint64_t *pmd_usage;
5063
5064 if (!predicted_variance) {
5065 return ret;
5066 }
5067
5068 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5069
5070 HMAP_FOR_EACH (port, node, &dp->ports) {
5071 if (!netdev_is_pmd(port->netdev)) {
5072 continue;
5073 }
5074
5075 for (int qid = 0; qid < port->n_rxq; qid++) {
5076 struct dp_netdev_rxq *q = &port->rxqs[qid];
5077 uint64_t cycle_hist = 0;
5078
5079 if (q->pmd->isolated) {
5080 continue;
5081 }
5082
5083 if (n_rxqs == 0) {
5084 rxqs = xmalloc(sizeof *rxqs);
5085 } else {
5086 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5087 }
5088
5089 /* Sum the queue intervals and store the cycle history. */
5090 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5091 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5092 }
5093 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5094 cycle_hist);
5095 /* Store the queue. */
5096 rxqs[n_rxqs++] = q;
5097 }
5098 }
5099 if (n_rxqs > 1) {
5100 /* Sort the queues in order of the processing cycles
5101 * they consumed during their last pmd interval. */
5102 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5103 }
5104 rr_numa_list_populate(dp, &rr);
5105
5106 for (int i = 0; i < n_rxqs; i++) {
5107 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5108 numa = rr_numa_list_lookup(&rr, numa_id);
5109 if (!numa) {
5110 /* Abort if cross NUMA polling. */
5111 VLOG_DBG("PMD auto lb dry run."
5112 " Aborting due to cross-numa polling.");
5113 goto cleanup;
5114 }
5115
5116 pmd = rr_numa_get_pmd(numa, true);
5117 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5118 "to be assigned port \'%s\' rx queue %d "
5119 "(measured processing cycles %"PRIu64").",
5120 pmd->core_id, numa_id,
5121 netdev_rxq_get_name(rxqs[i]->rx),
5122 netdev_rxq_get_queue_id(rxqs[i]->rx),
5123 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5124
5125 for (int id = 0; id < num_pmds; id++) {
5126 if (pmd->core_id == core_list[id]) {
5127 /* Add the processing cycles of rxq to pmd polling it. */
5128 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5129 RXQ_CYCLES_PROC_HIST);
5130 }
5131 }
5132 }
5133
5134 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5135 uint64_t total_cycles = 0;
5136
5137 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5138 continue;
5139 }
5140
5141 /* Get the total pmd cycles for an interval. */
5142 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5143 /* Estimate the cycles to cover all intervals. */
5144 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5145 for (int id = 0; id < num_pmds; id++) {
5146 if (pmd->core_id == core_list[id]) {
5147 if (pmd_usage[id]) {
5148 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5149 }
5150 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5151 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5152 }
5153 }
5154 }
5155 *predicted_variance = variance(pmd_usage, num_pmds);
5156 ret = true;
5157
5158 cleanup:
5159 rr_numa_list_destroy(&rr);
5160 free(rxqs);
5161 free(pmd_usage);
5162 return ret;
5163 }
5164
5165 /* Does a dry run of Rxq assignment to PMDs and returns true if it gives a
5166  * better distribution of load across the PMDs. */
5167 static bool
5168 pmd_rebalance_dry_run(struct dp_netdev *dp)
5169 OVS_REQUIRES(dp->port_mutex)
5170 {
5171 struct dp_netdev_pmd_thread *pmd;
5172 uint64_t *curr_pmd_usage;
5173
5174 uint64_t curr_variance;
5175 uint64_t new_variance;
5176 uint64_t improvement = 0;
5177 uint32_t num_pmds;
5178 uint32_t *pmd_corelist;
5179 struct rxq_poll *poll;
5180 bool ret;
5181
5182 num_pmds = cmap_count(&dp->poll_threads);
5183
5184 if (num_pmds > 1) {
5185 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5186 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5187 } else {
5188 return false;
5189 }
5190
5191 num_pmds = 0;
5192 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5193 uint64_t total_cycles = 0;
5194 uint64_t total_proc = 0;
5195
5196 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5197 continue;
5198 }
5199
5200 /* Get the total pmd cycles for an interval. */
5201 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5202 /* Estimate the cycles to cover all intervals. */
5203 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5204
5205 ovs_mutex_lock(&pmd->port_mutex);
5206 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5207 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5208 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5209 }
5210 }
5211 ovs_mutex_unlock(&pmd->port_mutex);
5212
5213 if (total_proc) {
5214 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5215 }
5216
5217 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5218 pmd->core_id, curr_pmd_usage[num_pmds]);
5219
5220 if (atomic_count_get(&pmd->pmd_overloaded)) {
5221 atomic_count_set(&pmd->pmd_overloaded, 0);
5222 }
5223
5224 pmd_corelist[num_pmds] = pmd->core_id;
5225 num_pmds++;
5226 }
5227
5228 curr_variance = variance(curr_pmd_usage, num_pmds);
5229 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5230
5231 if (ret) {
5232 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5233 " Predicted PMD variance: %"PRIu64"",
5234 curr_variance, new_variance);
5235
5236 if (new_variance < curr_variance) {
5237 improvement =
5238 ((curr_variance - new_variance) * 100) / curr_variance;
5239 }
5240 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5241 ret = false;
5242 }
5243 }
5244
5245 free(curr_pmd_usage);
5246 free(pmd_corelist);
5247 return ret;
5248 }
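/* Worked example of the acceptance check above: with a current variance of
 * 400 and a predicted variance of 250,
 *
 *     improvement = ((400 - 250) * 100) / 400 = 37
 *
 * which meets ALB_ACCEPTABLE_IMPROVEMENT (25), so the dry run reports that a
 * rebalance is worthwhile; a predicted variance of 350 would only give 12 and
 * be rejected. */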
5249
5250
5251 /* Returns true if the datapath flows need to be revalidated. */
5252 static bool
5253 dpif_netdev_run(struct dpif *dpif)
5254 {
5255 struct dp_netdev_port *port;
5256 struct dp_netdev *dp = get_dp_netdev(dpif);
5257 struct dp_netdev_pmd_thread *non_pmd;
5258 uint64_t new_tnl_seq;
5259 bool need_to_flush = true;
5260 bool pmd_rebalance = false;
5261 long long int now = time_msec();
5262 struct dp_netdev_pmd_thread *pmd;
5263
5264 ovs_mutex_lock(&dp->port_mutex);
5265 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5266 if (non_pmd) {
5267 ovs_mutex_lock(&dp->non_pmd_mutex);
5268 HMAP_FOR_EACH (port, node, &dp->ports) {
5269 if (!netdev_is_pmd(port->netdev)) {
5270 int i;
5271
5272 if (port->emc_enabled) {
5273 atomic_read_relaxed(&dp->emc_insert_min,
5274 &non_pmd->ctx.emc_insert_min);
5275 } else {
5276 non_pmd->ctx.emc_insert_min = 0;
5277 }
5278
5279 for (i = 0; i < port->n_rxq; i++) {
5280
5281 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5282 continue;
5283 }
5284
5285 if (dp_netdev_process_rxq_port(non_pmd,
5286 &port->rxqs[i],
5287 port->port_no)) {
5288 need_to_flush = false;
5289 }
5290 }
5291 }
5292 }
5293 if (need_to_flush) {
5294 /* We didn't receive anything in the process loop.
5295 * Check if we need to send something.
5296              * There were no time updates on the current iteration. */
5297 pmd_thread_ctx_time_update(non_pmd);
5298 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5299 }
5300
5301 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5302 ovs_mutex_unlock(&dp->non_pmd_mutex);
5303
5304 dp_netdev_pmd_unref(non_pmd);
5305 }
5306
5307 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5308 if (pmd_alb->is_enabled) {
5309 if (!pmd_alb->rebalance_poll_timer) {
5310 pmd_alb->rebalance_poll_timer = now;
5311 } else if ((pmd_alb->rebalance_poll_timer +
5312 pmd_alb->rebalance_intvl) < now) {
5313 pmd_alb->rebalance_poll_timer = now;
5314 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5315 if (atomic_count_get(&pmd->pmd_overloaded) >=
5316 PMD_RXQ_INTERVAL_MAX) {
5317 pmd_rebalance = true;
5318 break;
5319 }
5320 }
5321
5322 if (pmd_rebalance &&
5323 !dp_netdev_is_reconf_required(dp) &&
5324 !ports_require_restart(dp) &&
5325 pmd_rebalance_dry_run(dp)) {
5326 VLOG_INFO("PMD auto lb dry run."
5327 " requesting datapath reconfigure.");
5328 dp_netdev_request_reconfigure(dp);
5329 }
5330 }
5331 }
5332
5333 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5334 reconfigure_datapath(dp);
5335 }
5336 ovs_mutex_unlock(&dp->port_mutex);
5337
5338 tnl_neigh_cache_run();
5339 tnl_port_map_run();
5340 new_tnl_seq = seq_read(tnl_conf_seq);
5341
5342 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5343 dp->last_tnl_conf_seq = new_tnl_seq;
5344 return true;
5345 }
5346 return false;
5347 }
5348
5349 static void
5350 dpif_netdev_wait(struct dpif *dpif)
5351 {
5352 struct dp_netdev_port *port;
5353 struct dp_netdev *dp = get_dp_netdev(dpif);
5354
5355 ovs_mutex_lock(&dp_netdev_mutex);
5356 ovs_mutex_lock(&dp->port_mutex);
5357 HMAP_FOR_EACH (port, node, &dp->ports) {
5358 netdev_wait_reconf_required(port->netdev);
5359 if (!netdev_is_pmd(port->netdev)) {
5360 int i;
5361
5362 for (i = 0; i < port->n_rxq; i++) {
5363 netdev_rxq_wait(port->rxqs[i].rx);
5364 }
5365 }
5366 }
5367 ovs_mutex_unlock(&dp->port_mutex);
5368 ovs_mutex_unlock(&dp_netdev_mutex);
5369 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5370 }
5371
5372 static void
5373 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5374 {
5375 struct tx_port *tx_port_cached;
5376
5377 /* Flush all the queued packets. */
5378 dp_netdev_pmd_flush_output_packets(pmd, true);
5379 /* Free all used tx queue ids. */
5380 dpif_netdev_xps_revalidate_pmd(pmd, true);
5381
5382 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5383 free(tx_port_cached);
5384 }
5385 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5386 free(tx_port_cached);
5387 }
5388 }
5389
5390 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5391 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5392 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5393 * one txq. */
5394 static void
5395 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5396 OVS_REQUIRES(pmd->port_mutex)
5397 {
5398 struct tx_port *tx_port, *tx_port_cached;
5399
5400 pmd_free_cached_ports(pmd);
5401 hmap_shrink(&pmd->send_port_cache);
5402 hmap_shrink(&pmd->tnl_port_cache);
5403
5404 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5405 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5406 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5407 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5408 hash_port_no(tx_port_cached->port->port_no));
5409 }
5410
5411 if (netdev_n_txq(tx_port->port->netdev)) {
5412 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5413 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5414 hash_port_no(tx_port_cached->port->port_no));
5415 }
5416 }
5417 }
5418
5419 static void
5420 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5421 {
5422 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5423 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5424 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5425 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5426 }
5427 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5428
5429 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5430 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5431 }
5432
5433 static void
5434 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5435 {
5436 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5437 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5438 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5439 }
5440
5441 static int
5442 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5443 struct polled_queue **ppoll_list)
5444 {
5445 struct polled_queue *poll_list = *ppoll_list;
5446 struct rxq_poll *poll;
5447 int i;
5448
5449 ovs_mutex_lock(&pmd->port_mutex);
5450 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5451 * sizeof *poll_list);
5452
5453 i = 0;
5454 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5455 poll_list[i].rxq = poll->rxq;
5456 poll_list[i].port_no = poll->rxq->port->port_no;
5457 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5458 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5459 poll_list[i].change_seq =
5460 netdev_get_change_seq(poll->rxq->port->netdev);
5461 i++;
5462 }
5463
5464 pmd_load_cached_ports(pmd);
5465
5466 ovs_mutex_unlock(&pmd->port_mutex);
5467
5468 *ppoll_list = poll_list;
5469 return i;
5470 }
5471
5472 static void *
5473 pmd_thread_main(void *f_)
5474 {
5475 struct dp_netdev_pmd_thread *pmd = f_;
5476 struct pmd_perf_stats *s = &pmd->perf_stats;
5477 unsigned int lc = 0;
5478 struct polled_queue *poll_list;
5479 bool wait_for_reload = false;
5480 bool reload_tx_qid;
5481 bool exiting;
5482 bool reload;
5483 int poll_cnt;
5484 int i;
5485 int process_packets = 0;
5486
5487 poll_list = NULL;
5488
5489 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5490 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5491 ovs_numa_thread_setaffinity_core(pmd->core_id);
5492 dpdk_set_lcore_id(pmd->core_id);
5493 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5494 dfc_cache_init(&pmd->flow_cache);
5495 pmd_alloc_static_tx_qid(pmd);
5496
5497 reload:
5498 atomic_count_init(&pmd->pmd_overloaded, 0);
5499
5500 /* List port/core affinity */
5501 for (i = 0; i < poll_cnt; i++) {
5502 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5503 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5504 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5505 /* Reset the rxq current cycles counter. */
5506 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5507 }
5508
5509 if (!poll_cnt) {
5510 if (wait_for_reload) {
5511 /* Don't sleep, control thread will ask for a reload shortly. */
5512 do {
5513 atomic_read_explicit(&pmd->reload, &reload,
5514 memory_order_acquire);
5515 } while (!reload);
5516 } else {
5517 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5518 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5519 poll_block();
5520 }
5521 }
5522 }
5523
5524 pmd->intrvl_tsc_prev = 0;
5525 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5526 cycles_counter_update(s);
5527 /* Protect pmd stats from external clearing while polling. */
5528 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5529 for (;;) {
5530 uint64_t rx_packets = 0, tx_packets = 0;
5531
5532 pmd_perf_start_iteration(s);
5533
5534 for (i = 0; i < poll_cnt; i++) {
5535
5536 if (!poll_list[i].rxq_enabled) {
5537 continue;
5538 }
5539
5540 if (poll_list[i].emc_enabled) {
5541 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5542 &pmd->ctx.emc_insert_min);
5543 } else {
5544 pmd->ctx.emc_insert_min = 0;
5545 }
5546
5547 process_packets =
5548 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5549 poll_list[i].port_no);
5550 rx_packets += process_packets;
5551 }
5552
5553 if (!rx_packets) {
5554 /* We didn't receive anything in the process loop.
5555 * Check if we need to send something.
5556 * There were no time updates in the current iteration. */
5557 pmd_thread_ctx_time_update(pmd);
5558 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5559 }
5560
5561 if (lc++ > 1024) {
5562 lc = 0;
5563
5564 coverage_try_clear();
5565 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
5566 if (!ovsrcu_try_quiesce()) {
5567 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
5568 }
5569
5570 for (i = 0; i < poll_cnt; i++) {
5571 uint64_t current_seq =
5572 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5573 if (poll_list[i].change_seq != current_seq) {
5574 poll_list[i].change_seq = current_seq;
5575 poll_list[i].rxq_enabled =
5576 netdev_rxq_enabled(poll_list[i].rxq->rx);
5577 }
5578 }
5579 }
5580
5581 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5582 if (OVS_UNLIKELY(reload)) {
5583 break;
5584 }
5585
5586 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5587 pmd_perf_metrics_enabled(pmd));
5588 }
5589 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
5590
5591 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5592 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
5593 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
5594 atomic_read_relaxed(&pmd->exit, &exiting);
5595 /* Signal here to make sure the pmd finishes
5596 * reloading the updated configuration. */
5597 dp_netdev_pmd_reload_done(pmd);
5598
5599 if (reload_tx_qid) {
5600 pmd_free_static_tx_qid(pmd);
5601 pmd_alloc_static_tx_qid(pmd);
5602 }
5603
5604 if (!exiting) {
5605 goto reload;
5606 }
5607
5608 pmd_free_static_tx_qid(pmd);
5609 dfc_cache_uninit(&pmd->flow_cache);
5610 free(poll_list);
5611 pmd_free_cached_ports(pmd);
5612 return NULL;
5613 }
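/* Editorial sketch (not upstream text) of the reload protocol driving the
 * loop above.  The control path (dp_netdev_reload_pmd__(), defined elsewhere
 * in this file) requests a reload roughly as follows:
 *
 *     atomic_store_relaxed(&pmd->exit, true);       // only when tearing down
 *     seq_change(pmd->reload_seq);                  // wake a PMD sleeping in
 *                                                   // seq_wait()/poll_block()
 *     atomic_store_explicit(&pmd->reload, true,     // pairs with the acquire
 *                           memory_order_release);  // read in the loop above
 *
 * The PMD notices 'reload' with acquire semantics, leaves the poll loop,
 * re-reads its queues and ports, and acknowledges through
 * dp_netdev_pmd_reload_done(), which clears 'reload' with release
 * semantics. */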
5614
5615 static void
5616 dp_netdev_disable_upcall(struct dp_netdev *dp)
5617 OVS_ACQUIRES(dp->upcall_rwlock)
5618 {
5619 fat_rwlock_wrlock(&dp->upcall_rwlock);
5620 }
5621
5622 \f
5623 /* Meters */
5624 static void
5625 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5626 struct ofputil_meter_features *features)
5627 {
5628 features->max_meters = MAX_METERS;
5629 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5630 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5631 features->max_bands = MAX_BANDS;
5632 features->max_color = 0;
5633 }
5634
5635 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5636 * that exceed a band are dropped in-place. */
5637 static void
5638 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5639 uint32_t meter_id, long long int now)
5640 {
5641 struct dp_meter *meter;
5642 struct dp_meter_band *band;
5643 struct dp_packet *packet;
5644 long long int long_delta_t; /* msec */
5645 uint32_t delta_t; /* msec */
5646 const size_t cnt = dp_packet_batch_size(packets_);
5647 uint32_t bytes, volume;
5648 int exceeded_band[NETDEV_MAX_BURST];
5649 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5650 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5651
5652 if (meter_id >= MAX_METERS) {
5653 return;
5654 }
5655
5656 meter_lock(dp, meter_id);
5657 meter = dp->meters[meter_id];
5658 if (!meter) {
5659 goto out;
5660 }
5661
5662 /* Initialize as negative values. */
5663 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5664 /* Initialize as zeroes. */
5665 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5666
5667 /* All packets will hit the meter at the same time. */
5668 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
5669
5670 /* Make sure delta_t will not be too large, so that bucket will not
5671 * wrap around below. */
5672 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5673 ? meter->max_delta_t : (uint32_t)long_delta_t;
5674
5675 /* Update meter stats. */
5676 meter->used = now;
5677 meter->packet_count += cnt;
5678 bytes = 0;
5679 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5680 bytes += dp_packet_size(packet);
5681 }
5682 meter->byte_count += bytes;
5683
5684 /* Meters can operate in terms of packets per second or kilobits per
5685 * second. */
5686 if (meter->flags & OFPMF13_PKTPS) {
5687 /* Rate in packets/second, bucket 1/1000 packets. */
5688 /* msec * packets/sec = 1/1000 packets. */
5689 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5690 } else {
5691 /* Rate in kbps, bucket in bits. */
5692 /* msec * kbps = bits */
5693 volume = bytes * 8;
5694 }
5695
5696 /* Update all bands and find the one hit with the highest rate for each
5697 * packet (if any). */
5698 for (int m = 0; m < meter->n_bands; ++m) {
5699 band = &meter->bands[m];
5700
5701 /* Update band's bucket. */
5702 band->bucket += delta_t * band->up.rate;
5703 if (band->bucket > band->up.burst_size) {
5704 band->bucket = band->up.burst_size;
5705 }
5706
5707 /* Drain the bucket for all the packets, if possible. */
5708 if (band->bucket >= volume) {
5709 band->bucket -= volume;
5710 } else {
5711 int band_exceeded_pkt;
5712
5713 /* Band limit hit, must process packet-by-packet. */
5714 if (meter->flags & OFPMF13_PKTPS) {
5715 band_exceeded_pkt = band->bucket / 1000;
5716 band->bucket %= 1000; /* Remainder stays in bucket. */
5717
5718 /* Update the exceeding band for each exceeding packet.
5719 * (Only one band will be fired by a packet, and that
5720 * can be different for each packet.) */
5721 for (int i = band_exceeded_pkt; i < cnt; i++) {
5722 if (band->up.rate > exceeded_rate[i]) {
5723 exceeded_rate[i] = band->up.rate;
5724 exceeded_band[i] = m;
5725 }
5726 }
5727 } else {
5728 /* Packet sizes differ, must process one-by-one. */
5729 band_exceeded_pkt = cnt;
5730 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5731 uint32_t bits = dp_packet_size(packet) * 8;
5732
5733 if (band->bucket >= bits) {
5734 band->bucket -= bits;
5735 } else {
5736 if (i < band_exceeded_pkt) {
5737 band_exceeded_pkt = i;
5738 }
5739 /* Update the exceeding band for the exceeding packet.
5740 * (Only one band will be fired by a packet, and that
5741 * can be different for each packet.) */
5742 if (band->up.rate > exceeded_rate[i]) {
5743 exceeded_rate[i] = band->up.rate;
5744 exceeded_band[i] = m;
5745 }
5746 }
5747 }
5748 }
5749 /* Remember the first exceeding packet. */
5750 if (exceeded_pkt > band_exceeded_pkt) {
5751 exceeded_pkt = band_exceeded_pkt;
5752 }
5753 }
5754 }
5755
5756 /* Fire the highest rate band exceeded by each packet, and drop
5757 * packets if needed. */
5758 size_t j;
5759 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
5760 if (exceeded_band[j] >= 0) {
5761 /* Meter drop packet. */
5762 band = &meter->bands[exceeded_band[j]];
5763 band->packet_count += 1;
5764 band->byte_count += dp_packet_size(packet);
5765
5766 dp_packet_delete(packet);
5767 } else {
5768 /* Meter accepts packet. */
5769 dp_packet_batch_refill(packets_, packet, j);
5770 }
5771 }
5772 out:
5773 meter_unlock(dp, meter_id);
5774 }
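/* Worked example (editorial, made-up numbers) for the kbps path above:
 * with band->up.rate = 1000 (kbps) and a bucket currently holding
 * 20,000 bits, a delta_t of 10 ms refills it by 10 * 1000 = 10,000 bits
 * (capped at up.burst_size).  A batch of 4 packets of 1500 bytes needs
 * volume = 4 * 1500 * 8 = 48,000 bits, so the whole-batch drain fails and
 * the per-packet loop runs: each packet that does not fit in the remaining
 * bucket records this band in exceeded_band[] (unless a higher-rate band
 * already claimed it) and is deleted in the final refill loop. */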
5775
5776 /* Meter set/get/del processing is still single-threaded. */
5777 static int
5778 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
5779 struct ofputil_meter_config *config)
5780 {
5781 struct dp_netdev *dp = get_dp_netdev(dpif);
5782 uint32_t mid = meter_id.uint32;
5783 struct dp_meter *meter;
5784 int i;
5785
5786 if (mid >= MAX_METERS) {
5787 return EFBIG; /* Meter_id out of range. */
5788 }
5789
5790 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
5791 return EBADF; /* Unsupported flags set */
5792 }
5793
5794 if (config->n_bands > MAX_BANDS) {
5795 return EINVAL;
5796 }
5797
5798 for (i = 0; i < config->n_bands; ++i) {
5799 switch (config->bands[i].type) {
5800 case OFPMBT13_DROP:
5801 break;
5802 default:
5803 return ENODEV; /* Unsupported band type */
5804 }
5805 }
5806
5807 /* Allocate meter */
5808 meter = xzalloc(sizeof *meter
5809 + config->n_bands * sizeof(struct dp_meter_band));
5810
5811 meter->flags = config->flags;
5812 meter->n_bands = config->n_bands;
5813 meter->max_delta_t = 0;
5814 meter->used = time_usec();
5815
5816 /* Set up bands. */
5817 for (i = 0; i < config->n_bands; ++i) {
5818 uint32_t band_max_delta_t;
5819
5820 /* Set burst size to a workable value if none specified. */
5821 if (config->bands[i].burst_size == 0) {
5822 config->bands[i].burst_size = config->bands[i].rate;
5823 }
5824
5825 meter->bands[i].up = config->bands[i];
5826 /* Convert burst size to the bucket units: */
5827 /* pkts => 1/1000 packets, kilobits => bits. */
5828 meter->bands[i].up.burst_size *= 1000;
5829 /* Initialize bucket to empty. */
5830 meter->bands[i].bucket = 0;
5831
5832 /* Figure out max delta_t that is enough to fill any bucket. */
5833 band_max_delta_t
5834 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
5835 if (band_max_delta_t > meter->max_delta_t) {
5836 meter->max_delta_t = band_max_delta_t;
5837 }
5838 }
5839
5840 meter_lock(dp, mid);
5841 dp_delete_meter(dp, mid); /* Free existing meter, if any */
5842 dp->meters[mid] = meter;
5843 meter_unlock(dp, mid);
5844
5845 return 0;
5846 }
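/* Editorial example of the unit conversion above, using made-up values:
 * a kbps meter band configured with rate = 500 (kbps) and burst_size =
 * 2000 (kilobits) stores up.burst_size = 2000 * 1000 = 2,000,000 bits.
 * Since the bucket gains 'rate' bits per millisecond, filling it from
 * empty takes 2,000,000 / 500 = 4000 ms, which is this band's
 * contribution to 'max_delta_t'.  A pktps band does the same arithmetic
 * in 1/1000-packet units. */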
5847
5848 static int
5849 dpif_netdev_meter_get(const struct dpif *dpif,
5850 ofproto_meter_id meter_id_,
5851 struct ofputil_meter_stats *stats, uint16_t n_bands)
5852 {
5853 const struct dp_netdev *dp = get_dp_netdev(dpif);
5854 uint32_t meter_id = meter_id_.uint32;
5855 int retval = 0;
5856
5857 if (meter_id >= MAX_METERS) {
5858 return EFBIG;
5859 }
5860
5861 meter_lock(dp, meter_id);
5862 const struct dp_meter *meter = dp->meters[meter_id];
5863 if (!meter) {
5864 retval = ENOENT;
5865 goto done;
5866 }
5867 if (stats) {
5868 int i = 0;
5869
5870 stats->packet_in_count = meter->packet_count;
5871 stats->byte_in_count = meter->byte_count;
5872
5873 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
5874 stats->bands[i].packet_count = meter->bands[i].packet_count;
5875 stats->bands[i].byte_count = meter->bands[i].byte_count;
5876 }
5877
5878 stats->n_bands = i;
5879 }
5880
5881 done:
5882 meter_unlock(dp, meter_id);
5883 return retval;
5884 }
5885
5886 static int
5887 dpif_netdev_meter_del(struct dpif *dpif,
5888 ofproto_meter_id meter_id_,
5889 struct ofputil_meter_stats *stats, uint16_t n_bands)
5890 {
5891 struct dp_netdev *dp = get_dp_netdev(dpif);
5892 int error;
5893
5894 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
5895 if (!error) {
5896 uint32_t meter_id = meter_id_.uint32;
5897
5898 meter_lock(dp, meter_id);
5899 dp_delete_meter(dp, meter_id);
5900 meter_unlock(dp, meter_id);
5901 }
5902 return error;
5903 }
5904
5905 \f
5906 static void
5907 dpif_netdev_disable_upcall(struct dpif *dpif)
5908 OVS_NO_THREAD_SAFETY_ANALYSIS
5909 {
5910 struct dp_netdev *dp = get_dp_netdev(dpif);
5911 dp_netdev_disable_upcall(dp);
5912 }
5913
5914 static void
5915 dp_netdev_enable_upcall(struct dp_netdev *dp)
5916 OVS_RELEASES(dp->upcall_rwlock)
5917 {
5918 fat_rwlock_unlock(&dp->upcall_rwlock);
5919 }
5920
5921 static void
5922 dpif_netdev_enable_upcall(struct dpif *dpif)
5923 OVS_NO_THREAD_SAFETY_ANALYSIS
5924 {
5925 struct dp_netdev *dp = get_dp_netdev(dpif);
5926 dp_netdev_enable_upcall(dp);
5927 }
5928
5929 static void
5930 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
5931 {
5932 atomic_store_relaxed(&pmd->wait_for_reload, false);
5933 atomic_store_relaxed(&pmd->reload_tx_qid, false);
5934 pmd->last_reload_seq = seq_read(pmd->reload_seq);
5935 atomic_store_explicit(&pmd->reload, false, memory_order_release);
5936 }
5937
5938 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
5939 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
5940 * 'core_id' is NON_PMD_CORE_ID).
5941 *
5942 * Caller must unref the returned reference. */
5943 static struct dp_netdev_pmd_thread *
5944 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
5945 {
5946 struct dp_netdev_pmd_thread *pmd;
5947 const struct cmap_node *pnode;
5948
5949 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
5950 if (!pnode) {
5951 return NULL;
5952 }
5953 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
5954
5955 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
5956 }
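/* Editorial usage sketch for the ref-counted lookup above (only functions
 * defined in this file are used):
 *
 *     struct dp_netdev_pmd_thread *pmd = dp_netdev_get_pmd(dp, core_id);
 *
 *     if (pmd) {
 *         ...use 'pmd'; the RCU-protected refcount keeps it alive...
 *         dp_netdev_pmd_unref(pmd);   // drop the reference taken above
 *     }
 */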
5957
5958 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
5959 static void
5960 dp_netdev_set_nonpmd(struct dp_netdev *dp)
5961 OVS_REQUIRES(dp->port_mutex)
5962 {
5963 struct dp_netdev_pmd_thread *non_pmd;
5964
5965 non_pmd = xzalloc(sizeof *non_pmd);
5966 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
5967 }
5968
5969 /* Caller must have valid pointer to 'pmd'. */
5970 static bool
5971 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
5972 {
5973 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
5974 }
5975
5976 static void
5977 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
5978 {
5979 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
5980 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
5981 }
5982 }
5983
5984 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
5985 * fails, keeps checking for the next node until reaching the end of the cmap.
5986 *
5987 * Caller must unref the returned reference. */
5988 static struct dp_netdev_pmd_thread *
5989 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
5990 {
5991 struct dp_netdev_pmd_thread *next;
5992
5993 do {
5994 struct cmap_node *node;
5995
5996 node = cmap_next_position(&dp->poll_threads, pos);
5997 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
5998 : NULL;
5999 } while (next && !dp_netdev_pmd_try_ref(next));
6000
6001 return next;
6002 }
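/* Editorial sketch of walking all PMD threads with the helper above.
 * Zero-initializing the cmap position starts the iteration from the
 * beginning:
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ...inspect 'pmd'...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */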
6003
6004 /* Configures the 'pmd' based on the input argument. */
6005 static void
6006 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6007 unsigned core_id, int numa_id)
6008 {
6009 pmd->dp = dp;
6010 pmd->core_id = core_id;
6011 pmd->numa_id = numa_id;
6012 pmd->need_reload = false;
6013 pmd->n_output_batches = 0;
6014
6015 ovs_refcount_init(&pmd->ref_cnt);
6016 atomic_init(&pmd->exit, false);
6017 pmd->reload_seq = seq_create();
6018 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6019 atomic_init(&pmd->reload, false);
6020 ovs_mutex_init(&pmd->flow_mutex);
6021 ovs_mutex_init(&pmd->port_mutex);
6022 cmap_init(&pmd->flow_table);
6023 cmap_init(&pmd->classifiers);
6024 pmd->ctx.last_rxq = NULL;
6025 pmd_thread_ctx_time_update(pmd);
6026 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6027 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6028 hmap_init(&pmd->poll_list);
6029 hmap_init(&pmd->tx_ports);
6030 hmap_init(&pmd->tnl_port_cache);
6031 hmap_init(&pmd->send_port_cache);
6032 /* Initialize the 'flow_cache' since there is no
6033 * actual thread created for NON_PMD_CORE_ID. */
6034 if (core_id == NON_PMD_CORE_ID) {
6035 dfc_cache_init(&pmd->flow_cache);
6036 pmd_alloc_static_tx_qid(pmd);
6037 }
6038 pmd_perf_stats_init(&pmd->perf_stats);
6039 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6040 hash_int(core_id, 0));
6041 }
6042
6043 static void
6044 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6045 {
6046 struct dpcls *cls;
6047
6048 dp_netdev_pmd_flow_flush(pmd);
6049 hmap_destroy(&pmd->send_port_cache);
6050 hmap_destroy(&pmd->tnl_port_cache);
6051 hmap_destroy(&pmd->tx_ports);
6052 hmap_destroy(&pmd->poll_list);
6053 /* All flows (including their dpcls_rules) have been deleted already */
6054 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6055 dpcls_destroy(cls);
6056 ovsrcu_postpone(free, cls);
6057 }
6058 cmap_destroy(&pmd->classifiers);
6059 cmap_destroy(&pmd->flow_table);
6060 ovs_mutex_destroy(&pmd->flow_mutex);
6061 seq_destroy(pmd->reload_seq);
6062 ovs_mutex_destroy(&pmd->port_mutex);
6063 free(pmd);
6064 }
6065
6066 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6067 * and unrefs the struct. */
6068 static void
6069 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6070 {
6071 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6072 * but extra cleanup is necessary */
6073 if (pmd->core_id == NON_PMD_CORE_ID) {
6074 ovs_mutex_lock(&dp->non_pmd_mutex);
6075 dfc_cache_uninit(&pmd->flow_cache);
6076 pmd_free_cached_ports(pmd);
6077 pmd_free_static_tx_qid(pmd);
6078 ovs_mutex_unlock(&dp->non_pmd_mutex);
6079 } else {
6080 atomic_store_relaxed(&pmd->exit, true);
6081 dp_netdev_reload_pmd__(pmd);
6082 xpthread_join(pmd->thread, NULL);
6083 }
6084
6085 dp_netdev_pmd_clear_ports(pmd);
6086
6087 /* Purges the 'pmd''s flows after stopping the thread, but before
6088 * destroying the flows, so that the flow stats can be collected. */
6089 if (dp->dp_purge_cb) {
6090 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6091 }
6092 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6093 dp_netdev_pmd_unref(pmd);
6094 }
6095
6096 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6097 * thread. */
6098 static void
6099 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6100 {
6101 struct dp_netdev_pmd_thread *pmd;
6102 struct dp_netdev_pmd_thread **pmd_list;
6103 size_t k = 0, n_pmds;
6104
6105 n_pmds = cmap_count(&dp->poll_threads);
6106 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6107
6108 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6109 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6110 continue;
6111 }
6112 /* We cannot call dp_netdev_del_pmd(), since it alters
6113 * 'dp->poll_threads' (while we're iterating it) and it
6114 * might quiesce. */
6115 ovs_assert(k < n_pmds);
6116 pmd_list[k++] = pmd;
6117 }
6118
6119 for (size_t i = 0; i < k; i++) {
6120 dp_netdev_del_pmd(dp, pmd_list[i]);
6121 }
6122 free(pmd_list);
6123 }
6124
6125 /* Deletes all rx queues from pmd->poll_list and all the ports from
6126 * pmd->tx_ports. */
6127 static void
6128 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6129 {
6130 struct rxq_poll *poll;
6131 struct tx_port *port;
6132
6133 ovs_mutex_lock(&pmd->port_mutex);
6134 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6135 free(poll);
6136 }
6137 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6138 free(port);
6139 }
6140 ovs_mutex_unlock(&pmd->port_mutex);
6141 }
6142
6143 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6144 static void
6145 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6146 struct dp_netdev_rxq *rxq)
6147 OVS_REQUIRES(pmd->port_mutex)
6148 {
6149 int qid = netdev_rxq_get_queue_id(rxq->rx);
6150 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6151 struct rxq_poll *poll;
6152
6153 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6154 if (poll->rxq == rxq) {
6155 /* 'rxq' is already polled by this thread. Do nothing. */
6156 return;
6157 }
6158 }
6159
6160 poll = xmalloc(sizeof *poll);
6161 poll->rxq = rxq;
6162 hmap_insert(&pmd->poll_list, &poll->node, hash);
6163
6164 pmd->need_reload = true;
6165 }
6166
6167 /* Delete 'poll' from poll_list of PMD thread. */
6168 static void
6169 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6170 struct rxq_poll *poll)
6171 OVS_REQUIRES(pmd->port_mutex)
6172 {
6173 hmap_remove(&pmd->poll_list, &poll->node);
6174 free(poll);
6175
6176 pmd->need_reload = true;
6177 }
6178
6179 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6180 * changes to take effect. */
6181 static void
6182 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6183 struct dp_netdev_port *port)
6184 OVS_REQUIRES(pmd->port_mutex)
6185 {
6186 struct tx_port *tx;
6187
6188 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6189 if (tx) {
6190 /* 'port' is already on this thread tx cache. Do nothing. */
6191 return;
6192 }
6193
6194 tx = xzalloc(sizeof *tx);
6195
6196 tx->port = port;
6197 tx->qid = -1;
6198 tx->flush_time = 0LL;
6199 dp_packet_batch_init(&tx->output_pkts);
6200
6201 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6202 pmd->need_reload = true;
6203 }
6204
6205 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6206 * changes to take effect. */
6207 static void
6208 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6209 struct tx_port *tx)
6210 OVS_REQUIRES(pmd->port_mutex)
6211 {
6212 hmap_remove(&pmd->tx_ports, &tx->node);
6213 free(tx);
6214 pmd->need_reload = true;
6215 }
6216 \f
6217 static char *
6218 dpif_netdev_get_datapath_version(void)
6219 {
6220 return xstrdup("<built-in>");
6221 }
6222
6223 static void
6224 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6225 uint16_t tcp_flags, long long now)
6226 {
6227 uint16_t flags;
6228
6229 atomic_store_relaxed(&netdev_flow->stats.used, now);
6230 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6231 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6232 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6233 flags |= tcp_flags;
6234 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6235 }
6236
6237 static int
6238 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6239 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6240 enum dpif_upcall_type type, const struct nlattr *userdata,
6241 struct ofpbuf *actions, struct ofpbuf *put_actions)
6242 {
6243 struct dp_netdev *dp = pmd->dp;
6244
6245 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6246 return ENODEV;
6247 }
6248
6249 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6250 struct ds ds = DS_EMPTY_INITIALIZER;
6251 char *packet_str;
6252 struct ofpbuf key;
6253 struct odp_flow_key_parms odp_parms = {
6254 .flow = flow,
6255 .mask = wc ? &wc->masks : NULL,
6256 .support = dp_netdev_support,
6257 };
6258
6259 ofpbuf_init(&key, 0);
6260 odp_flow_key_from_flow(&odp_parms, &key);
6261 packet_str = ofp_dp_packet_to_string(packet_);
6262
6263 odp_flow_key_format(key.data, key.size, &ds);
6264
6265 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6266 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6267
6268 ofpbuf_uninit(&key);
6269 free(packet_str);
6270
6271 ds_destroy(&ds);
6272 }
6273
6274 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6275 actions, wc, put_actions, dp->upcall_aux);
6276 }
6277
6278 static inline uint32_t
6279 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6280 const struct miniflow *mf)
6281 {
6282 uint32_t hash;
6283
6284 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6285 hash = dp_packet_get_rss_hash(packet);
6286 } else {
6287 hash = miniflow_hash_5tuple(mf, 0);
6288 dp_packet_set_rss_hash(packet, hash);
6289 }
6290
6291 return hash;
6292 }
6293
6294 static inline uint32_t
6295 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6296 const struct miniflow *mf)
6297 {
6298 uint32_t hash, recirc_depth;
6299
6300 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6301 hash = dp_packet_get_rss_hash(packet);
6302 } else {
6303 hash = miniflow_hash_5tuple(mf, 0);
6304 dp_packet_set_rss_hash(packet, hash);
6305 }
6306
6307 /* The RSS hash must account for the recirculation depth to avoid
6308 * collisions in the exact match cache. */
6309 recirc_depth = *recirc_depth_get_unsafe();
6310 if (OVS_UNLIKELY(recirc_depth)) {
6311 hash = hash_finish(hash, recirc_depth);
6312 dp_packet_set_rss_hash(packet, hash);
6313 }
6314 return hash;
6315 }
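/* Editorial note on the function above: without the extra mixing, a packet
 * recirculated with an unchanged 5-tuple would reuse the same hash at every
 * recirculation depth and alias itself in the EMC.  Because the stored RSS
 * hash is updated each time, successive passes effectively chain:
 *
 *     depth 1:  h1 = hash_finish(h0, 1);
 *     depth 2:  h2 = hash_finish(h1, 2);
 *
 * so each pass gets a distinct cache key while the original (depth 0) hash
 * is left untouched. */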
6316
6317 struct packet_batch_per_flow {
6318 unsigned int byte_count;
6319 uint16_t tcp_flags;
6320 struct dp_netdev_flow *flow;
6321
6322 struct dp_packet_batch array;
6323 };
6324
6325 static inline void
6326 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6327 struct dp_packet *packet,
6328 uint16_t tcp_flags)
6329 {
6330 batch->byte_count += dp_packet_size(packet);
6331 batch->tcp_flags |= tcp_flags;
6332 batch->array.packets[batch->array.count++] = packet;
6333 }
6334
6335 static inline void
6336 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6337 struct dp_netdev_flow *flow)
6338 {
6339 flow->batch = batch;
6340
6341 batch->flow = flow;
6342 dp_packet_batch_init(&batch->array);
6343 batch->byte_count = 0;
6344 batch->tcp_flags = 0;
6345 }
6346
6347 static inline void
6348 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6349 struct dp_netdev_pmd_thread *pmd)
6350 {
6351 struct dp_netdev_actions *actions;
6352 struct dp_netdev_flow *flow = batch->flow;
6353
6354 dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
6355 batch->tcp_flags, pmd->ctx.now / 1000);
6356
6357 actions = dp_netdev_flow_get_actions(flow);
6358
6359 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6360 actions->actions, actions->size);
6361 }
6362
6363 static inline void
6364 dp_netdev_queue_batches(struct dp_packet *pkt,
6365 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6366 struct packet_batch_per_flow *batches,
6367 size_t *n_batches)
6368 {
6369 struct packet_batch_per_flow *batch = flow->batch;
6370
6371 if (OVS_UNLIKELY(!batch)) {
6372 batch = &batches[(*n_batches)++];
6373 packet_batch_per_flow_init(batch, flow);
6374 }
6375
6376 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6377 }
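/* Editorial sketch of how the per-flow batches above fill up for packets
 * P0..P3 hitting flows A, B, A, A:
 *
 *     P0 -> A: A->batch == NULL, so batches[0] is initialized for A
 *     P1 -> B: batches[1] is initialized for B
 *     P2, P3 -> A: appended to batches[0]
 *
 * The actions for A then run once on a 3-packet batch instead of three
 * times on single packets.  The 'batch' pointers are reset to NULL before
 * execution; see dp_netdev_input__() below. */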
6378
6379 static inline void
6380 packet_enqueue_to_flow_map(struct dp_packet *packet,
6381 struct dp_netdev_flow *flow,
6382 uint16_t tcp_flags,
6383 struct dp_packet_flow_map *flow_map,
6384 size_t index)
6385 {
6386 struct dp_packet_flow_map *map = &flow_map[index];
6387 map->flow = flow;
6388 map->packet = packet;
6389 map->tcp_flags = tcp_flags;
6390 }
6391
6392 /* SMC lookup function for a batch of packets.
6393 * By batching the SMC lookups, we can use prefetching
6394 * to hide memory access latency.
6395 */
6396 static inline void
6397 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6398 struct netdev_flow_key *keys,
6399 struct netdev_flow_key **missed_keys,
6400 struct dp_packet_batch *packets_,
6401 const int cnt,
6402 struct dp_packet_flow_map *flow_map,
6403 uint8_t *index_map)
6404 {
6405 int i;
6406 struct dp_packet *packet;
6407 size_t n_smc_hit = 0, n_missed = 0;
6408 struct dfc_cache *cache = &pmd->flow_cache;
6409 struct smc_cache *smc_cache = &cache->smc_cache;
6410 const struct cmap_node *flow_node;
6411 int recv_idx;
6412 uint16_t tcp_flags;
6413
6414 /* Prefetch buckets for all packets */
6415 for (i = 0; i < cnt; i++) {
6416 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6417 }
6418
6419 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6420 struct dp_netdev_flow *flow = NULL;
6421 flow_node = smc_entry_get(pmd, keys[i].hash);
6422 bool hit = false;
6423 /* Get the original order of this packet in received batch. */
6424 recv_idx = index_map[i];
6425
6426 if (OVS_LIKELY(flow_node != NULL)) {
6427 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6428 /* Since we don't have a per-port megaflow to check the port
6429 * number, we need to verify that the input ports match. */
6430 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6431 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6432 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6433
6434 /* SMC hit and EMC miss, so insert into the EMC. */
6435 keys[i].len =
6436 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6437 emc_probabilistic_insert(pmd, &keys[i], flow);
6438 /* Add these packets into the flow map in the same order
6439 * as received.
6440 */
6441 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6442 flow_map, recv_idx);
6443 n_smc_hit++;
6444 hit = true;
6445 break;
6446 }
6447 }
6448 if (hit) {
6449 continue;
6450 }
6451 }
6452
6453 /* SMC missed. Group missed packets together at
6454 * the beginning of the 'packets' array. */
6455 dp_packet_batch_refill(packets_, packet, i);
6456
6457 /* Preserve the order of packet for flow batching. */
6458 index_map[n_missed] = recv_idx;
6459
6460 /* Put the missed keys into the pointer array returned to the caller. */
6461 missed_keys[n_missed++] = &keys[i];
6462 }
6463
6464 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6465 }
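/* Editorial note on the batching pattern above: the first loop only issues
 * OVS_PREFETCH() on the SMC bucket of every key, and the second loop does
 * the smc_entry_get() lookups, so by the time a bucket is dereferenced its
 * cache line is likely already in flight.  Hits are promoted into the EMC
 * and queued by their original receive index; misses are compacted to the
 * front of the batch for the dpcls lookup in fast_path_processing(). */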
6466
6467 /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
6468 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
6469 * miniflow is copied into 'keys' and the packet pointer is moved to the
6470 * beginning of the 'packets' array. The pointers of missed keys are put in the
6471 * missed_keys pointer array for future processing.
6472 *
6473 * The function returns the number of packets that need to be processed in the
6474 * 'packets' array (they have been moved to the beginning of the vector).
6475 *
6476 * For performance reasons a caller may choose not to initialize the metadata
6477 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6478 * is not valid and must be initialized by this function using 'port_no'.
6479 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6480 * will be ignored.
6481 */
6482 static inline size_t
6483 dfc_processing(struct dp_netdev_pmd_thread *pmd,
6484 struct dp_packet_batch *packets_,
6485 struct netdev_flow_key *keys,
6486 struct netdev_flow_key **missed_keys,
6487 struct packet_batch_per_flow batches[], size_t *n_batches,
6488 struct dp_packet_flow_map *flow_map,
6489 size_t *n_flows, uint8_t *index_map,
6490 bool md_is_valid, odp_port_t port_no)
6491 {
6492 struct netdev_flow_key *key = &keys[0];
6493 size_t n_missed = 0, n_emc_hit = 0;
6494 struct dfc_cache *cache = &pmd->flow_cache;
6495 struct dp_packet *packet;
6496 const size_t cnt = dp_packet_batch_size(packets_);
6497 uint32_t cur_min = pmd->ctx.emc_insert_min;
6498 int i;
6499 uint16_t tcp_flags;
6500 bool smc_enable_db;
6501 size_t map_cnt = 0;
6502 bool batch_enable = true;
6503
6504 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
6505 pmd_perf_update_counter(&pmd->perf_stats,
6506 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6507 cnt);
6508
6509 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6510 struct dp_netdev_flow *flow;
6511 uint32_t mark;
6512
6513 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6514 dp_packet_delete(packet);
6515 continue;
6516 }
6517
6518 if (i != cnt - 1) {
6519 struct dp_packet **packets = packets_->packets;
6520 /* Prefetch next packet data and metadata. */
6521 OVS_PREFETCH(dp_packet_data(packets[i+1]));
6522 pkt_metadata_prefetch_init(&packets[i+1]->md);
6523 }
6524
6525 if (!md_is_valid) {
6526 pkt_metadata_init(&packet->md, port_no);
6527 }
6528
6529 if ((*recirc_depth_get() == 0) &&
6530 dp_packet_has_flow_mark(packet, &mark)) {
6531 flow = mark_to_flow_find(pmd, mark);
6532 if (OVS_LIKELY(flow)) {
6533 tcp_flags = parse_tcp_flags(packet);
6534 if (OVS_LIKELY(batch_enable)) {
6535 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6536 n_batches);
6537 } else {
6538 /* Flow batching should be performed only after fast-path
6539 * processing is also completed for packets with emc miss
6540 * or else it will result in reordering of packets with
6541 * same datapath flows. */
6542 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6543 flow_map, map_cnt++);
6544 }
6545 continue;
6546 }
6547 }
6548
6549 miniflow_extract(packet, &key->mf);
6550 key->len = 0; /* Not computed yet. */
6551 key->hash =
6552 (md_is_valid == false)
6553 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6554 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6555
6556 /* If EMC is disabled, skip emc_lookup. */
6557 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
6558 if (OVS_LIKELY(flow)) {
6559 tcp_flags = miniflow_get_tcp_flags(&key->mf);
6560 n_emc_hit++;
6561 if (OVS_LIKELY(batch_enable)) {
6562 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6563 n_batches);
6564 } else {
6565 /* Flow batching should be performed only after fast-path
6566 * processing is also completed for packets with emc miss
6567 * or else it will result in reordering of packets with
6568 * same datapath flows. */
6569 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6570 flow_map, map_cnt++);
6571 }
6572 } else {
6573 /* Exact match cache missed. Group missed packets together at
6574 * the beginning of the 'packets' array. */
6575 dp_packet_batch_refill(packets_, packet, i);
6576
6577 /* Preserve the order of packet for flow batching. */
6578 index_map[n_missed] = map_cnt;
6579 flow_map[map_cnt++].flow = NULL;
6580
6581 /* 'key[n_missed]' contains the key of the current packet and it
6582 * will be passed to SMC lookup. The next key should be extracted
6583 * to 'keys[n_missed + 1]'.
6584 * We also maintain a pointer array to the keys that missed both SMC and
6585 * EMC, which will be returned to the caller for further processing. */
6586 missed_keys[n_missed] = key;
6587 key = &keys[++n_missed];
6588
6589 /* Skip batching for subsequent packets to avoid reordering. */
6590 batch_enable = false;
6591 }
6592 }
6593 /* Count of packets which are not flow batched. */
6594 *n_flows = map_cnt;
6595
6596 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6597
6598 if (!smc_enable_db) {
6599 return dp_packet_batch_size(packets_);
6600 }
6601
6602 /* Packets that miss the EMC do a batch lookup in the SMC, if enabled. */
6603 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6604 n_missed, flow_map, index_map);
6605
6606 return dp_packet_batch_size(packets_);
6607 }
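/* Editorial summary of the lookup tiers driven by the function above, per
 * received packet:
 *
 *     flow mark (if present, checked only at recirculation depth 0)
 *       -> EMC  (exact match on the miniflow, if 'cur_min' != 0)
 *         -> SMC (signature match on the key hash, if 'smc_enable_db')
 *           -> dpcls and possible upcall in fast_path_processing()
 *
 * Packets that miss are compacted to the front of 'packets_' and their keys
 * collected in 'missed_keys' so the later stages only touch the misses. */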
6608
6609 static inline int
6610 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6611 struct dp_packet *packet,
6612 const struct netdev_flow_key *key,
6613 struct ofpbuf *actions, struct ofpbuf *put_actions)
6614 {
6615 struct ofpbuf *add_actions;
6616 struct dp_packet_batch b;
6617 struct match match;
6618 ovs_u128 ufid;
6619 int error;
6620 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
6621
6622 match.tun_md.valid = false;
6623 miniflow_expand(&key->mf, &match.flow);
6624
6625 ofpbuf_clear(actions);
6626 ofpbuf_clear(put_actions);
6627
6628 dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
6629 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6630 &ufid, DPIF_UC_MISS, NULL, actions,
6631 put_actions);
6632 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6633 dp_packet_delete(packet);
6634 return error;
6635 }
6636
6637 /* The Netlink encoding of datapath flow keys cannot express
6638 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6639 * tag is interpreted as exact match on the fact that there is no
6640 * VLAN. Unless we refactor a lot of code that translates between
6641 * Netlink and struct flow representations, we have to do the same
6642 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
6643 if (!match.wc.masks.vlans[0].tci) {
6644 match.wc.masks.vlans[0].tci = htons(0xffff);
6645 }
6646
6647 /* We can't allow the packet batching in the next loop to execute
6648 * the actions. Otherwise, if there are any slow path actions,
6649 * we'll send the packet up twice. */
6650 dp_packet_batch_init_packet(&b, packet);
6651 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
6652 actions->data, actions->size);
6653
6654 add_actions = put_actions->size ? put_actions : actions;
6655 if (OVS_LIKELY(error != ENOSPC)) {
6656 struct dp_netdev_flow *netdev_flow;
6657
6658 /* XXX: There's a race window where a flow covering this packet
6659 * could have already been installed since we last did the flow
6660 * lookup before upcall. This could be solved by moving the
6661 * mutex lock outside the loop, but that's an awful long time
6662 * to be locking revalidators out of making flow modifications. */
6663 ovs_mutex_lock(&pmd->flow_mutex);
6664 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
6665 if (OVS_LIKELY(!netdev_flow)) {
6666 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6667 add_actions->data,
6668 add_actions->size);
6669 }
6670 ovs_mutex_unlock(&pmd->flow_mutex);
6671 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
6672 smc_insert(pmd, key, hash);
6673 emc_probabilistic_insert(pmd, key, netdev_flow);
6674 }
6675 if (pmd_perf_metrics_enabled(pmd)) {
6676 /* Update upcall stats. */
6677 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
6678 struct pmd_perf_stats *s = &pmd->perf_stats;
6679 s->current.upcalls++;
6680 s->current.upcall_cycles += cycles;
6681 histogram_add_sample(&s->cycles_per_upcall, cycles);
6682 }
6683 return error;
6684 }
6685
6686 static inline void
6687 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
6688 struct dp_packet_batch *packets_,
6689 struct netdev_flow_key **keys,
6690 struct dp_packet_flow_map *flow_map,
6691 uint8_t *index_map,
6692 odp_port_t in_port)
6693 {
6694 const size_t cnt = dp_packet_batch_size(packets_);
6695 #if !defined(__CHECKER__) && !defined(_WIN32)
6696 const size_t PKT_ARRAY_SIZE = cnt;
6697 #else
6698 /* Sparse or MSVC doesn't like variable length array. */
6699 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6700 #endif
6701 struct dp_packet *packet;
6702 struct dpcls *cls;
6703 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
6704 struct dp_netdev *dp = pmd->dp;
6705 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
6706 int lookup_cnt = 0, add_lookup_cnt;
6707 bool any_miss;
6708
6709 for (size_t i = 0; i < cnt; i++) {
6710 /* Key length is needed in all cases; the hash is computed on demand. */
6711 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
6712 }
6713 /* Get the classifier for the in_port */
6714 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
6715 if (OVS_LIKELY(cls)) {
6716 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
6717 rules, cnt, &lookup_cnt);
6718 } else {
6719 any_miss = true;
6720 memset(rules, 0, sizeof(rules));
6721 }
6722 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
6723 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
6724 struct ofpbuf actions, put_actions;
6725
6726 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
6727 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
6728
6729 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6730 struct dp_netdev_flow *netdev_flow;
6731
6732 if (OVS_LIKELY(rules[i])) {
6733 continue;
6734 }
6735
6736 /* It's possible that an earlier slow path execution installed
6737 * a rule covering this flow. In this case, it's a lot cheaper
6738 * to catch it here than execute a miss. */
6739 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
6740 &add_lookup_cnt);
6741 if (netdev_flow) {
6742 lookup_cnt += add_lookup_cnt;
6743 rules[i] = &netdev_flow->cr;
6744 continue;
6745 }
6746
6747 int error = handle_packet_upcall(pmd, packet, keys[i],
6748 &actions, &put_actions);
6749
6750 if (OVS_UNLIKELY(error)) {
6751 upcall_fail_cnt++;
6752 } else {
6753 upcall_ok_cnt++;
6754 }
6755 }
6756
6757 ofpbuf_uninit(&actions);
6758 ofpbuf_uninit(&put_actions);
6759 fat_rwlock_unlock(&dp->upcall_rwlock);
6760 } else if (OVS_UNLIKELY(any_miss)) {
6761 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6762 if (OVS_UNLIKELY(!rules[i])) {
6763 dp_packet_delete(packet);
6764 upcall_fail_cnt++;
6765 }
6766 }
6767 }
6768
6769 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6770 struct dp_netdev_flow *flow;
6771 /* Get the original order of this packet in received batch. */
6772 int recv_idx = index_map[i];
6773 uint16_t tcp_flags;
6774
6775 if (OVS_UNLIKELY(!rules[i])) {
6776 continue;
6777 }
6778
6779 flow = dp_netdev_flow_cast(rules[i]);
6780 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
6781 smc_insert(pmd, keys[i], hash);
6782
6783 emc_probabilistic_insert(pmd, keys[i], flow);
6784 /* Add these packets into the flow map in the same order
6785 * as received.
6786 */
6787 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
6788 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6789 flow_map, recv_idx);
6790 }
6791
6792 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
6793 cnt - upcall_ok_cnt - upcall_fail_cnt);
6794 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
6795 lookup_cnt);
6796 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
6797 upcall_ok_cnt);
6798 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
6799 upcall_fail_cnt);
6800 }
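/* Editorial note on the counters above, with made-up numbers: for a
 * 32-packet miss batch in which 28 packets match a dpcls rule, 3 upcalls
 * succeed and 1 fails, the function records:
 *
 *     MASKED_HIT    = 32 - 3 - 1 = 28
 *     MASKED_LOOKUP = whatever 'lookup_cnt' accumulated from the dpcls and
 *                     flow-table lookups
 *     MISS          = 3   (successful upcalls)
 *     LOST          = 1   (failed upcalls; those packets were deleted)
 */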
6801
6802 /* Packets enter the datapath from a port (or from recirculation) here.
6803 *
6804 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
6805 * When false the metadata in 'packets' need to be initialized. */
6806 static void
6807 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
6808 struct dp_packet_batch *packets,
6809 bool md_is_valid, odp_port_t port_no)
6810 {
6811 #if !defined(__CHECKER__) && !defined(_WIN32)
6812 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
6813 #else
6814 /* Sparse or MSVC doesn't like variable length array. */
6815 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
6816 #endif
6817 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
6818 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
6819 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
6820 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
6821 size_t n_batches;
6822 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
6823 uint8_t index_map[PKT_ARRAY_SIZE];
6824 size_t n_flows, i;
6825
6826 odp_port_t in_port;
6827
6828 n_batches = 0;
6829 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
6830 flow_map, &n_flows, index_map, md_is_valid, port_no);
6831
6832 if (!dp_packet_batch_is_empty(packets)) {
6833 /* Get ingress port from first packet's metadata. */
6834 in_port = packets->packets[0]->md.in_port.odp_port;
6835 fast_path_processing(pmd, packets, missed_keys,
6836 flow_map, index_map, in_port);
6837 }
6838
6839 /* Batch rest of packets which are in flow map. */
6840 for (i = 0; i < n_flows; i++) {
6841 struct dp_packet_flow_map *map = &flow_map[i];
6842
6843 if (OVS_UNLIKELY(!map->flow)) {
6844 continue;
6845 }
6846 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
6847 batches, &n_batches);
6848 }
6849
6850 /* All the flow batches need to be reset before any call to
6851 * packet_batch_per_flow_execute() as it could potentially trigger
6852 * recirculation. When a packet matching flow ‘j’ happens to be
6853 * recirculated, the nested call to dp_netdev_input__() could potentially
6854 * classify the packet as matching another flow - say 'k'. It could happen
6855 * that in the previous call to dp_netdev_input__() that same flow 'k' had
6856 * already its own batches[k] still waiting to be served. So if its
6857 * ‘batch’ member is not reset, the recirculated packet would be wrongly
6858 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
6859 for (i = 0; i < n_batches; i++) {
6860 batches[i].flow->batch = NULL;
6861 }
6862
6863 for (i = 0; i < n_batches; i++) {
6864 packet_batch_per_flow_execute(&batches[i], pmd);
6865 }
6866 }
6867
6868 static void
6869 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
6870 struct dp_packet_batch *packets,
6871 odp_port_t port_no)
6872 {
6873 dp_netdev_input__(pmd, packets, false, port_no);
6874 }
6875
6876 static void
6877 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
6878 struct dp_packet_batch *packets)
6879 {
6880 dp_netdev_input__(pmd, packets, true, 0);
6881 }
6882
6883 struct dp_netdev_execute_aux {
6884 struct dp_netdev_pmd_thread *pmd;
6885 const struct flow *flow;
6886 };
6887
6888 static void
6889 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
6890 void *aux)
6891 {
6892 struct dp_netdev *dp = get_dp_netdev(dpif);
6893 dp->dp_purge_aux = aux;
6894 dp->dp_purge_cb = cb;
6895 }
6896
6897 static void
6898 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
6899 void *aux)
6900 {
6901 struct dp_netdev *dp = get_dp_netdev(dpif);
6902 dp->upcall_aux = aux;
6903 dp->upcall_cb = cb;
6904 }
6905
6906 static void
6907 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
6908 bool purge)
6909 {
6910 struct tx_port *tx;
6911 struct dp_netdev_port *port;
6912 long long interval;
6913
6914 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
6915 if (!tx->port->dynamic_txqs) {
6916 continue;
6917 }
6918 interval = pmd->ctx.now - tx->last_used;
6919 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
6920 port = tx->port;
6921 ovs_mutex_lock(&port->txq_used_mutex);
6922 port->txq_used[tx->qid]--;
6923 ovs_mutex_unlock(&port->txq_used_mutex);
6924 tx->qid = -1;
6925 }
6926 }
6927 }
6928
6929 static int
6930 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
6931 struct tx_port *tx)
6932 {
6933 struct dp_netdev_port *port;
6934 long long interval;
6935 int i, min_cnt, min_qid;
6936
6937 interval = pmd->ctx.now - tx->last_used;
6938 tx->last_used = pmd->ctx.now;
6939
6940 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
6941 return tx->qid;
6942 }
6943
6944 port = tx->port;
6945
6946 ovs_mutex_lock(&port->txq_used_mutex);
6947 if (tx->qid >= 0) {
6948 port->txq_used[tx->qid]--;
6949 tx->qid = -1;
6950 }
6951
6952 min_cnt = -1;
6953 min_qid = 0;
6954 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
6955 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
6956 min_cnt = port->txq_used[i];
6957 min_qid = i;
6958 }
6959 }
6960
6961 port->txq_used[min_qid]++;
6962 tx->qid = min_qid;
6963
6964 ovs_mutex_unlock(&port->txq_used_mutex);
6965
6966 dpif_netdev_xps_revalidate_pmd(pmd, false);
6967
6968 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
6969 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
6970 return min_qid;
6971 }
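/* Editorial example of the XPS selection above, with made-up numbers: for a
 * port with 3 txqs and txq_used[] = {2, 0, 1}, a PMD whose cached qid has
 * expired (or was never assigned, qid == -1) picks qid 1 and bumps
 * txq_used[1] to 1.  It keeps reusing that qid until this PMD leaves the
 * port unused for XPS_TIMEOUT, after which dpif_netdev_xps_revalidate_pmd()
 * returns it to the pool. */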
6972
6973 static struct tx_port *
6974 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6975 odp_port_t port_no)
6976 {
6977 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
6978 }
6979
6980 static struct tx_port *
6981 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
6982 odp_port_t port_no)
6983 {
6984 return tx_port_lookup(&pmd->send_port_cache, port_no);
6985 }
6986
6987 static int
6988 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
6989 const struct nlattr *attr,
6990 struct dp_packet_batch *batch)
6991 {
6992 struct tx_port *tun_port;
6993 const struct ovs_action_push_tnl *data;
6994 int err;
6995
6996 data = nl_attr_get(attr);
6997
6998 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
6999 if (!tun_port) {
7000 err = -EINVAL;
7001 goto error;
7002 }
7003 err = netdev_push_header(tun_port->port->netdev, batch, data);
7004 if (!err) {
7005 return 0;
7006 }
7007 error:
7008 dp_packet_delete_batch(batch, true);
7009 return err;
7010 }
7011
7012 static void
7013 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7014 struct dp_packet *packet, bool should_steal,
7015 struct flow *flow, ovs_u128 *ufid,
7016 struct ofpbuf *actions,
7017 const struct nlattr *userdata)
7018 {
7019 struct dp_packet_batch b;
7020 int error;
7021
7022 ofpbuf_clear(actions);
7023
7024 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7025 DPIF_UC_ACTION, userdata, actions,
7026 NULL);
7027 if (!error || error == ENOSPC) {
7028 dp_packet_batch_init_packet(&b, packet);
7029 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7030 actions->data, actions->size);
7031 } else if (should_steal) {
7032 dp_packet_delete(packet);
7033 }
7034 }
7035
7036 static void
7037 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7038 const struct nlattr *a, bool should_steal)
7039 OVS_NO_THREAD_SAFETY_ANALYSIS
7040 {
7041 struct dp_netdev_execute_aux *aux = aux_;
7042 uint32_t *depth = recirc_depth_get();
7043 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7044 struct dp_netdev *dp = pmd->dp;
7045 int type = nl_attr_type(a);
7046 struct tx_port *p;
7047
7048 switch ((enum ovs_action_attr)type) {
7049 case OVS_ACTION_ATTR_OUTPUT:
7050 p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
7051 if (OVS_LIKELY(p)) {
7052 struct dp_packet *packet;
7053 struct dp_packet_batch out;
7054
7055 if (!should_steal) {
7056 dp_packet_batch_clone(&out, packets_);
7057 dp_packet_batch_reset_cutlen(packets_);
7058 packets_ = &out;
7059 }
7060 dp_packet_batch_apply_cutlen(packets_);
7061
7062 #ifdef DPDK_NETDEV
7063 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7064 && packets_->packets[0]->source
7065 != p->output_pkts.packets[0]->source)) {
7066 /* XXX: netdev-dpdk assumes that all packets in a single
7067 * output batch have the same source. Flush here to
7068 * avoid memory access issues. */
7069 dp_netdev_pmd_flush_output_on_port(pmd, p);
7070 }
7071 #endif
7072 if (dp_packet_batch_size(&p->output_pkts)
7073 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7074 /* Flush here to avoid overflow. */
7075 dp_netdev_pmd_flush_output_on_port(pmd, p);
7076 }
7077
7078 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7079 pmd->n_output_batches++;
7080 }
7081
7082 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7083 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7084 pmd->ctx.last_rxq;
7085 dp_packet_batch_add(&p->output_pkts, packet);
7086 }
7087 return;
7088 }
7089 break;
7090
7091 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7092 if (should_steal) {
7093 /* We're requested to push tunnel header, but also we need to take
7094 * the ownership of these packets. Thus, we can avoid performing
7095 * the action, because the caller will not use the result anyway.
7096 * Just break to free the batch. */
7097 break;
7098 }
7099 dp_packet_batch_apply_cutlen(packets_);
7100 push_tnl_action(pmd, a, packets_);
7101 return;
7102
7103 case OVS_ACTION_ATTR_TUNNEL_POP:
7104 if (*depth < MAX_RECIRC_DEPTH) {
7105 struct dp_packet_batch *orig_packets_ = packets_;
7106 odp_port_t portno = nl_attr_get_odp_port(a);
7107
7108 p = pmd_tnl_port_cache_lookup(pmd, portno);
7109 if (p) {
7110 struct dp_packet_batch tnl_pkt;
7111
7112 if (!should_steal) {
7113 dp_packet_batch_clone(&tnl_pkt, packets_);
7114 packets_ = &tnl_pkt;
7115 dp_packet_batch_reset_cutlen(orig_packets_);
7116 }
7117
7118 dp_packet_batch_apply_cutlen(packets_);
7119
7120 netdev_pop_header(p->port->netdev, packets_);
7121 if (dp_packet_batch_is_empty(packets_)) {
7122 return;
7123 }
7124
7125 struct dp_packet *packet;
7126 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7127 packet->md.in_port.odp_port = portno;
7128 }
7129
7130 (*depth)++;
7131 dp_netdev_recirculate(pmd, packets_);
7132 (*depth)--;
7133 return;
7134 }
7135 }
7136 break;
7137
7138 case OVS_ACTION_ATTR_USERSPACE:
7139 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7140 struct dp_packet_batch *orig_packets_ = packets_;
7141 const struct nlattr *userdata;
7142 struct dp_packet_batch usr_pkt;
7143 struct ofpbuf actions;
7144 struct flow flow;
7145 ovs_u128 ufid;
7146 bool clone = false;
7147
7148 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7149 ofpbuf_init(&actions, 0);
7150
7151 if (packets_->trunc) {
7152 if (!should_steal) {
7153 dp_packet_batch_clone(&usr_pkt, packets_);
7154 packets_ = &usr_pkt;
7155 clone = true;
7156 dp_packet_batch_reset_cutlen(orig_packets_);
7157 }
7158
7159 dp_packet_batch_apply_cutlen(packets_);
7160 }
7161
7162 struct dp_packet *packet;
7163 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7164 flow_extract(packet, &flow);
7165 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
7166 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7167 &ufid, &actions, userdata);
7168 }
7169
7170 if (clone) {
7171 dp_packet_delete_batch(packets_, true);
7172 }
7173
7174 ofpbuf_uninit(&actions);
7175 fat_rwlock_unlock(&dp->upcall_rwlock);
7176
7177 return;
7178 }
7179 break;
7180
7181 case OVS_ACTION_ATTR_RECIRC:
7182 if (*depth < MAX_RECIRC_DEPTH) {
7183 struct dp_packet_batch recirc_pkts;
7184
7185 if (!should_steal) {
7186 dp_packet_batch_clone(&recirc_pkts, packets_);
7187 packets_ = &recirc_pkts;
7188 }
7189
7190 struct dp_packet *packet;
7191 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7192 packet->md.recirc_id = nl_attr_get_u32(a);
7193 }
7194
7195 (*depth)++;
7196 dp_netdev_recirculate(pmd, packets_);
7197 (*depth)--;
7198
7199 return;
7200 }
7201
7202 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7203 break;
7204
7205 case OVS_ACTION_ATTR_CT: {
7206 const struct nlattr *b;
7207 bool force = false;
7208 bool commit = false;
7209 unsigned int left;
7210 uint16_t zone = 0;
7211 const char *helper = NULL;
7212 const uint32_t *setmark = NULL;
7213 const struct ovs_key_ct_labels *setlabel = NULL;
7214 struct nat_action_info_t nat_action_info;
7215 struct nat_action_info_t *nat_action_info_ref = NULL;
7216 bool nat_config = false;
7217
7218 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7219 nl_attr_get_size(a)) {
7220 enum ovs_ct_attr sub_type = nl_attr_type(b);
7221
7222 switch(sub_type) {
7223 case OVS_CT_ATTR_FORCE_COMMIT:
7224 force = true;
7225 /* fall through. */
7226 case OVS_CT_ATTR_COMMIT:
7227 commit = true;
7228 break;
7229 case OVS_CT_ATTR_ZONE:
7230 zone = nl_attr_get_u16(b);
7231 break;
7232 case OVS_CT_ATTR_HELPER:
7233 helper = nl_attr_get_string(b);
7234 break;
7235 case OVS_CT_ATTR_MARK:
7236 setmark = nl_attr_get(b);
7237 break;
7238 case OVS_CT_ATTR_LABELS:
7239 setlabel = nl_attr_get(b);
7240 break;
7241 case OVS_CT_ATTR_EVENTMASK:
7242 /* Silently ignored, as userspace datapath does not generate
7243 * netlink events. */
7244 break;
7245 case OVS_CT_ATTR_NAT: {
7246 const struct nlattr *b_nest;
7247 unsigned int left_nest;
7248 bool ip_min_specified = false;
7249 bool proto_num_min_specified = false;
7250 bool ip_max_specified = false;
7251 bool proto_num_max_specified = false;
7252 memset(&nat_action_info, 0, sizeof nat_action_info);
7253 nat_action_info_ref = &nat_action_info;
7254
7255 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7256 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7257
7258 switch (sub_type_nest) {
7259 case OVS_NAT_ATTR_SRC:
7260 case OVS_NAT_ATTR_DST:
7261 nat_config = true;
7262 nat_action_info.nat_action |=
7263 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7264 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7265 break;
7266 case OVS_NAT_ATTR_IP_MIN:
7267 memcpy(&nat_action_info.min_addr,
7268 nl_attr_get(b_nest),
7269 nl_attr_get_size(b_nest));
7270 ip_min_specified = true;
7271 break;
7272 case OVS_NAT_ATTR_IP_MAX:
7273 memcpy(&nat_action_info.max_addr,
7274 nl_attr_get(b_nest),
7275 nl_attr_get_size(b_nest));
7276 ip_max_specified = true;
7277 break;
7278 case OVS_NAT_ATTR_PROTO_MIN:
7279 nat_action_info.min_port =
7280 nl_attr_get_u16(b_nest);
7281 proto_num_min_specified = true;
7282 break;
7283 case OVS_NAT_ATTR_PROTO_MAX:
7284 nat_action_info.max_port =
7285 nl_attr_get_u16(b_nest);
7286 proto_num_max_specified = true;
7287 break;
7288 case OVS_NAT_ATTR_PERSISTENT:
7289 case OVS_NAT_ATTR_PROTO_HASH:
7290 case OVS_NAT_ATTR_PROTO_RANDOM:
7291 break;
7292 case OVS_NAT_ATTR_UNSPEC:
7293 case __OVS_NAT_ATTR_MAX:
7294 OVS_NOT_REACHED();
7295 }
7296 }
7297
7298 if (ip_min_specified && !ip_max_specified) {
7299 nat_action_info.max_addr = nat_action_info.min_addr;
7300 }
7301 if (proto_num_min_specified && !proto_num_max_specified) {
7302 nat_action_info.max_port = nat_action_info.min_port;
7303 }
7304 if (proto_num_min_specified || proto_num_max_specified) {
7305 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7306 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7307 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7308 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7309 }
7310 }
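/* Editor's note (illustrative example, not from the original source): a
 * datapath action such as ct(commit,nat(src=10.1.1.1)) carries only
 * OVS_NAT_ATTR_SRC and OVS_NAT_ATTR_IP_MIN, so the fixups above copy
 * min_addr into max_addr and leave the port range and port flags unset. */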
7311 break;
7312 }
7313 case OVS_CT_ATTR_UNSPEC:
7314 case __OVS_CT_ATTR_MAX:
7315 OVS_NOT_REACHED();
7316 }
7317 }
7318
7319 /* NAT without a commit will not work properly, hence complain
7320 * loudly. */
7321 if (nat_config && !commit) {
7322 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7323 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7324 }
7325
7326 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7327 commit, zone, setmark, setlabel, aux->flow->tp_src,
7328 aux->flow->tp_dst, helper, nat_action_info_ref,
7329 pmd->ctx.now / 1000);
7330 break;
7331 }
7332
7333 case OVS_ACTION_ATTR_METER:
7334 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7335 pmd->ctx.now);
7336 break;
7337
7338 case OVS_ACTION_ATTR_PUSH_VLAN:
7339 case OVS_ACTION_ATTR_POP_VLAN:
7340 case OVS_ACTION_ATTR_PUSH_MPLS:
7341 case OVS_ACTION_ATTR_POP_MPLS:
7342 case OVS_ACTION_ATTR_SET:
7343 case OVS_ACTION_ATTR_SET_MASKED:
7344 case OVS_ACTION_ATTR_SAMPLE:
7345 case OVS_ACTION_ATTR_HASH:
7346 case OVS_ACTION_ATTR_UNSPEC:
7347 case OVS_ACTION_ATTR_TRUNC:
7348 case OVS_ACTION_ATTR_PUSH_ETH:
7349 case OVS_ACTION_ATTR_POP_ETH:
7350 case OVS_ACTION_ATTR_CLONE:
7351 case OVS_ACTION_ATTR_PUSH_NSH:
7352 case OVS_ACTION_ATTR_POP_NSH:
7353 case OVS_ACTION_ATTR_CT_CLEAR:
7354 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7355 case __OVS_ACTION_ATTR_MAX:
7356 OVS_NOT_REACHED();
7357 }
7358
7359 dp_packet_delete_batch(packets_, should_steal);
7360 }
7361
7362 static void
7363 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7364 struct dp_packet_batch *packets,
7365 bool should_steal, const struct flow *flow,
7366 const struct nlattr *actions, size_t actions_len)
7367 {
7368 struct dp_netdev_execute_aux aux = { pmd, flow };
7369
7370 odp_execute_actions(&aux, packets, should_steal, actions,
7371 actions_len, dp_execute_cb);
7372 }
7373
7374 struct dp_netdev_ct_dump {
7375 struct ct_dpif_dump_state up;
7376 struct conntrack_dump dump;
7377 struct conntrack *ct;
7378 struct dp_netdev *dp;
7379 };
7380
7381 static int
7382 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
7383 const uint16_t *pzone, int *ptot_bkts)
7384 {
7385 struct dp_netdev *dp = get_dp_netdev(dpif);
7386 struct dp_netdev_ct_dump *dump;
7387
7388 dump = xzalloc(sizeof *dump);
7389 dump->dp = dp;
7390 dump->ct = dp->conntrack;
7391
7392 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
7393
7394 *dump_ = &dump->up;
7395
7396 return 0;
7397 }
7398
7399 static int
7400 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7401 struct ct_dpif_dump_state *dump_,
7402 struct ct_dpif_entry *entry)
7403 {
7404 struct dp_netdev_ct_dump *dump;
7405
7406 INIT_CONTAINER(dump, dump_, up);
7407
7408 return conntrack_dump_next(&dump->dump, entry);
7409 }
7410
7411 static int
7412 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7413 struct ct_dpif_dump_state *dump_)
7414 {
7415 struct dp_netdev_ct_dump *dump;
7416 int err;
7417
7418 INIT_CONTAINER(dump, dump_, up);
7419
7420 err = conntrack_dump_done(&dump->dump);
7421
7422 free(dump);
7423
7424 return err;
7425 }
7426
7427 static int
7428 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7429 const struct ct_dpif_tuple *tuple)
7430 {
7431 struct dp_netdev *dp = get_dp_netdev(dpif);
7432
7433 if (tuple) {
7434 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
7435 }
7436 return conntrack_flush(dp->conntrack, zone);
7437 }
7438
7439 static int
7440 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7441 {
7442 struct dp_netdev *dp = get_dp_netdev(dpif);
7443
7444 return conntrack_set_maxconns(dp->conntrack, maxconns);
7445 }
7446
7447 static int
7448 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7449 {
7450 struct dp_netdev *dp = get_dp_netdev(dpif);
7451
7452 return conntrack_get_maxconns(dp->conntrack, maxconns);
7453 }
7454
7455 static int
7456 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7457 {
7458 struct dp_netdev *dp = get_dp_netdev(dpif);
7459
7460 return conntrack_get_nconns(dp->conntrack, nconns);
7461 }
7462
7463 static int
7464 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
7465 {
7466 struct dp_netdev *dp = get_dp_netdev(dpif);
7467 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
7468 }
7469
7470 static int
7471 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
7472 {
7473 struct dp_netdev *dp = get_dp_netdev(dpif);
7474 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
7475 }
7476
7477 static int
7478 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
7479 {
7480 struct dp_netdev *dp = get_dp_netdev(dpif);
7481 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
7482 }
7483
7484 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
7485 * diverge. */
7486 static int
7487 dpif_netdev_ipf_get_status(struct dpif *dpif,
7488 struct dpif_ipf_status *dpif_ipf_status)
7489 {
7490 struct dp_netdev *dp = get_dp_netdev(dpif);
7491 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
7492 (struct ipf_status *) dpif_ipf_status);
7493 return 0;
7494 }
7495
7496 static int
7497 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
7498 struct ipf_dump_ctx **ipf_dump_ctx)
7499 {
7500 return ipf_dump_start(ipf_dump_ctx);
7501 }
7502
7503 static int
7504 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
7505 {
7506 struct dp_netdev *dp = get_dp_netdev(dpif);
7507 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
7508 dump);
7509 }
7510
7511 static int
7512 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
7513 {
7514 return ipf_dump_done(ipf_dump_ctx);
7515
7516 }
7517
7518 const struct dpif_class dpif_netdev_class = {
7519 "netdev",
7520 true, /* cleanup_required */
7521 dpif_netdev_init,
7522 dpif_netdev_enumerate,
7523 dpif_netdev_port_open_type,
7524 dpif_netdev_open,
7525 dpif_netdev_close,
7526 dpif_netdev_destroy,
7527 dpif_netdev_run,
7528 dpif_netdev_wait,
7529 dpif_netdev_get_stats,
7530 dpif_netdev_port_add,
7531 dpif_netdev_port_del,
7532 dpif_netdev_port_set_config,
7533 dpif_netdev_port_query_by_number,
7534 dpif_netdev_port_query_by_name,
7535 NULL, /* port_get_pid */
7536 dpif_netdev_port_dump_start,
7537 dpif_netdev_port_dump_next,
7538 dpif_netdev_port_dump_done,
7539 dpif_netdev_port_poll,
7540 dpif_netdev_port_poll_wait,
7541 dpif_netdev_flow_flush,
7542 dpif_netdev_flow_dump_create,
7543 dpif_netdev_flow_dump_destroy,
7544 dpif_netdev_flow_dump_thread_create,
7545 dpif_netdev_flow_dump_thread_destroy,
7546 dpif_netdev_flow_dump_next,
7547 dpif_netdev_operate,
7548 NULL, /* recv_set */
7549 NULL, /* handlers_set */
7550 dpif_netdev_set_config,
7551 dpif_netdev_queue_to_priority,
7552 NULL, /* recv */
7553 NULL, /* recv_wait */
7554 NULL, /* recv_purge */
7555 dpif_netdev_register_dp_purge_cb,
7556 dpif_netdev_register_upcall_cb,
7557 dpif_netdev_enable_upcall,
7558 dpif_netdev_disable_upcall,
7559 dpif_netdev_get_datapath_version,
7560 dpif_netdev_ct_dump_start,
7561 dpif_netdev_ct_dump_next,
7562 dpif_netdev_ct_dump_done,
7563 dpif_netdev_ct_flush,
7564 dpif_netdev_ct_set_maxconns,
7565 dpif_netdev_ct_get_maxconns,
7566 dpif_netdev_ct_get_nconns,
7567 NULL, /* ct_set_limits */
7568 NULL, /* ct_get_limits */
7569 NULL, /* ct_del_limits */
7570 dpif_netdev_ipf_set_enabled,
7571 dpif_netdev_ipf_set_min_frag,
7572 dpif_netdev_ipf_set_max_nfrags,
7573 dpif_netdev_ipf_get_status,
7574 dpif_netdev_ipf_dump_start,
7575 dpif_netdev_ipf_dump_next,
7576 dpif_netdev_ipf_dump_done,
7577 dpif_netdev_meter_get_features,
7578 dpif_netdev_meter_set,
7579 dpif_netdev_meter_get,
7580 dpif_netdev_meter_del,
7581 };
7582
7583 static void
7584 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
7585 const char *argv[], void *aux OVS_UNUSED)
7586 {
7587 struct dp_netdev_port *port;
7588 struct dp_netdev *dp;
7589 odp_port_t port_no;
7590
7591 ovs_mutex_lock(&dp_netdev_mutex);
7592 dp = shash_find_data(&dp_netdevs, argv[1]);
7593 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
7594 ovs_mutex_unlock(&dp_netdev_mutex);
7595 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
7596 return;
7597 }
7598 ovs_refcount_ref(&dp->ref_cnt);
7599 ovs_mutex_unlock(&dp_netdev_mutex);
7600
7601 ovs_mutex_lock(&dp->port_mutex);
7602 if (get_port_by_name(dp, argv[2], &port)) {
7603 unixctl_command_reply_error(conn, "unknown port");
7604 goto exit;
7605 }
7606
7607 port_no = u32_to_odp(atoi(argv[3]));
7608 if (!port_no || port_no == ODPP_NONE) {
7609 unixctl_command_reply_error(conn, "bad port number");
7610 goto exit;
7611 }
7612 if (dp_netdev_lookup_port(dp, port_no)) {
7613 unixctl_command_reply_error(conn, "port number already in use");
7614 goto exit;
7615 }
7616
7617 /* Remove port. */
7618 hmap_remove(&dp->ports, &port->node);
7619 reconfigure_datapath(dp);
7620
7621 /* Reinsert with new port number. */
7622 port->port_no = port_no;
7623 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
7624 reconfigure_datapath(dp);
7625
7626 seq_change(dp->port_seq);
7627 unixctl_command_reply(conn, NULL);
7628
7629 exit:
7630 ovs_mutex_unlock(&dp->port_mutex);
7631 dp_netdev_unref(dp);
7632 }
7633
7634 static void
7635 dpif_dummy_register__(const char *type)
7636 {
7637 struct dpif_class *class;
7638
7639 class = xmalloc(sizeof *class);
7640 *class = dpif_netdev_class;
7641 class->type = xstrdup(type);
7642 dp_register_provider(class);
7643 }
7644
7645 static void
7646 dpif_dummy_override(const char *type)
7647 {
7648 int error;
7649
7650 /*
7651 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
7652 * a userland-only build. This is useful for the testsuite.
7653 */
7654 error = dp_unregister_provider(type);
7655 if (error == 0 || error == EAFNOSUPPORT) {
7656 dpif_dummy_register__(type);
7657 }
7658 }
7659
7660 void
7661 dpif_dummy_register(enum dummy_level level)
7662 {
7663 if (level == DUMMY_OVERRIDE_ALL) {
7664 struct sset types;
7665 const char *type;
7666
7667 sset_init(&types);
7668 dp_enumerate_types(&types);
7669 SSET_FOR_EACH (type, &types) {
7670 dpif_dummy_override(type);
7671 }
7672 sset_destroy(&types);
7673 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
7674 dpif_dummy_override("system");
7675 }
7676
7677 dpif_dummy_register__("dummy");
7678
7679 unixctl_command_register("dpif-dummy/change-port-number",
7680 "dp port new-number",
7681 3, 3, dpif_dummy_change_port_number, NULL);
7682 }
7683 \f
7684 /* Datapath Classifier. */
7685
7686 /* Forward declaration for lookup_func typedef. */
7687 struct dpcls_subtable;
7688
7689 /* Lookup function for a subtable in the dpcls. This function is called
7690 * for each subtable with an array of flow keys and a bitmask of the keys
7691 * to perform the lookup on. Using a function pointer gives flexibility to
7692 * optimize the lookup function based on subtable properties and the
7693 * CPU instruction set available at runtime.
7694 */
7695 typedef
7696 uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable *subtable,
7697 uint32_t keys_map,
7698 const struct netdev_flow_key *keys[],
7699 struct dpcls_rule **rules);
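/* Editor's sketch (hypothetical, not part of this file): an alternative
 * implementation only has to match the typedef above; it receives the
 * subtable, a bitmask of the keys to look up and the key array, fills
 * 'rules' for each match, and returns the bitmask of keys for which a rule
 * was found. A trivial conforming variant could simply delegate to the
 * generic code:
 *
 *   static uint32_t
 *   dpcls_subtable_lookup_wrapper(struct dpcls_subtable *subtable,
 *                                 uint32_t keys_map,
 *                                 const struct netdev_flow_key *keys[],
 *                                 struct dpcls_rule **rules)
 *   {
 *       return dpcls_subtable_lookup_generic(subtable, keys_map,
 *                                            keys, rules);
 *   }
 *
 * A real specialization would instead exploit a known subtable property or
 * a CPU feature detected at runtime. */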
7700
7701 /* Prototype for the generic lookup function, using the same code path as before. */
7702 uint32_t
7703 dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
7704 uint32_t keys_map,
7705 const struct netdev_flow_key *keys[],
7706 struct dpcls_rule **rules);
7707
7708 /* A set of rules that all have the same fields wildcarded. */
7709 struct dpcls_subtable {
7710 /* These fields are only used by writers. */
7711 struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */
7712
7713 /* These fields are accessed by readers. */
7714 struct cmap rules; /* Contains "struct dpcls_rule"s. */
7715 uint32_t hit_cnt; /* Number of match hits in subtable in current
7716 optimization interval. */
7717
7718 /* The lookup function to use for this subtable. If there is a known
7719 * property of the subtable (e.g., only 3 bits of miniflow metadata are
7720 * used for the lookup), then this can point at an optimized version of
7721 * the lookup function for this particular subtable. */
7722 dpcls_subtable_lookup_func lookup_func;
7723
7724 struct netdev_flow_key mask; /* Wildcards for fields (const). */
7725 /* 'mask' must be the last field, additional space is allocated here. */
7726 };
7727
7728 static void
7729 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
7730 {
7731 cmap_destroy(&subtable->rules);
7732 ovsrcu_postpone(free, subtable);
7733 }
7734
7735 /* Initializes 'cls' as a classifier that initially contains no classification
7736 * rules. */
7737 static void
7738 dpcls_init(struct dpcls *cls)
7739 {
7740 cmap_init(&cls->subtables_map);
7741 pvector_init(&cls->subtables);
7742 }
7743
7744 static void
7745 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
7746 {
7747 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
7748 pvector_remove(&cls->subtables, subtable);
7749 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
7750 subtable->mask.hash);
7751 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
7752 }
7753
7754 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
7755 * caller's responsibility.
7756 * May only be called after all the readers have been terminated. */
7757 static void
7758 dpcls_destroy(struct dpcls *cls)
7759 {
7760 if (cls) {
7761 struct dpcls_subtable *subtable;
7762
7763 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
7764 ovs_assert(cmap_count(&subtable->rules) == 0);
7765 dpcls_destroy_subtable(cls, subtable);
7766 }
7767 cmap_destroy(&cls->subtables_map);
7768 pvector_destroy(&cls->subtables);
7769 }
7770 }
7771
7772 static struct dpcls_subtable *
7773 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7774 {
7775 struct dpcls_subtable *subtable;
7776
7777 /* Need to add one. */
7778 subtable = xmalloc(sizeof *subtable
7779 - sizeof subtable->mask.mf + mask->len);
7780 cmap_init(&subtable->rules);
7781 subtable->hit_cnt = 0;
7782 netdev_flow_key_clone(&subtable->mask, mask);
7783
7784 /* Decide which hash/lookup/verify function to use. */
7785 subtable->lookup_func = dpcls_subtable_lookup_generic;
7786
7787 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
7788 /* Add the new subtable at the end of the pvector (with no hits yet). */
7789 pvector_insert(&cls->subtables, subtable, 0);
7790 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
7791 cmap_count(&cls->subtables_map), subtable, cls->in_port);
7792 pvector_publish(&cls->subtables);
7793
7794 return subtable;
7795 }
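/* Editor's sketch (hypothetical helper, not part of this file): the single
 * assignment of 'lookup_func' above is the natural place to pick a
 * specialized implementation, e.g. based on how many 64-bit units the
 * subtable mask uses:
 *
 *   static dpcls_subtable_lookup_func
 *   dpcls_subtable_choose_lookup(const struct netdev_flow_key *mask)
 *   {
 *       size_t n_units = miniflow_n_values(&mask->mf);
 *
 *       if (n_units <= 2) {
 *           return dpcls_subtable_lookup_small;
 *       }
 *       return dpcls_subtable_lookup_generic;
 *   }
 *
 * Here dpcls_subtable_lookup_small is a made-up name for a specialized
 * variant; dpcls_create_subtable() would call such a helper instead of
 * assigning dpcls_subtable_lookup_generic unconditionally. */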
7796
7797 static inline struct dpcls_subtable *
7798 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
7799 {
7800 struct dpcls_subtable *subtable;
7801
7802 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
7803 &cls->subtables_map) {
7804 if (netdev_flow_key_equal(&subtable->mask, mask)) {
7805 return subtable;
7806 }
7807 }
7808 return dpcls_create_subtable(cls, mask);
7809 }
7810
7811
7812 /* Periodically sort the dpcls subtable vectors according to hit counts. */
7813 static void
7814 dpcls_sort_subtable_vector(struct dpcls *cls)
7815 {
7816 struct pvector *pvec = &cls->subtables;
7817 struct dpcls_subtable *subtable;
7818
7819 PVECTOR_FOR_EACH (subtable, pvec) {
7820 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
7821 subtable->hit_cnt = 0;
7822 }
7823 pvector_publish(pvec);
7824 }
7825
7826 static inline void
7827 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
7828 struct polled_queue *poll_list, int poll_cnt)
7829 {
7830 struct dpcls *cls;
7831 uint64_t tot_idle = 0, tot_proc = 0;
7832 unsigned int pmd_load = 0;
7833
7834 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
7835 uint64_t curr_tsc;
7836 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
7837 if (pmd_alb->is_enabled && !pmd->isolated
7838 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
7839 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
7840 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
7841 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
7842 {
7843 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
7844 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
7845 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
7846 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
7847
7848 if (tot_proc) {
7849 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
7850 }
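/* Editor's note: e.g. with tot_proc = 95000 and tot_idle = 5000 cycles,
 * pmd_load = (95000 * 100) / 100000 = 95, which meets the
 * ALB_PMD_LOAD_THRESHOLD of 95 and counts this interval as overloaded. */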
7851
7852 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
7853 atomic_count_inc(&pmd->pmd_overloaded);
7854 } else {
7855 atomic_count_set(&pmd->pmd_overloaded, 0);
7856 }
7857 }
7858
7859 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
7860 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
7861 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
7862 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
7863
7864 /* Get the cycles that were used to process each queue and store them. */
7865 for (unsigned i = 0; i < poll_cnt; i++) {
7866 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
7867 RXQ_CYCLES_PROC_CURR);
7868 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
7869 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
7870 0);
7871 }
7872 curr_tsc = cycles_counter_update(&pmd->perf_stats);
7873 if (pmd->intrvl_tsc_prev) {
7874 /* There is a previous timestamp; store a new interval cycle count. */
7875 atomic_store_relaxed(&pmd->intrvl_cycles,
7876 curr_tsc - pmd->intrvl_tsc_prev);
7877 }
7878 pmd->intrvl_tsc_prev = curr_tsc;
7879 /* Start a new measuring interval. */
7880 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
7881 }
7882
7883 if (pmd->ctx.now > pmd->next_optimization) {
7884 /* Try to obtain the flow lock to block out revalidator threads.
7885 * If not possible, just try next time. */
7886 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
7887 /* Optimize each classifier. */
7888 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7889 dpcls_sort_subtable_vector(cls);
7890 }
7891 ovs_mutex_unlock(&pmd->flow_mutex);
7892 /* Start a new measuring interval. */
7893 pmd->next_optimization = pmd->ctx.now
7894 + DPCLS_OPTIMIZATION_INTERVAL;
7895 }
7896 }
7897 }
7898
7899 /* Insert 'rule' into 'cls'. */
7900 static void
7901 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
7902 const struct netdev_flow_key *mask)
7903 {
7904 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
7905
7906 /* Refer to subtable's mask, also for later removal. */
7907 rule->mask = &subtable->mask;
7908 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
7909 }
7910
7911 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
7912 static void
7913 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
7914 {
7915 struct dpcls_subtable *subtable;
7916
7917 ovs_assert(rule->mask);
7918
7919 /* Get subtable from reference in rule->mask. */
7920 INIT_CONTAINER(subtable, rule->mask, mask);
7921 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
7922 == 0) {
7923 /* Delete empty subtable. */
7924 dpcls_destroy_subtable(cls, subtable);
7925 pvector_publish(&cls->subtables);
7926 }
7927 }
7928
7929 /* Returns true if 'target' satisfies 'rule': for each 1-bit in the rule's
7930 * mask, the masked bits of 'target' equal the rule's (pre-masked) key. */
7931 static bool
7932 dpcls_rule_matches_key(const struct dpcls_rule *rule,
7933 const struct netdev_flow_key *target)
7934 {
7935 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
7936 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
7937 uint64_t value;
7938
7939 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
7940 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
7941 return false;
7942 }
7943 }
7944 return true;
7945 }
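/* Editor's illustration (hypothetical helper, not used anywhere): the same
 * masked comparison on plain uint64_t arrays, without the miniflow packing:
 *
 *   static bool
 *   masked_values_match(const uint64_t *key, const uint64_t *mask,
 *                       const uint64_t *target, size_t n)
 *   {
 *       for (size_t i = 0; i < n; i++) {
 *           if ((target[i] & mask[i]) != key[i]) {
 *               return false;
 *           }
 *       }
 *       return true;
 *   }
 *
 * dpcls_rule_matches_key() is this loop with the key and mask stored as
 * miniflows; the rule's key values are already masked, which is why the
 * masked target value can be compared against '*keyp' directly. */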
7946
7947 uint32_t
7948 dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
7949 uint32_t keys_map,
7950 const struct netdev_flow_key *keys[],
7951 struct dpcls_rule **rules)
7952 {
7953 int i;
7954 uint32_t found_map;
7955
7956 /* Compute hashes for the remaining keys. Each search-key is
7957 * masked with the subtable's mask to avoid hashing the wildcarded
7958 * bits. */
7959 uint32_t hashes[NETDEV_MAX_BURST];
7960 ULLONG_FOR_EACH_1 (i, keys_map) {
7961 hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
7962 &subtable->mask);
7963 }
7964
7965 /* Lookup. */
7966 const struct cmap_node *nodes[NETDEV_MAX_BURST];
7967 found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
7968
7969 /* Check results. When the i-th bit of found_map is set, it means
7970 * that a set of nodes with a matching hash value was found for the
7971 * i-th search-key. Due to possible hash collisions we need to check
7972 * which of the found rules, if any, really matches our masked
7973 * search-key. */
7974 ULLONG_FOR_EACH_1 (i, found_map) {
7975 struct dpcls_rule *rule;
7976
7977 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
7978 if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
7979 rules[i] = rule;
7980 /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
7981 * within a one-second optimization interval. */
7982 subtable->hit_cnt++;
7983 goto next;
7984 }
7985 }
7986 /* None of the found rules was a match. Reset the i-th bit to
7987 * keep searching this key in the next subtable. */
7988 ULLONG_SET0(found_map, i); /* Did not match. */
7989 next:
7990 ; /* Keep Sparse happy. */
7991 }
7992
7993 return found_map;
7994 }
7995
7996 /* For each miniflow in 'keys', performs a classifier lookup, writing the
7997 * result into the corresponding slot in 'rules'. If a particular entry in
7998 * 'keys' is NULL, it is skipped.
7999 *
8000 * This function is optimized for use in the userspace datapath and therefore
8001 * does not implement a lot of features available in the standard
8002 * classifier_lookup() function. Specifically, it does not implement
8003 * priorities, instead returning any rule which matches the flow.
8004 *
8005 * Returns true if all miniflows found a corresponding rule. */
8006 static bool
8007 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8008 struct dpcls_rule **rules, const size_t cnt,
8009 int *num_lookups_p)
8010 {
8011 /* The received 'cnt' miniflows are the search keys that will be processed
8012 * to find a matching entry in the available subtables.
8013 * The 32-bit maps used below must hold at least NETDEV_MAX_BURST bits. */
8014 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8015 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8016
8017 struct dpcls_subtable *subtable;
8018
8019 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8020
8021 if (cnt != MAP_BITS) {
8022 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8023 }
8024 memset(rules, 0, cnt * sizeof *rules);
8025
8026 int lookups_match = 0, subtable_pos = 1;
8027 uint32_t found_map;
8028
8029 /* The datapath classifier (dpcls) is composed of subtables. Subtables
8030 * are dynamically created as needed when new rules are inserted. Each
8031 * subtable collects rules that match on a specific subset of packet
8032 * fields, as defined by the subtable's mask. Every search key is
8033 * processed against each subtable, but once a match is found for a
8034 * search key, the search for that key can stop because the rules are
8035 * non-overlapping. */
8036 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8037 /* Call the subtable specific lookup function. */
8038 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8039
8040 /* Count the number of subtables searched for this packet match. This
8041 * estimates the "spread" of subtables looked at per matched packet. */
8042 uint32_t pkts_matched = count_1bits(found_map);
8043 lookups_match += pkts_matched * subtable_pos;
8044
8045 /* Clear the keys that were found and return early if all packets matched. */
8046 keys_map &= ~found_map;
8047 if (!keys_map) {
8048 if (num_lookups_p) {
8049 *num_lookups_p = lookups_match;
8050 }
8051 return true;
8052 }
8053 subtable_pos++;
8054 }
8055
8056 if (num_lookups_p) {
8057 *num_lookups_p = lookups_match;
8058 }
8059 return false;
8060 }
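/* Editor's worked example (values are illustrative): for a batch of cnt = 5
 * keys, keys_map starts as 0x1f (five bits set). If the first subtable's
 * lookup_func returns found_map = 0x0a (keys 1 and 3 matched), then
 * keys_map &= ~found_map leaves 0x15, so only keys 0, 2 and 4 are searched
 * in the next subtable, and lookups_match grows by 2 * subtable_pos. */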