1 /*
2 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "dpif-netdev.h"
19 #include "dpif-netdev-private.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <inttypes.h>
25 #include <net/if.h>
26 #include <sys/types.h>
27 #include <netinet/in.h>
28 #include <stdint.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/ioctl.h>
32 #include <sys/socket.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35
36 #include "bitmap.h"
37 #include "cmap.h"
38 #include "conntrack.h"
39 #include "conntrack-tp.h"
40 #include "coverage.h"
41 #include "ct-dpif.h"
42 #include "csum.h"
43 #include "dp-packet.h"
44 #include "dpif.h"
45 #include "dpif-netdev-perf.h"
46 #include "dpif-provider.h"
47 #include "dummy.h"
48 #include "fat-rwlock.h"
49 #include "flow.h"
50 #include "hmapx.h"
51 #include "id-pool.h"
52 #include "ipf.h"
53 #include "netdev.h"
54 #include "netdev-offload.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
57 #include "netlink.h"
58 #include "odp-execute.h"
59 #include "odp-util.h"
60 #include "openvswitch/dynamic-string.h"
61 #include "openvswitch/list.h"
62 #include "openvswitch/match.h"
63 #include "openvswitch/ofp-parse.h"
64 #include "openvswitch/ofp-print.h"
65 #include "openvswitch/ofpbuf.h"
66 #include "openvswitch/shash.h"
67 #include "openvswitch/vlog.h"
68 #include "ovs-numa.h"
69 #include "ovs-rcu.h"
70 #include "packets.h"
71 #include "openvswitch/poll-loop.h"
72 #include "pvector.h"
73 #include "random.h"
74 #include "seq.h"
75 #include "smap.h"
76 #include "sset.h"
77 #include "timeval.h"
78 #include "tnl-neigh-cache.h"
79 #include "tnl-ports.h"
80 #include "unixctl.h"
81 #include "util.h"
82 #include "uuid.h"
83
84 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
85
86 /* Auto Load Balancing Defaults */
87 #define ALB_ACCEPTABLE_IMPROVEMENT 25
88 #define ALB_PMD_LOAD_THRESHOLD 95
89 #define ALB_PMD_REBALANCE_POLL_INTERVAL 1 /* 1 Min */
90 #define MIN_TO_MSEC 60000
91
92 #define FLOW_DUMP_MAX_BATCH 50
93 /* Use per thread recirc_depth to prevent recirculation loop. */
94 #define MAX_RECIRC_DEPTH 6
95 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
96
97 /* Use instant packet send by default. */
98 #define DEFAULT_TX_FLUSH_INTERVAL 0
99
100 /* Configuration parameters. */
101 enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
102 enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
103 enum { N_METER_LOCKS = 64 }; /* Number of locks guarding meters. */
104
105 COVERAGE_DEFINE(datapath_drop_meter);
106 COVERAGE_DEFINE(datapath_drop_upcall_error);
107 COVERAGE_DEFINE(datapath_drop_lock_error);
108 COVERAGE_DEFINE(datapath_drop_userspace_action_error);
109 COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
110 COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
111 COVERAGE_DEFINE(datapath_drop_recirc_error);
112 COVERAGE_DEFINE(datapath_drop_invalid_port);
113 COVERAGE_DEFINE(datapath_drop_invalid_bond);
114 COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
115 COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
116
117 /* Protects against changes to 'dp_netdevs'. */
118 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
119
120 /* Contains all 'struct dp_netdev's. */
121 static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
122 = SHASH_INITIALIZER(&dp_netdevs);
123
124 static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
125
126 #define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
127 | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
128 | CS_SRC_NAT | CS_DST_NAT)
129 #define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
130
131 static struct odp_support dp_netdev_support = {
132 .max_vlan_headers = SIZE_MAX,
133 .max_mpls_depth = SIZE_MAX,
134 .recirc = true,
135 .ct_state = true,
136 .ct_zone = true,
137 .ct_mark = true,
138 .ct_label = true,
139 .ct_state_nat = true,
140 .ct_orig_tuple = true,
141 .ct_orig_tuple6 = true,
142 };
143
144 /* EMC cache and SMC cache compose the datapath flow cache (DFC)
145 *
146 * Exact match cache for frequently used flows
147 *
148 * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
149 * search its entries for a miniflow that matches exactly the miniflow of the
150 * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
151 *
152 * A cache entry holds a reference to its 'dp_netdev_flow'.
153 *
154 * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
155 * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
156 * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
157 * value is the index of a cache entry where the miniflow could be.
158 *
159 *
160 * Signature match cache (SMC)
161 *
162 * This cache stores a 16-bit signature for each flow without storing keys, and
163 * stores the corresponding 16-bit flow_table index to the 'dp_netdev_flow'.
164 * Each flow thus occupies only 32 bits, which is much more memory-efficient
165 * than the EMC. SMC uses a set-associative design in which each bucket
166 * contains SMC_ENTRY_PER_BUCKET entries.
167 * Since a 16-bit flow_table index is used, if there are more than 2^16
168 * dp_netdev_flows, SMC cannot index the extra flows and will miss them.
169 *
170 *
171 * Thread-safety
172 * =============
173 *
174 * Each pmd_thread has its own private exact match cache.
175 * If dp_netdev_input is not called from a pmd thread, a mutex is used.
176 */
177
178 #define EM_FLOW_HASH_SHIFT 13
179 #define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
180 #define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
181 #define EM_FLOW_HASH_SEGS 2
182
183 /* SMC uses a set-associative design. A bucket contains a set of entries that
184 * a flow item can occupy. For now, it uses one hash function rather than two
185 * as for the EMC design. */
186 #define SMC_ENTRY_PER_BUCKET 4
187 #define SMC_ENTRIES (1u << 20)
188 #define SMC_BUCKET_CNT (SMC_ENTRIES / SMC_ENTRY_PER_BUCKET)
189 #define SMC_MASK (SMC_BUCKET_CNT - 1)
190
191 /* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB */
192 #define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
193 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \
194 DEFAULT_EM_FLOW_INSERT_INV_PROB)
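/* Worked example: with the default inverse probability of 100,
 * DEFAULT_EM_FLOW_INSERT_MIN is UINT32_MAX / 100, roughly 42.9 million, which
 * is about 1% of the 32-bit range.  Comparing a uniformly distributed 32-bit
 * random value against this threshold therefore admits roughly one in every
 * hundred candidate flows into the EMC (the comparison itself is done on the
 * EMC insertion path). */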
195
196 struct emc_entry {
197 struct dp_netdev_flow *flow;
198 struct netdev_flow_key key; /* key.hash used for emc hash value. */
199 };
200
201 struct emc_cache {
202 struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
203 int sweep_idx; /* For emc_cache_slow_sweep(). */
204 };
205
206 struct smc_bucket {
207 uint16_t sig[SMC_ENTRY_PER_BUCKET];
208 uint16_t flow_idx[SMC_ENTRY_PER_BUCKET];
209 };
210
211 /* Signature match cache, as distinct from the EMC cache. */
212 struct smc_cache {
213 struct smc_bucket buckets[SMC_BUCKET_CNT];
214 };
215
216 struct dfc_cache {
217 struct emc_cache emc_cache;
218 struct smc_cache smc_cache;
219 };
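/* Size sketch derived from the definitions above: each smc_bucket holds
 * SMC_ENTRY_PER_BUCKET (4) sig/flow_idx pairs of 2 bytes each, i.e. 16 bytes,
 * and SMC_BUCKET_CNT is (1 << 20) / 4 = 262144 buckets, so one smc_cache
 * occupies 4 MiB per PMD thread.  The emc_cache adds EM_FLOW_HASH_ENTRIES
 * (8192) emc_entry slots on top of that. */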
220
221 /* Iterate in the exact match cache through every entry that might contain a
222 * miniflow with hash 'HASH'. */
223 #define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
224 for (uint32_t i__ = 0, srch_hash__ = (HASH); \
225 (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
226 i__ < EM_FLOW_HASH_SEGS; \
227 i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
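/* Worked example of the hash segmentation used above, with EM_FLOW_HASH_SHIFT
 * of 13 and EM_FLOW_HASH_SEGS of 2: a packet hash of 0x0035a9c2 yields the
 * candidate entry indexes 0x09c2 (bits 0..12) and 0x01ad (bits 13..25).  A
 * minimal lookup sketch using the macro, assuming 'cache' is a
 * 'struct emc_cache *' and 'key' a 'struct netdev_flow_key *'
 * (keys_are_equal() is a hypothetical helper, not part of this file):
 *
 *     struct emc_entry *entry;
 *
 *     EMC_FOR_EACH_POS_WITH_HASH (cache, entry, key->hash) {
 *         if (entry->key.hash == key->hash && emc_entry_alive(entry)
 *             && keys_are_equal(&entry->key, key)) {
 *             return entry->flow;
 *         }
 *     }
 *     return NULL;
 */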
228 \f
229 /* Simple non-wildcarding single-priority classifier. */
230
231 /* Time in microseconds between successive optimizations of the dpcls
232 * subtable vector */
233 #define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
234
235 /* Time in microseconds of the interval in which rxq processing cycles used
236 * in rxq to pmd assignments are measured and stored. */
237 #define PMD_RXQ_INTERVAL_LEN 10000000LL
238
239 /* Number of intervals for which cycles are stored
240 * and used during rxq to pmd assignment. */
241 #define PMD_RXQ_INTERVAL_MAX 6
242
243 /* Time in microseconds to try RCU quiescing. */
244 #define PMD_RCU_QUIESCE_INTERVAL 10000LL
245
246 struct dpcls {
247 struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */
248 odp_port_t in_port;
249 struct cmap subtables_map;
250 struct pvector subtables;
251 };
252
253 /* Data structure to keep packet order till fastpath processing. */
254 struct dp_packet_flow_map {
255 struct dp_packet *packet;
256 struct dp_netdev_flow *flow;
257 uint16_t tcp_flags;
258 };
259
260 static void dpcls_init(struct dpcls *);
261 static void dpcls_destroy(struct dpcls *);
262 static void dpcls_sort_subtable_vector(struct dpcls *);
263 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
264 const struct netdev_flow_key *mask);
265 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
266 static bool dpcls_lookup(struct dpcls *cls,
267 const struct netdev_flow_key *keys[],
268 struct dpcls_rule **rules, size_t cnt,
269 int *num_lookups_p);
270
271 /* Set of supported meter flags */
272 #define DP_SUPPORTED_METER_FLAGS_MASK \
273 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
274
275 /* Set of supported meter band types */
276 #define DP_SUPPORTED_METER_BAND_TYPES \
277 ( 1 << OFPMBT13_DROP )
278
279 struct dp_meter_band {
280 struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
281 uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
282 uint64_t packet_count;
283 uint64_t byte_count;
284 };
285
286 struct dp_meter {
287 uint16_t flags;
288 uint16_t n_bands;
289 uint32_t max_delta_t;
290 uint64_t used;
291 uint64_t packet_count;
292 uint64_t byte_count;
293 struct dp_meter_band bands[];
294 };
295
296 struct pmd_auto_lb {
297 bool auto_lb_requested; /* Auto load balancing requested by user. */
298 bool is_enabled; /* Current status of Auto load balancing. */
299 uint64_t rebalance_intvl;
300 uint64_t rebalance_poll_timer;
301 };
302
303 /* Datapath based on the network device interface from netdev.h.
304 *
305 *
306 * Thread-safety
307 * =============
308 *
309 * Some members, marked 'const', are immutable. Accessing other members
310 * requires synchronization, as noted in more detail below.
311 *
312 * Acquisition order is, from outermost to innermost:
313 *
314 * dp_netdev_mutex (global)
315 * port_mutex
316 * bond_mutex
317 * non_pmd_mutex
318 */
319 struct dp_netdev {
320 const struct dpif_class *const class;
321 const char *const name;
322 struct ovs_refcount ref_cnt;
323 atomic_flag destroyed;
324
325 /* Ports.
326 *
327 * Any lookup into 'ports' or any access to the dp_netdev_ports found
328 * through 'ports' requires taking 'port_mutex'. */
329 struct ovs_mutex port_mutex;
330 struct hmap ports;
331 struct seq *port_seq; /* Incremented whenever a port changes. */
332
333 /* The time that a packet can wait in output batch for sending. */
334 atomic_uint32_t tx_flush_interval;
335
336 /* Meters. */
337 struct ovs_mutex meter_locks[N_METER_LOCKS];
338 struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
339
340 /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
341 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
342 /* Enable collection of PMD performance metrics. */
343 atomic_bool pmd_perf_metrics;
344 /* Enable the SMC cache from ovsdb config */
345 atomic_bool smc_enable_db;
346
347 /* Protects access to ofproto-dpif-upcall interface during revalidator
348 * thread synchronization. */
349 struct fat_rwlock upcall_rwlock;
350 upcall_callback *upcall_cb; /* Callback function for executing upcalls. */
351 void *upcall_aux;
352
353 /* Callback function for notifying the purging of dp flows (during
354 * pmd reset or deletion). */
355 dp_purge_callback *dp_purge_cb;
356 void *dp_purge_aux;
357
358 /* Stores all 'struct dp_netdev_pmd_thread's. */
359 struct cmap poll_threads;
360 /* id pool for per thread static_tx_qid. */
361 struct id_pool *tx_qid_pool;
362 struct ovs_mutex tx_qid_pool_mutex;
363 /* Use measured cycles for rxq to pmd assignment. */
364 bool pmd_rxq_assign_cyc;
365
366 /* Protects the access of the 'struct dp_netdev_pmd_thread'
367 * instance for non-pmd thread. */
368 struct ovs_mutex non_pmd_mutex;
369
370 /* Each pmd thread will store its pointer to
371 * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
372 ovsthread_key_t per_pmd_key;
373
374 struct seq *reconfigure_seq;
375 uint64_t last_reconfigure_seq;
376
377 /* Cpu mask for pin of pmd threads. */
378 char *pmd_cmask;
379
380 uint64_t last_tnl_conf_seq;
381
382 struct conntrack *conntrack;
383 struct pmd_auto_lb pmd_alb;
384
385 /* Bonds. */
386 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
387 struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
388 };
389
390 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
391 OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
392 {
393 ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
394 }
395
396 static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
397 OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
398 {
399 ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
400 }
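/* A minimal usage sketch for the two helpers above (the body is illustrative;
 * the real callers appear further down in this file):
 *
 *     meter_lock(dp, meter_id);
 *     if (dp->meters[meter_id]) {
 *         ...read or update the meter's counters...
 *     }
 *     meter_unlock(dp, meter_id);
 *
 * Note that meter ids share locks (meter_id % N_METER_LOCKS), so holding the
 * lock for one meter id may also serialize access to other ids that hash to
 * the same lock. */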
401
402
403 static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
404 odp_port_t)
405 OVS_REQUIRES(dp->port_mutex);
406
407 enum rxq_cycles_counter_type {
408 RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
409 processing packets during the current
410 interval. */
411 RXQ_CYCLES_PROC_HIST, /* Total cycles of all intervals that are used
412 during rxq to pmd assignment. */
413 RXQ_N_CYCLES
414 };
415
416 enum {
417 DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
418 DP_NETDEV_FLOW_OFFLOAD_OP_MOD,
419 DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
420 };
421
422 struct dp_flow_offload_item {
423 struct dp_netdev_pmd_thread *pmd;
424 struct dp_netdev_flow *flow;
425 int op;
426 struct match match;
427 struct nlattr *actions;
428 size_t actions_len;
429
430 struct ovs_list node;
431 };
432
433 struct dp_flow_offload {
434 struct ovs_mutex mutex;
435 struct ovs_list list;
436 pthread_cond_t cond;
437 };
438
439 static struct dp_flow_offload dp_flow_offload = {
440 .mutex = OVS_MUTEX_INITIALIZER,
441 .list = OVS_LIST_INITIALIZER(&dp_flow_offload.list),
442 };
443
444 static struct ovsthread_once offload_thread_once
445 = OVSTHREAD_ONCE_INITIALIZER;
446
447 #define XPS_TIMEOUT 500000LL /* In microseconds. */
448
449 /* Contained by struct dp_netdev_port's 'rxqs' member. */
450 struct dp_netdev_rxq {
451 struct dp_netdev_port *port;
452 struct netdev_rxq *rx;
453 unsigned core_id; /* Core to which this queue should be
454 pinned. OVS_CORE_UNSPEC if the
455 queue doesn't need to be pinned to a
456 particular core. */
457 unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */
458 struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */
459 bool is_vhost; /* Is rxq of a vhost port. */
460
461 /* Counters of cycles spent successfully polling and processing pkts. */
462 atomic_ullong cycles[RXQ_N_CYCLES];
463 /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
464 sum them to yield the cycles used for an rxq. */
465 atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
466 };
467
468 /* A port in a netdev-based datapath. */
469 struct dp_netdev_port {
470 odp_port_t port_no;
471 bool dynamic_txqs; /* If true XPS will be used. */
472 bool need_reconfigure; /* True if we should reconfigure netdev. */
473 struct netdev *netdev;
474 struct hmap_node node; /* Node in dp_netdev's 'ports'. */
475 struct netdev_saved_flags *sf;
476 struct dp_netdev_rxq *rxqs;
477 unsigned n_rxq; /* Number of elements in 'rxqs' */
478 unsigned *txq_used; /* Number of threads that use each tx queue. */
479 struct ovs_mutex txq_used_mutex;
480 bool emc_enabled; /* If true EMC will be used. */
481 char *type; /* Port type as requested by user. */
482 char *rxq_affinity_list; /* Requested affinity of rx queues. */
483 };
484
485 /* Contained by struct dp_netdev_flow's 'stats' member. */
486 struct dp_netdev_flow_stats {
487 atomic_llong used; /* Last used time, in monotonic msecs. */
488 atomic_ullong packet_count; /* Number of packets matched. */
489 atomic_ullong byte_count; /* Number of bytes matched. */
490 atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */
491 };
492
493 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
494 *
495 *
496 * Thread-safety
497 * =============
498 *
499 * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
500 * its pmd thread's classifier. The text below calls this classifier 'cls'.
501 *
502 * Motivation
503 * ----------
504 *
505 * The thread safety rules described here for "struct dp_netdev_flow" are
506 * motivated by two goals:
507 *
508 * - Prevent threads that read members of "struct dp_netdev_flow" from
509 * reading bad data due to changes by some thread concurrently modifying
510 * those members.
511 *
512 * - Prevent two threads making changes to members of a given "struct
513 * dp_netdev_flow" from interfering with each other.
514 *
515 *
516 * Rules
517 * -----
518 *
519 * A flow 'flow' may be accessed without a risk of being freed during an RCU
520 * grace period. Code that needs to hold onto a flow for a while
521 * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
522 *
523 * 'flow->ref_cnt' protects 'flow' from being freed. It doesn't protect the
524 * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
525 * from modification.
526 *
527 * Some members, marked 'const', are immutable. Accessing other members
528 * requires synchronization, as noted in more detail below.
529 */
530 struct dp_netdev_flow {
531 const struct flow flow; /* Unmasked flow that created this entry. */
532 /* Hash table index by unmasked flow. */
533 const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
534 /* 'flow_table'. */
535 const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
536 const ovs_u128 ufid; /* Unique flow identifier. */
537 const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
538 const unsigned pmd_id; /* The 'core_id' of pmd thread owning this */
539 /* flow. */
540
541 /* Number of references.
542 * The classifier owns one reference.
543 * Any thread trying to keep a rule from being freed should hold its own
544 * reference. */
545 struct ovs_refcount ref_cnt;
546
547 bool dead;
548 uint32_t mark; /* Unique flow mark assigned to a flow */
549
550 /* Statistics. */
551 struct dp_netdev_flow_stats stats;
552
553 /* Actions. */
554 OVSRCU_TYPE(struct dp_netdev_actions *) actions;
555
556 /* While processing a group of input packets, the datapath uses the next
557 * member to store a pointer to the output batch for the flow. It is
558 * reset after the batch has been sent out (See dp_netdev_queue_batches(),
559 * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
560 struct packet_batch_per_flow *batch;
561
562 /* Packet classification. */
563 char *dp_extra_info; /* String to return in a flow dump/get. */
564 struct dpcls_rule cr; /* In owning dp_netdev's 'cls'. */
565 /* 'cr' must be the last member. */
566 };
567
568 static void dp_netdev_flow_unref(struct dp_netdev_flow *);
569 static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
570 static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
571 struct flow *, bool);
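/* A minimal sketch of the reference-counting rules documented above
 * (hypothetical caller; the RCU-protected lookup itself is elided):
 *
 *     struct dp_netdev_flow *flow = ...looked up under RCU...;
 *
 *     if (flow && dp_netdev_flow_ref(flow)) {
 *         ...'flow' may now be used past the current RCU grace period...
 *         dp_netdev_flow_unref(flow);
 *     }
 */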
572
573 /* A set of datapath actions within a "struct dp_netdev_flow".
574 *
575 *
576 * Thread-safety
577 * =============
578 *
579 * A struct dp_netdev_actions 'actions' is protected with RCU. */
580 struct dp_netdev_actions {
581 /* These members are immutable: they do not change during the struct's
582 * lifetime. */
583 unsigned int size; /* Size of 'actions', in bytes. */
584 struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */
585 };
586
587 struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
588 size_t);
589 struct dp_netdev_actions *dp_netdev_flow_get_actions(
590 const struct dp_netdev_flow *);
591 static void dp_netdev_actions_free(struct dp_netdev_actions *);
592
593 struct polled_queue {
594 struct dp_netdev_rxq *rxq;
595 odp_port_t port_no;
596 bool emc_enabled;
597 bool rxq_enabled;
598 uint64_t change_seq;
599 };
600
601 /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
602 struct rxq_poll {
603 struct dp_netdev_rxq *rxq;
604 struct hmap_node node;
605 };
606
607 /* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
608 * 'tnl_port_cache' or 'tx_ports'. */
609 struct tx_port {
610 struct dp_netdev_port *port;
611 int qid;
612 long long last_used;
613 struct hmap_node node;
614 long long flush_time;
615 struct dp_packet_batch output_pkts;
616 struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
617 };
618
619 /* Contained by struct tx_bond 'slave_buckets'. */
620 struct slave_entry {
621 odp_port_t slave_id;
622 atomic_ullong n_packets;
623 atomic_ullong n_bytes;
624 };
625
626 /* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
627 struct tx_bond {
628 struct cmap_node node;
629 uint32_t bond_id;
630 struct slave_entry slave_buckets[BOND_BUCKETS];
631 };
632
633 /* A set of properties for the current processing loop that is not directly
634 * associated with the pmd thread itself, but with the packets being
635 * processed or the short-term system configuration (for example, time).
636 * Contained by struct dp_netdev_pmd_thread's 'ctx' member. */
637 struct dp_netdev_pmd_thread_ctx {
638 /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
639 long long now;
640 /* RX queue from which last packet was received. */
641 struct dp_netdev_rxq *last_rxq;
642 /* EMC insertion probability context for the current processing cycle. */
643 uint32_t emc_insert_min;
644 };
645
646 /* PMD: Poll mode drivers. A PMD accesses devices via polling to eliminate
647  * the performance overhead of interrupt processing. Therefore netdev cannot
648  * implement rx-wait for these devices. dpif-netdev needs to poll
649  * these devices to check their receive buffers, and each pmd thread polls
650  * the devices assigned to it.
651  *
652  * DPDK uses PMDs for accessing NICs.
653  *
654  * Note, the instance with cpu core id NON_PMD_CORE_ID is reserved for
655  * I/O of all non-pmd threads. No actual thread is created
656  * for that instance.
657  *
658  * Each struct has its own flow cache and one classifier per managed ingress
659  * port. For packets received on an ingress port, a lookup is first done in
660  * the corresponding PMD thread's flow cache and, in case of a miss, then in
661  * the port's classifier. Packets are executed with the found
662  * actions in either case.
663  */
664 struct dp_netdev_pmd_thread {
665 struct dp_netdev *dp;
666 struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
667 struct cmap_node node; /* In 'dp->poll_threads'. */
668
669 /* Per thread exact-match cache. Note, the instance for cpu core
670  * NON_PMD_CORE_ID can be accessed by multiple threads, and thus needs
671  * to be protected by 'non_pmd_mutex'. Every other instance
672 * will only be accessed by its own pmd thread. */
673 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct dfc_cache flow_cache;
674
675 /* Flow-Table and classifiers
676 *
677 * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
678 * changes to 'classifiers' must be made while still holding the
679 * 'flow_mutex'.
680 */
681 struct ovs_mutex flow_mutex;
682 struct cmap flow_table OVS_GUARDED; /* Flow table. */
683
684 /* One classifier per in_port polled by the pmd */
685 struct cmap classifiers;
686 /* Periodically sort subtable vectors according to hit frequencies */
687 long long int next_optimization;
688 /* End of the next time interval for which processing cycles
689 are stored for each polled rxq. */
690 long long int rxq_next_cycle_store;
691
692 /* Last interval timestamp. */
693 uint64_t intrvl_tsc_prev;
694 /* Last interval cycles. */
695 atomic_ullong intrvl_cycles;
696
697 /* Current context of the PMD thread. */
698 struct dp_netdev_pmd_thread_ctx ctx;
699
700 struct seq *reload_seq;
701 uint64_t last_reload_seq;
702
703 /* These are atomic variables used as a synchronization and configuration
704 * points for thread reload/exit.
705 *
706 * 'reload' atomic is the main one and it's used as a memory
707 * synchronization point for all other knobs and data.
708 *
709 * For a thread that requests PMD reload:
710 *
711 * * All changes that should be visible to the PMD thread must be made
712 * before setting the 'reload'. These changes could use any memory
713 * ordering model including 'relaxed'.
714 * * Setting the 'reload' atomic should occur in the same thread where
715 * all other PMD configuration options are updated.
716 * * Setting the 'reload' atomic should be done with 'release' memory
717 * ordering model or stricter. This will guarantee that all previous
718 * changes (including non-atomic and 'relaxed') will be visible to
719 * the PMD thread.
720 * To check that the reload is done, the thread should poll the 'reload'
721 * atomic until it becomes 'false'. Polling should use the 'acquire' memory
722 * ordering model or stricter. This ensures that the PMD thread has completed
723 * the reload process.
724 *
725 * For the PMD thread:
726 *
727 * * PMD thread should read 'reload' atomic with 'acquire' memory
728 * ordering model or stricter. This will guarantee that all changes
729 * made before setting the 'reload' in the requesting thread will be
730 * visible to the PMD thread.
731 * * All other configuration data could be read with any memory
732 * ordering model (including non-atomic and 'relaxed') but *only after*
733 * reading the 'reload' atomic set to 'true'.
734 * When the PMD reload is done, the PMD should (optionally) set all the below
735 * knobs except the 'reload' to their default ('false') values and,
736 * as the last mandatory step, set the 'reload' to 'false' using the
737 * 'release' memory ordering model or stricter. This will inform the
738 * requesting thread that the PMD has completed a reload cycle.
739 * (A short usage sketch of this protocol follows this structure.) */
740 atomic_bool reload; /* Do we need to reload ports? */
741 atomic_bool wait_for_reload; /* Can we busy wait for the next reload? */
742 atomic_bool reload_tx_qid; /* Do we need to reload static_tx_qid? */
743 atomic_bool exit; /* For terminating the pmd thread. */
744
745 pthread_t thread;
746 unsigned core_id; /* CPU core id of this pmd thread. */
747 int numa_id; /* numa node id of this pmd thread. */
748 bool isolated;
749
750 /* Queue id used by this pmd thread to send packets on all netdevs if
751 * XPS is disabled for this netdev. All static_tx_qid's are unique and less
752 * than 'cmap_count(dp->poll_threads)'. */
753 uint32_t static_tx_qid;
754
755 /* Number of filled output batches. */
756 int n_output_batches;
757
758 struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
759 /* List of rx queues to poll. */
760 struct hmap poll_list OVS_GUARDED;
761 /* Map of 'tx_port's used for transmission. Written by the main thread,
762 * read by the pmd thread. */
763 struct hmap tx_ports OVS_GUARDED;
764
765 struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
766 /* Map of 'tx_bond's used for transmission. Written by the main thread
767 * and read by the pmd thread. */
768 struct cmap tx_bonds;
769
770 /* These are thread-local copies of 'tx_ports'. One contains only tunnel
771 * ports (that support push_tunnel/pop_tunnel), the other contains ports
772 * with at least one txq (that support send). A port can be in both.
773 *
774 * There are two separate maps to make sure that we don't try to execute
775 * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
776 *
777 * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
778 * threads, and thus need to be protected by 'non_pmd_mutex'. Every
779 * other instance will only be accessed by its own pmd thread. */
780 struct hmap tnl_port_cache;
781 struct hmap send_port_cache;
782
783 /* Keep track of detailed PMD performance statistics. */
784 struct pmd_perf_stats perf_stats;
785
786 /* Stats from previous iteration used by automatic pmd
787 * load balance logic. */
788 uint64_t prev_stats[PMD_N_STATS];
789 atomic_count pmd_overloaded;
790
791 /* Set to true if the pmd thread needs to be reloaded. */
792 bool need_reload;
793
794 /* Next time when PMD should try RCU quiescing. */
795 long long next_rcu_quiesce;
796 };
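/* A condensed sketch of the reload protocol documented inside
 * 'struct dp_netdev_pmd_thread' above.  This is a simplification; the real
 * reload paths in this file carry more state ('wait_for_reload',
 * 'reload_tx_qid', etc.):
 *
 *     Requesting thread:
 *
 *         ...update the PMD configuration (any memory ordering)...
 *         atomic_store_explicit(&pmd->reload, true, memory_order_release);
 *         bool reloading;
 *         do {
 *             atomic_read_explicit(&pmd->reload, &reloading,
 *                                  memory_order_acquire);
 *         } while (reloading);
 *
 *     PMD thread:
 *
 *         bool reload;
 *         atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
 *         if (reload) {
 *             ...re-read configuration, reload ports...
 *             atomic_store_explicit(&pmd->reload, false,
 *                                   memory_order_release);
 *         }
 */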
797
798 /* Interface to netdev-based datapath. */
799 struct dpif_netdev {
800 struct dpif dpif;
801 struct dp_netdev *dp;
802 uint64_t last_port_seq;
803 };
804
805 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
806 struct dp_netdev_port **portp)
807 OVS_REQUIRES(dp->port_mutex);
808 static int get_port_by_name(struct dp_netdev *dp, const char *devname,
809 struct dp_netdev_port **portp)
810 OVS_REQUIRES(dp->port_mutex);
811 static void dp_netdev_free(struct dp_netdev *)
812 OVS_REQUIRES(dp_netdev_mutex);
813 static int do_add_port(struct dp_netdev *dp, const char *devname,
814 const char *type, odp_port_t port_no)
815 OVS_REQUIRES(dp->port_mutex);
816 static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
817 OVS_REQUIRES(dp->port_mutex);
818 static int dpif_netdev_open(const struct dpif_class *, const char *name,
819 bool create, struct dpif **);
820 static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
821 struct dp_packet_batch *,
822 bool should_steal,
823 const struct flow *flow,
824 const struct nlattr *actions,
825 size_t actions_len);
826 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
827 struct dp_packet_batch *, odp_port_t port_no);
828 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
829 struct dp_packet_batch *);
830
831 static void dp_netdev_disable_upcall(struct dp_netdev *);
832 static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
833 static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
834 struct dp_netdev *dp, unsigned core_id,
835 int numa_id);
836 static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
837 static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
838 OVS_REQUIRES(dp->port_mutex);
839
840 static void *pmd_thread_main(void *);
841 static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
842 unsigned core_id);
843 static struct dp_netdev_pmd_thread *
844 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
845 static void dp_netdev_del_pmd(struct dp_netdev *dp,
846 struct dp_netdev_pmd_thread *pmd);
847 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
848 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
849 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
850 struct dp_netdev_port *port)
851 OVS_REQUIRES(pmd->port_mutex);
852 static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
853 struct tx_port *tx)
854 OVS_REQUIRES(pmd->port_mutex);
855 static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
856 struct dp_netdev_rxq *rxq)
857 OVS_REQUIRES(pmd->port_mutex);
858 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
859 struct rxq_poll *poll)
860 OVS_REQUIRES(pmd->port_mutex);
861 static int
862 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
863 bool force);
864 static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
865 struct tx_bond *bond, bool update)
866 OVS_EXCLUDED(pmd->bond_mutex);
867 static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
868 uint32_t bond_id)
869 OVS_EXCLUDED(pmd->bond_mutex);
870
871 static void reconfigure_datapath(struct dp_netdev *dp)
872 OVS_REQUIRES(dp->port_mutex);
873 static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
874 static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
875 static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
876 static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
877 OVS_REQUIRES(pmd->port_mutex);
878 static inline void
879 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
880 struct polled_queue *poll_list, int poll_cnt);
881 static void
882 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
883 enum rxq_cycles_counter_type type,
884 unsigned long long cycles);
885 static uint64_t
886 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
887 enum rxq_cycles_counter_type type);
888 static void
889 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
890 unsigned long long cycles);
891 static uint64_t
892 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
893 static void
894 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
895 bool purge);
896 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
897 struct tx_port *tx);
898
899 static inline bool emc_entry_alive(struct emc_entry *ce);
900 static void emc_clear_entry(struct emc_entry *ce);
901 static void smc_clear_entry(struct smc_bucket *b, int idx);
902
903 static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
904 static inline bool
905 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
906 static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
907 struct dp_netdev_flow *flow);
908
909 static void
910 emc_cache_init(struct emc_cache *flow_cache)
911 {
912 int i;
913
914 flow_cache->sweep_idx = 0;
915 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
916 flow_cache->entries[i].flow = NULL;
917 flow_cache->entries[i].key.hash = 0;
918 flow_cache->entries[i].key.len = sizeof(struct miniflow);
919 flowmap_init(&flow_cache->entries[i].key.mf.map);
920 }
921 }
922
923 static void
924 smc_cache_init(struct smc_cache *smc_cache)
925 {
926 int i, j;
927 for (i = 0; i < SMC_BUCKET_CNT; i++) {
928 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
929 smc_cache->buckets[i].flow_idx[j] = UINT16_MAX;
930 }
931 }
932 }
933
934 static void
935 dfc_cache_init(struct dfc_cache *flow_cache)
936 {
937 emc_cache_init(&flow_cache->emc_cache);
938 smc_cache_init(&flow_cache->smc_cache);
939 }
940
941 static void
942 emc_cache_uninit(struct emc_cache *flow_cache)
943 {
944 int i;
945
946 for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
947 emc_clear_entry(&flow_cache->entries[i]);
948 }
949 }
950
951 static void
952 smc_cache_uninit(struct smc_cache *smc)
953 {
954 int i, j;
955
956 for (i = 0; i < SMC_BUCKET_CNT; i++) {
957 for (j = 0; j < SMC_ENTRY_PER_BUCKET; j++) {
958 smc_clear_entry(&(smc->buckets[i]), j);
959 }
960 }
961 }
962
963 static void
964 dfc_cache_uninit(struct dfc_cache *flow_cache)
965 {
966 smc_cache_uninit(&flow_cache->smc_cache);
967 emc_cache_uninit(&flow_cache->emc_cache);
968 }
969
970 /* Check and clear dead flow references slowly (one entry at each
971 * invocation). */
972 static void
973 emc_cache_slow_sweep(struct emc_cache *flow_cache)
974 {
975 struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
976
977 if (!emc_entry_alive(entry)) {
978 emc_clear_entry(entry);
979 }
980 flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
981 }
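/* With EM_FLOW_HASH_ENTRIES equal to 8192, a full sweep of the EMC therefore
 * takes 8192 calls to emc_cache_slow_sweep(). */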
982
983 /* Updates the time in the PMD thread's context and should be called in three cases:
984 *
985 * 1. PMD structure initialization:
986 * - dp_netdev_configure_pmd()
987 *
988 * 2. Before processing of the new packet batch:
989 * - dpif_netdev_execute()
990 * - dp_netdev_process_rxq_port()
991 *
992 * 3. At least once per polling iteration in main polling threads if no
993 * packets received on current iteration:
994 * - dpif_netdev_run()
995 * - pmd_thread_main()
996 *
997 * 'pmd->ctx.now' should be used without update in all other cases if possible.
998 */
999 static inline void
1000 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
1001 {
1002 pmd->ctx.now = time_usec();
1003 }
1004
1005 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
1006 bool
1007 dpif_is_netdev(const struct dpif *dpif)
1008 {
1009 return dpif->dpif_class->open == dpif_netdev_open;
1010 }
1011
1012 static struct dpif_netdev *
1013 dpif_netdev_cast(const struct dpif *dpif)
1014 {
1015 ovs_assert(dpif_is_netdev(dpif));
1016 return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
1017 }
1018
1019 static struct dp_netdev *
1020 get_dp_netdev(const struct dpif *dpif)
1021 {
1022 return dpif_netdev_cast(dpif)->dp;
1023 }
1024 \f
1025 enum pmd_info_type {
1026 PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */
1027 PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
1028 PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */
1029 PMD_INFO_PERF_SHOW, /* Show pmd performance details. */
1030 };
1031
1032 static void
1033 format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1034 {
1035 ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
1036 ? "main thread" : "pmd thread");
1037 if (pmd->numa_id != OVS_NUMA_UNSPEC) {
1038 ds_put_format(reply, " numa_id %d", pmd->numa_id);
1039 }
1040 if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
1041 ds_put_format(reply, " core_id %u", pmd->core_id);
1042 }
1043 ds_put_cstr(reply, ":\n");
1044 }
1045
1046 static void
1047 pmd_info_show_stats(struct ds *reply,
1048 struct dp_netdev_pmd_thread *pmd)
1049 {
1050 uint64_t stats[PMD_N_STATS];
1051 uint64_t total_cycles, total_packets;
1052 double passes_per_pkt = 0;
1053 double lookups_per_hit = 0;
1054 double packets_per_batch = 0;
1055
1056 pmd_perf_read_counters(&pmd->perf_stats, stats);
1057 total_cycles = stats[PMD_CYCLES_ITER_IDLE]
1058 + stats[PMD_CYCLES_ITER_BUSY];
1059 total_packets = stats[PMD_STAT_RECV];
1060
1061 format_pmd_thread(reply, pmd);
1062
1063 if (total_packets > 0) {
1064 passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
1065 / (double) total_packets;
1066 }
1067 if (stats[PMD_STAT_MASKED_HIT] > 0) {
1068 lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
1069 / (double) stats[PMD_STAT_MASKED_HIT];
1070 }
1071 if (stats[PMD_STAT_SENT_BATCHES] > 0) {
1072 packets_per_batch = stats[PMD_STAT_SENT_PKTS]
1073 / (double) stats[PMD_STAT_SENT_BATCHES];
1074 }
1075
1076 ds_put_format(reply,
1077 " packets received: %"PRIu64"\n"
1078 " packet recirculations: %"PRIu64"\n"
1079 " avg. datapath passes per packet: %.02f\n"
1080 " emc hits: %"PRIu64"\n"
1081 " smc hits: %"PRIu64"\n"
1082 " megaflow hits: %"PRIu64"\n"
1083 " avg. subtable lookups per megaflow hit: %.02f\n"
1084 " miss with success upcall: %"PRIu64"\n"
1085 " miss with failed upcall: %"PRIu64"\n"
1086 " avg. packets per output batch: %.02f\n",
1087 total_packets, stats[PMD_STAT_RECIRC],
1088 passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
1089 stats[PMD_STAT_SMC_HIT],
1090 stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
1091 stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
1092 packets_per_batch);
1093
1094 if (total_cycles == 0) {
1095 return;
1096 }
1097
1098 ds_put_format(reply,
1099 " idle cycles: %"PRIu64" (%.02f%%)\n"
1100 " processing cycles: %"PRIu64" (%.02f%%)\n",
1101 stats[PMD_CYCLES_ITER_IDLE],
1102 stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
1103 stats[PMD_CYCLES_ITER_BUSY],
1104 stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
1105
1106 if (total_packets == 0) {
1107 return;
1108 }
1109
1110 ds_put_format(reply,
1111 " avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
1112 total_cycles / (double) total_packets,
1113 total_cycles, total_packets);
1114
1115 ds_put_format(reply,
1116 " avg processing cycles per packet: "
1117 "%.02f (%"PRIu64"/%"PRIu64")\n",
1118 stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
1119 stats[PMD_CYCLES_ITER_BUSY], total_packets);
1120 }
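/* Example of the derived values above: 1000 received packets with 250
 * recirculations yield 1.25 datapath passes per packet, and 500 packets sent
 * in 100 batches yield 5.00 packets per output batch. */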
1121
1122 static void
1123 pmd_info_show_perf(struct ds *reply,
1124 struct dp_netdev_pmd_thread *pmd,
1125 struct pmd_perf_params *par)
1126 {
1127 if (pmd->core_id != NON_PMD_CORE_ID) {
1128 char *time_str =
1129 xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
1130 long long now = time_msec();
1131 double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
1132
1133 ds_put_cstr(reply, "\n");
1134 ds_put_format(reply, "Time: %s\n", time_str);
1135 ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
1136 ds_put_cstr(reply, "\n");
1137 format_pmd_thread(reply, pmd);
1138 ds_put_cstr(reply, "\n");
1139 pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
1140 if (pmd_perf_metrics_enabled(pmd)) {
1141 /* Prevent parallel clearing of perf metrics. */
1142 ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
1143 if (par->histograms) {
1144 ds_put_cstr(reply, "\n");
1145 pmd_perf_format_histograms(reply, &pmd->perf_stats);
1146 }
1147 if (par->iter_hist_len > 0) {
1148 ds_put_cstr(reply, "\n");
1149 pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
1150 par->iter_hist_len);
1151 }
1152 if (par->ms_hist_len > 0) {
1153 ds_put_cstr(reply, "\n");
1154 pmd_perf_format_ms_history(reply, &pmd->perf_stats,
1155 par->ms_hist_len);
1156 }
1157 ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
1158 }
1159 free(time_str);
1160 }
1161 }
1162
1163 static int
1164 compare_poll_list(const void *a_, const void *b_)
1165 {
1166 const struct rxq_poll *a = a_;
1167 const struct rxq_poll *b = b_;
1168
1169 const char *namea = netdev_rxq_get_name(a->rxq->rx);
1170 const char *nameb = netdev_rxq_get_name(b->rxq->rx);
1171
1172 int cmp = strcmp(namea, nameb);
1173 if (!cmp) {
1174 return netdev_rxq_get_queue_id(a->rxq->rx)
1175 - netdev_rxq_get_queue_id(b->rxq->rx);
1176 } else {
1177 return cmp;
1178 }
1179 }
1180
1181 static void
1182 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
1183 size_t *n)
1184 OVS_REQUIRES(pmd->port_mutex)
1185 {
1186 struct rxq_poll *ret, *poll;
1187 size_t i;
1188
1189 *n = hmap_count(&pmd->poll_list);
1190 if (!*n) {
1191 ret = NULL;
1192 } else {
1193 ret = xcalloc(*n, sizeof *ret);
1194 i = 0;
1195 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
1196 ret[i] = *poll;
1197 i++;
1198 }
1199 ovs_assert(i == *n);
1200 qsort(ret, *n, sizeof *ret, compare_poll_list);
1201 }
1202
1203 *list = ret;
1204 }
1205
1206 static void
1207 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
1208 {
1209 if (pmd->core_id != NON_PMD_CORE_ID) {
1210 struct rxq_poll *list;
1211 size_t n_rxq;
1212 uint64_t total_cycles = 0;
1213
1214 ds_put_format(reply,
1215 "pmd thread numa_id %d core_id %u:\n isolated : %s\n",
1216 pmd->numa_id, pmd->core_id, (pmd->isolated)
1217 ? "true" : "false");
1218
1219 ovs_mutex_lock(&pmd->port_mutex);
1220 sorted_poll_list(pmd, &list, &n_rxq);
1221
1222 /* Get the total pmd cycles for an interval. */
1223 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
1224 /* Estimate the cycles to cover all intervals. */
1225 total_cycles *= PMD_RXQ_INTERVAL_MAX;
1226
1227 for (int i = 0; i < n_rxq; i++) {
1228 struct dp_netdev_rxq *rxq = list[i].rxq;
1229 const char *name = netdev_rxq_get_name(rxq->rx);
1230 uint64_t proc_cycles = 0;
1231
1232 for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
1233 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
1234 }
1235 ds_put_format(reply, " port: %-16s queue-id: %2d", name,
1236 netdev_rxq_get_queue_id(list[i].rxq->rx));
1237 ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
1238 ? "(enabled) " : "(disabled)");
1239 ds_put_format(reply, " pmd usage: ");
1240 if (total_cycles) {
1241 ds_put_format(reply, "%2"PRIu64"",
1242 proc_cycles * 100 / total_cycles);
1243 ds_put_cstr(reply, " %");
1244 } else {
1245 ds_put_format(reply, "%s", "NOT AVAIL");
1246 }
1247 ds_put_cstr(reply, "\n");
1248 }
1249 ovs_mutex_unlock(&pmd->port_mutex);
1250 free(list);
1251 }
1252 }
1253
1254 static int
1255 compare_poll_thread_list(const void *a_, const void *b_)
1256 {
1257 const struct dp_netdev_pmd_thread *a, *b;
1258
1259 a = *(struct dp_netdev_pmd_thread **)a_;
1260 b = *(struct dp_netdev_pmd_thread **)b_;
1261
1262 if (a->core_id < b->core_id) {
1263 return -1;
1264 }
1265 if (a->core_id > b->core_id) {
1266 return 1;
1267 }
1268 return 0;
1269 }
1270
1271 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
1272 * this list, as long as we do not go to quiescent state. */
1273 static void
1274 sorted_poll_thread_list(struct dp_netdev *dp,
1275 struct dp_netdev_pmd_thread ***list,
1276 size_t *n)
1277 {
1278 struct dp_netdev_pmd_thread *pmd;
1279 struct dp_netdev_pmd_thread **pmd_list;
1280 size_t k = 0, n_pmds;
1281
1282 n_pmds = cmap_count(&dp->poll_threads);
1283 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
1284
1285 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1286 if (k >= n_pmds) {
1287 break;
1288 }
1289 pmd_list[k++] = pmd;
1290 }
1291
1292 qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
1293
1294 *list = pmd_list;
1295 *n = k;
1296 }
1297
1298 static void
1299 dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1300 const char *argv[], void *aux OVS_UNUSED)
1301 {
1302 struct ds reply = DS_EMPTY_INITIALIZER;
1303 struct dp_netdev *dp = NULL;
1304
1305 ovs_mutex_lock(&dp_netdev_mutex);
1306
1307 if (argc == 2) {
1308 dp = shash_find_data(&dp_netdevs, argv[1]);
1309 } else if (shash_count(&dp_netdevs) == 1) {
1310 /* There's only one datapath */
1311 dp = shash_first(&dp_netdevs)->data;
1312 }
1313
1314 if (!dp) {
1315 ovs_mutex_unlock(&dp_netdev_mutex);
1316 unixctl_command_reply_error(conn,
1317 "please specify an existing datapath");
1318 return;
1319 }
1320
1321 dp_netdev_request_reconfigure(dp);
1322 ovs_mutex_unlock(&dp_netdev_mutex);
1323 ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1324 unixctl_command_reply(conn, ds_cstr(&reply));
1325 ds_destroy(&reply);
1326 }
1327
1328 static void
1329 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1330 void *aux)
1331 {
1332 struct ds reply = DS_EMPTY_INITIALIZER;
1333 struct dp_netdev_pmd_thread **pmd_list;
1334 struct dp_netdev *dp = NULL;
1335 enum pmd_info_type type = *(enum pmd_info_type *) aux;
1336 unsigned int core_id;
1337 bool filter_on_pmd = false;
1338 size_t n;
1339
1340 ovs_mutex_lock(&dp_netdev_mutex);
1341
1342 while (argc > 1) {
1343 if (!strcmp(argv[1], "-pmd") && argc > 2) {
1344 if (str_to_uint(argv[2], 10, &core_id)) {
1345 filter_on_pmd = true;
1346 }
1347 argc -= 2;
1348 argv += 2;
1349 } else {
1350 dp = shash_find_data(&dp_netdevs, argv[1]);
1351 argc -= 1;
1352 argv += 1;
1353 }
1354 }
1355
1356 if (!dp) {
1357 if (shash_count(&dp_netdevs) == 1) {
1358 /* There's only one datapath */
1359 dp = shash_first(&dp_netdevs)->data;
1360 } else {
1361 ovs_mutex_unlock(&dp_netdev_mutex);
1362 unixctl_command_reply_error(conn,
1363 "please specify an existing datapath");
1364 return;
1365 }
1366 }
1367
1368 sorted_poll_thread_list(dp, &pmd_list, &n);
1369 for (size_t i = 0; i < n; i++) {
1370 struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1371 if (!pmd) {
1372 break;
1373 }
1374 if (filter_on_pmd && pmd->core_id != core_id) {
1375 continue;
1376 }
1377 if (type == PMD_INFO_SHOW_RXQ) {
1378 pmd_info_show_rxq(&reply, pmd);
1379 } else if (type == PMD_INFO_CLEAR_STATS) {
1380 pmd_perf_stats_clear(&pmd->perf_stats);
1381 } else if (type == PMD_INFO_SHOW_STATS) {
1382 pmd_info_show_stats(&reply, pmd);
1383 } else if (type == PMD_INFO_PERF_SHOW) {
1384 pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1385 }
1386 }
1387 free(pmd_list);
1388
1389 ovs_mutex_unlock(&dp_netdev_mutex);
1390
1391 unixctl_command_reply(conn, ds_cstr(&reply));
1392 ds_destroy(&reply);
1393 }
1394
1395 static void
1396 pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1397 const char *argv[],
1398 void *aux OVS_UNUSED)
1399 {
1400 struct pmd_perf_params par;
1401 long int it_hist = 0, ms_hist = 0;
1402 par.histograms = true;
1403
1404 while (argc > 1) {
1405 if (!strcmp(argv[1], "-nh")) {
1406 par.histograms = false;
1407 argc -= 1;
1408 argv += 1;
1409 } else if (!strcmp(argv[1], "-it") && argc > 2) {
1410 it_hist = strtol(argv[2], NULL, 10);
1411 if (it_hist < 0) {
1412 it_hist = 0;
1413 } else if (it_hist > HISTORY_LEN) {
1414 it_hist = HISTORY_LEN;
1415 }
1416 argc -= 2;
1417 argv += 2;
1418 } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1419 ms_hist = strtol(argv[2], NULL, 10);
1420 if (ms_hist < 0) {
1421 ms_hist = 0;
1422 } else if (ms_hist > HISTORY_LEN) {
1423 ms_hist = HISTORY_LEN;
1424 }
1425 argc -= 2;
1426 argv += 2;
1427 } else {
1428 break;
1429 }
1430 }
1431 par.iter_hist_len = it_hist;
1432 par.ms_hist_len = ms_hist;
1433 par.command_type = PMD_INFO_PERF_SHOW;
1434 dpif_netdev_pmd_info(conn, argc, argv, &par);
1435 }
1436
1437 static void
1438 dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1439 const char *argv[], void *aux OVS_UNUSED)
1440 {
1441 struct ds reply = DS_EMPTY_INITIALIZER;
1442 struct dp_netdev *dp = NULL;
1443
1444 ovs_mutex_lock(&dp_netdev_mutex);
1445 if (argc == 2) {
1446 dp = shash_find_data(&dp_netdevs, argv[1]);
1447 } else if (shash_count(&dp_netdevs) == 1) {
1448 /* There's only one datapath. */
1449 dp = shash_first(&dp_netdevs)->data;
1450 }
1451 if (!dp) {
1452 ovs_mutex_unlock(&dp_netdev_mutex);
1453 unixctl_command_reply_error(conn,
1454 "please specify an existing datapath");
1455 return;
1456 }
1457
1458 if (cmap_count(&dp->tx_bonds) > 0) {
1459 struct tx_bond *dp_bond_entry;
1460 uint32_t slave_id;
1461
1462 ds_put_cstr(&reply, "Bonds:\n");
1463 CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1464 ds_put_format(&reply, " bond-id %"PRIu32":\n",
1465 dp_bond_entry->bond_id);
1466 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1467 slave_id =
1468 odp_to_u32(dp_bond_entry->slave_buckets[bucket].slave_id);
1469 ds_put_format(&reply, " bucket %d - slave %"PRIu32"\n",
1470 bucket, slave_id);
1471 }
1472 }
1473 }
1474 ovs_mutex_unlock(&dp_netdev_mutex);
1475 unixctl_command_reply(conn, ds_cstr(&reply));
1476 ds_destroy(&reply);
1477 }
1478
1479 \f
1480 static int
1481 dpif_netdev_init(void)
1482 {
1483 static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1484 clear_aux = PMD_INFO_CLEAR_STATS,
1485 poll_aux = PMD_INFO_SHOW_RXQ;
1486
1487 unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1488 0, 3, dpif_netdev_pmd_info,
1489 (void *)&show_aux);
1490 unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1491 0, 3, dpif_netdev_pmd_info,
1492 (void *)&clear_aux);
1493 unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
1494 0, 3, dpif_netdev_pmd_info,
1495 (void *)&poll_aux);
1496 unixctl_command_register("dpif-netdev/pmd-perf-show",
1497 "[-nh] [-it iter-history-len]"
1498 " [-ms ms-history-len]"
1499 " [-pmd core] [dp]",
1500 0, 8, pmd_perf_show_cmd,
1501 NULL);
1502 unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1503 0, 1, dpif_netdev_pmd_rebalance,
1504 NULL);
1505 unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1506 "on|off [-b before] [-a after] [-e|-ne] "
1507 "[-us usec] [-q qlen]",
1508 0, 10, pmd_perf_log_set_cmd,
1509 NULL);
1510 unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1511 0, 1, dpif_netdev_bond_show,
1512 NULL);
1513 return 0;
1514 }
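/* The commands registered above are invoked through ovs-appctl, for example
 * (the core id and history length below are arbitrary illustrative values):
 *
 *     ovs-appctl dpif-netdev/pmd-stats-show
 *     ovs-appctl dpif-netdev/pmd-rxq-show -pmd 4
 *     ovs-appctl dpif-netdev/pmd-perf-show -nh -it 10
 *     ovs-appctl dpif-netdev/bond-show
 */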
1515
1516 static int
1517 dpif_netdev_enumerate(struct sset *all_dps,
1518 const struct dpif_class *dpif_class)
1519 {
1520 struct shash_node *node;
1521
1522 ovs_mutex_lock(&dp_netdev_mutex);
1523 SHASH_FOR_EACH(node, &dp_netdevs) {
1524 struct dp_netdev *dp = node->data;
1525 if (dpif_class != dp->class) {
1526 /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1527 * If the class doesn't match, skip this dpif. */
1528 continue;
1529 }
1530 sset_add(all_dps, node->name);
1531 }
1532 ovs_mutex_unlock(&dp_netdev_mutex);
1533
1534 return 0;
1535 }
1536
1537 static bool
1538 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1539 {
1540 return class != &dpif_netdev_class;
1541 }
1542
1543 static const char *
1544 dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1545 {
1546 return strcmp(type, "internal") ? type
1547 : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1548 : "tap";
1549 }
1550
1551 static struct dpif *
1552 create_dpif_netdev(struct dp_netdev *dp)
1553 {
1554 uint16_t netflow_id = hash_string(dp->name, 0);
1555 struct dpif_netdev *dpif;
1556
1557 ovs_refcount_ref(&dp->ref_cnt);
1558
1559 dpif = xmalloc(sizeof *dpif);
1560 dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1561 dpif->dp = dp;
1562 dpif->last_port_seq = seq_read(dp->port_seq);
1563
1564 return &dpif->dpif;
1565 }
1566
1567 /* Choose an unused, non-zero port number and return it on success.
1568 * Return ODPP_NONE on failure. */
1569 static odp_port_t
1570 choose_port(struct dp_netdev *dp, const char *name)
1571 OVS_REQUIRES(dp->port_mutex)
1572 {
1573 uint32_t port_no;
1574
1575 if (dp->class != &dpif_netdev_class) {
1576 const char *p;
1577 int start_no = 0;
1578
1579 /* If the port name begins with "br", start the number search at
1580 * 100 to make writing tests easier. */
1581 if (!strncmp(name, "br", 2)) {
1582 start_no = 100;
1583 }
1584
1585 /* If the port name contains a number, try to assign that port number.
1586 * This can make writing unit tests easier because port numbers are
1587 * predictable. */
1588 for (p = name; *p != '\0'; p++) {
1589 if (isdigit((unsigned char) *p)) {
1590 port_no = start_no + strtol(p, NULL, 10);
1591 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1592 && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1593 return u32_to_odp(port_no);
1594 }
1595 break;
1596 }
1597 }
1598 }
1599
1600 for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1601 if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1602 return u32_to_odp(port_no);
1603 }
1604 }
1605
1606 return ODPP_NONE;
1607 }
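/* Examples of the heuristic above for a dummy datapath (dp->class !=
 * &dpif_netdev_class): a port named "eth3" first tries port number 3, and a
 * port named "br0" first tries 100 + 0 = 100.  If the candidate is already in
 * use, or the name contains no digits, the final loop falls back to the
 * lowest free port number.  For the regular netdev class only that fallback
 * loop runs. */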
1608
1609 static int
1610 create_dp_netdev(const char *name, const struct dpif_class *class,
1611 struct dp_netdev **dpp)
1612 OVS_REQUIRES(dp_netdev_mutex)
1613 {
1614 static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1615 struct dp_netdev *dp;
1616 int error;
1617
1618 /* Avoid estimating TSC frequency for dummy datapath to not slow down
1619 * unit tests. */
1620 if (!dpif_netdev_class_is_dummy(class)
1621 && ovsthread_once_start(&tsc_freq_check)) {
1622 pmd_perf_estimate_tsc_frequency();
1623 ovsthread_once_done(&tsc_freq_check);
1624 }
1625
1626 dp = xzalloc(sizeof *dp);
1627 shash_add(&dp_netdevs, name, dp);
1628
1629 *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1630 *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1631 ovs_refcount_init(&dp->ref_cnt);
1632 atomic_flag_clear(&dp->destroyed);
1633
1634 ovs_mutex_init_recursive(&dp->port_mutex);
1635 hmap_init(&dp->ports);
1636 dp->port_seq = seq_create();
1637 ovs_mutex_init(&dp->bond_mutex);
1638 cmap_init(&dp->tx_bonds);
1639
1640 fat_rwlock_init(&dp->upcall_rwlock);
1641
1642 dp->reconfigure_seq = seq_create();
1643 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1644
1645 for (int i = 0; i < N_METER_LOCKS; ++i) {
1646 ovs_mutex_init_adaptive(&dp->meter_locks[i]);
1647 }
1648
1649 /* Disable upcalls by default. */
1650 dp_netdev_disable_upcall(dp);
1651 dp->upcall_aux = NULL;
1652 dp->upcall_cb = NULL;
1653
1654 dp->conntrack = conntrack_init();
1655
1656 atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1657 atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1658
1659 cmap_init(&dp->poll_threads);
1660 dp->pmd_rxq_assign_cyc = true;
1661
1662 ovs_mutex_init(&dp->tx_qid_pool_mutex);
1663 /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1664 dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1665
1666 ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1667 ovsthread_key_create(&dp->per_pmd_key, NULL);
1668
1669 ovs_mutex_lock(&dp->port_mutex);
1670 /* non-PMD will be created before all other threads and will
1671 * allocate static_tx_qid = 0. */
1672 dp_netdev_set_nonpmd(dp);
1673
1674 error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1675 "internal"),
1676 ODPP_LOCAL);
1677 ovs_mutex_unlock(&dp->port_mutex);
1678 if (error) {
1679 dp_netdev_free(dp);
1680 return error;
1681 }
1682
1683 dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1684 *dpp = dp;
1685 return 0;
1686 }
1687
1688 static void
1689 dp_netdev_request_reconfigure(struct dp_netdev *dp)
1690 {
1691 seq_change(dp->reconfigure_seq);
1692 }
1693
1694 static bool
1695 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1696 {
1697 return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1698 }
1699
1700 static int
1701 dpif_netdev_open(const struct dpif_class *class, const char *name,
1702 bool create, struct dpif **dpifp)
1703 {
1704 struct dp_netdev *dp;
1705 int error;
1706
1707 ovs_mutex_lock(&dp_netdev_mutex);
1708 dp = shash_find_data(&dp_netdevs, name);
1709 if (!dp) {
1710 error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1711 } else {
1712 error = (dp->class != class ? EINVAL
1713 : create ? EEXIST
1714 : 0);
1715 }
1716 if (!error) {
1717 *dpifp = create_dpif_netdev(dp);
1718 }
1719 ovs_mutex_unlock(&dp_netdev_mutex);
1720
1721 return error;
1722 }
1723
1724 static void
1725 dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1726 OVS_NO_THREAD_SAFETY_ANALYSIS
1727 {
1728     /* Check that upcalls are disabled, i.e., that the rwlock is taken. */
1729 ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1730
1731     /* Release the lock before destroying it. */
1732 fat_rwlock_unlock(&dp->upcall_rwlock);
1733 fat_rwlock_destroy(&dp->upcall_rwlock);
1734 }
1735
1736 static void
1737 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1738 OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1739 {
1740 if (dp->meters[meter_id]) {
1741 free(dp->meters[meter_id]);
1742 dp->meters[meter_id] = NULL;
1743 }
1744 }
1745
1746 static uint32_t
1747 hash_bond_id(uint32_t bond_id)
1748 {
1749 return hash_int(bond_id, 0);
1750 }
1751
1752 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1753 * through the 'dp_netdevs' shash while freeing 'dp'. */
1754 static void
1755 dp_netdev_free(struct dp_netdev *dp)
1756 OVS_REQUIRES(dp_netdev_mutex)
1757 {
1758 struct dp_netdev_port *port, *next;
1759 struct tx_bond *bond;
1760
1761 shash_find_and_delete(&dp_netdevs, dp->name);
1762
1763 ovs_mutex_lock(&dp->port_mutex);
1764 HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
1765 do_del_port(dp, port);
1766 }
1767 ovs_mutex_unlock(&dp->port_mutex);
1768
1769 ovs_mutex_lock(&dp->bond_mutex);
1770 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1771 cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1772 ovsrcu_postpone(free, bond);
1773 }
1774 ovs_mutex_unlock(&dp->bond_mutex);
1775
1776 dp_netdev_destroy_all_pmds(dp, true);
1777 cmap_destroy(&dp->poll_threads);
1778
1779 ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1780 id_pool_destroy(dp->tx_qid_pool);
1781
1782 ovs_mutex_destroy(&dp->non_pmd_mutex);
1783 ovsthread_key_delete(dp->per_pmd_key);
1784
1785 conntrack_destroy(dp->conntrack);
1786
1787
1788 seq_destroy(dp->reconfigure_seq);
1789
1790 seq_destroy(dp->port_seq);
1791 hmap_destroy(&dp->ports);
1792 ovs_mutex_destroy(&dp->port_mutex);
1793
1794 cmap_destroy(&dp->tx_bonds);
1795 ovs_mutex_destroy(&dp->bond_mutex);
1796
1797 /* Upcalls must be disabled at this point */
1798 dp_netdev_destroy_upcall_lock(dp);
1799
1800 int i;
1801
1802 for (i = 0; i < MAX_METERS; ++i) {
1803 meter_lock(dp, i);
1804 dp_delete_meter(dp, i);
1805 meter_unlock(dp, i);
1806 }
1807 for (i = 0; i < N_METER_LOCKS; ++i) {
1808 ovs_mutex_destroy(&dp->meter_locks[i]);
1809 }
1810
1811 free(dp->pmd_cmask);
1812 free(CONST_CAST(char *, dp->name));
1813 free(dp);
1814 }
1815
1816 static void
1817 dp_netdev_unref(struct dp_netdev *dp)
1818 {
1819 if (dp) {
1820 /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1821 * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1822 ovs_mutex_lock(&dp_netdev_mutex);
1823 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1824 dp_netdev_free(dp);
1825 }
1826 ovs_mutex_unlock(&dp_netdev_mutex);
1827 }
1828 }
1829
1830 static void
1831 dpif_netdev_close(struct dpif *dpif)
1832 {
1833 struct dp_netdev *dp = get_dp_netdev(dpif);
1834
1835 dp_netdev_unref(dp);
1836 free(dpif);
1837 }
1838
1839 static int
1840 dpif_netdev_destroy(struct dpif *dpif)
1841 {
1842 struct dp_netdev *dp = get_dp_netdev(dpif);
1843
1844 if (!atomic_flag_test_and_set(&dp->destroyed)) {
1845 if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1846 /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1847 OVS_NOT_REACHED();
1848 }
1849 }
1850
1851 return 0;
1852 }
1853
1854 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1855 * load/store semantics. While the increment is not atomic, the load and
1856 * store operations are, making it impossible to read inconsistent values.
1857 *
1858 * This is used to update thread local stats counters. */
1859 static void
1860 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1861 {
1862 unsigned long long tmp;
1863
1864 atomic_read_relaxed(var, &tmp);
1865 tmp += n;
1866 atomic_store_relaxed(var, tmp);
1867 }
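
/* Usage sketch (assumption: a single owning thread performs the updates while
 * other threads only read the counter):
 *
 *     atomic_ullong n_packets;
 *
 *     atomic_init(&n_packets, 0);
 *
 *     // Owning (PMD) thread, once per batch:
 *     non_atomic_ullong_add(&n_packets, batch_size);
 *
 *     // Any other thread:
 *     unsigned long long snapshot;
 *     atomic_read_relaxed(&n_packets, &snapshot);
 *
 * Readers may observe a slightly stale value but never a torn one, because
 * the individual relaxed load and store are themselves atomic. */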
1868
1869 static int
1870 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1871 {
1872 struct dp_netdev *dp = get_dp_netdev(dpif);
1873 struct dp_netdev_pmd_thread *pmd;
1874 uint64_t pmd_stats[PMD_N_STATS];
1875
1876 stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1877 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1878 stats->n_flows += cmap_count(&pmd->flow_table);
1879 pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1880 stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1881 stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1882 stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1883 stats->n_missed += pmd_stats[PMD_STAT_MISS];
1884 stats->n_lost += pmd_stats[PMD_STAT_LOST];
1885 }
1886 stats->n_masks = UINT32_MAX;
1887 stats->n_mask_hit = UINT64_MAX;
1888
1889 return 0;
1890 }
1891
1892 static void
1893 dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1894 {
1895 if (pmd->core_id == NON_PMD_CORE_ID) {
1896 ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1897 ovs_mutex_lock(&pmd->port_mutex);
1898 pmd_load_cached_ports(pmd);
1899 ovs_mutex_unlock(&pmd->port_mutex);
1900 ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1901 return;
1902 }
1903
1904 seq_change(pmd->reload_seq);
1905 atomic_store_explicit(&pmd->reload, true, memory_order_release);
1906 }
1907
1908 static uint32_t
1909 hash_port_no(odp_port_t port_no)
1910 {
1911 return hash_int(odp_to_u32(port_no), 0);
1912 }
1913
1914 static int
1915 port_create(const char *devname, const char *type,
1916 odp_port_t port_no, struct dp_netdev_port **portp)
1917 {
1918 struct dp_netdev_port *port;
1919 enum netdev_flags flags;
1920 struct netdev *netdev;
1921 int error;
1922
1923 *portp = NULL;
1924
1925 /* Open and validate network device. */
1926 error = netdev_open(devname, type, &netdev);
1927 if (error) {
1928 return error;
1929 }
1930 /* XXX reject non-Ethernet devices */
1931
1932 netdev_get_flags(netdev, &flags);
1933 if (flags & NETDEV_LOOPBACK) {
1934 VLOG_ERR("%s: cannot add a loopback device", devname);
1935 error = EINVAL;
1936 goto out;
1937 }
1938
1939 port = xzalloc(sizeof *port);
1940 port->port_no = port_no;
1941 port->netdev = netdev;
1942 port->type = xstrdup(type);
1943 port->sf = NULL;
1944 port->emc_enabled = true;
1945 port->need_reconfigure = true;
1946 ovs_mutex_init(&port->txq_used_mutex);
1947
1948 *portp = port;
1949
1950 return 0;
1951
1952 out:
1953 netdev_close(netdev);
1954 return error;
1955 }
1956
1957 static int
1958 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1959 odp_port_t port_no)
1960 OVS_REQUIRES(dp->port_mutex)
1961 {
1962 struct netdev_saved_flags *sf;
1963 struct dp_netdev_port *port;
1964 int error;
1965
1966 /* Reject devices already in 'dp'. */
1967 if (!get_port_by_name(dp, devname, &port)) {
1968 return EEXIST;
1969 }
1970
1971 error = port_create(devname, type, port_no, &port);
1972 if (error) {
1973 return error;
1974 }
1975
1976 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1977 seq_change(dp->port_seq);
1978
1979 reconfigure_datapath(dp);
1980
1981 /* Check that port was successfully configured. */
1982 if (!dp_netdev_lookup_port(dp, port_no)) {
1983 return EINVAL;
1984 }
1985
1986 /* Updating device flags triggers an if_notifier, which triggers a bridge
1987 * reconfiguration and another attempt to add this port, leading to an
1988 * infinite loop if the device is configured incorrectly and cannot be
1989      * added.  Set the promisc mode only after a successful reconfiguration,
1990      * since by then we already know that the device is properly configured. */
1991 error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1992 if (error) {
1993 VLOG_ERR("%s: cannot set promisc flag", devname);
1994 do_del_port(dp, port);
1995 return error;
1996 }
1997 port->sf = sf;
1998
1999 return 0;
2000 }
2001
2002 static int
2003 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2004 odp_port_t *port_nop)
2005 {
2006 struct dp_netdev *dp = get_dp_netdev(dpif);
2007 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2008 const char *dpif_port;
2009 odp_port_t port_no;
2010 int error;
2011
2012 ovs_mutex_lock(&dp->port_mutex);
2013 dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2014 if (*port_nop != ODPP_NONE) {
2015 port_no = *port_nop;
2016 error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2017 } else {
2018 port_no = choose_port(dp, dpif_port);
2019 error = port_no == ODPP_NONE ? EFBIG : 0;
2020 }
2021 if (!error) {
2022 *port_nop = port_no;
2023 error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2024 }
2025 ovs_mutex_unlock(&dp->port_mutex);
2026
2027 return error;
2028 }
2029
2030 static int
2031 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2032 {
2033 struct dp_netdev *dp = get_dp_netdev(dpif);
2034 int error;
2035
2036 ovs_mutex_lock(&dp->port_mutex);
2037 if (port_no == ODPP_LOCAL) {
2038 error = EINVAL;
2039 } else {
2040 struct dp_netdev_port *port;
2041
2042 error = get_port_by_number(dp, port_no, &port);
2043 if (!error) {
2044 do_del_port(dp, port);
2045 }
2046 }
2047 ovs_mutex_unlock(&dp->port_mutex);
2048
2049 return error;
2050 }
2051
2052 static bool
2053 is_valid_port_number(odp_port_t port_no)
2054 {
2055 return port_no != ODPP_NONE;
2056 }
2057
2058 static struct dp_netdev_port *
2059 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2060 OVS_REQUIRES(dp->port_mutex)
2061 {
2062 struct dp_netdev_port *port;
2063
2064 HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2065 if (port->port_no == port_no) {
2066 return port;
2067 }
2068 }
2069 return NULL;
2070 }
2071
2072 static int
2073 get_port_by_number(struct dp_netdev *dp,
2074 odp_port_t port_no, struct dp_netdev_port **portp)
2075 OVS_REQUIRES(dp->port_mutex)
2076 {
2077 if (!is_valid_port_number(port_no)) {
2078 *portp = NULL;
2079 return EINVAL;
2080 } else {
2081 *portp = dp_netdev_lookup_port(dp, port_no);
2082 return *portp ? 0 : ENODEV;
2083 }
2084 }
2085
2086 static void
2087 port_destroy(struct dp_netdev_port *port)
2088 {
2089 if (!port) {
2090 return;
2091 }
2092
2093 netdev_close(port->netdev);
2094 netdev_restore_flags(port->sf);
2095
2096 for (unsigned i = 0; i < port->n_rxq; i++) {
2097 netdev_rxq_close(port->rxqs[i].rx);
2098 }
2099 ovs_mutex_destroy(&port->txq_used_mutex);
2100 free(port->rxq_affinity_list);
2101 free(port->txq_used);
2102 free(port->rxqs);
2103 free(port->type);
2104 free(port);
2105 }
2106
2107 static int
2108 get_port_by_name(struct dp_netdev *dp,
2109 const char *devname, struct dp_netdev_port **portp)
2110 OVS_REQUIRES(dp->port_mutex)
2111 {
2112 struct dp_netdev_port *port;
2113
2114 HMAP_FOR_EACH (port, node, &dp->ports) {
2115 if (!strcmp(netdev_get_name(port->netdev), devname)) {
2116 *portp = port;
2117 return 0;
2118 }
2119 }
2120
2121     /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2122      * non-existent port. */
2123 return ENODEV;
2124 }
2125
2126 /* Returns 'true' if there is a port with pmd netdev. */
2127 static bool
2128 has_pmd_port(struct dp_netdev *dp)
2129 OVS_REQUIRES(dp->port_mutex)
2130 {
2131 struct dp_netdev_port *port;
2132
2133 HMAP_FOR_EACH (port, node, &dp->ports) {
2134 if (netdev_is_pmd(port->netdev)) {
2135 return true;
2136 }
2137 }
2138
2139 return false;
2140 }
2141
2142 static void
2143 do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2144 OVS_REQUIRES(dp->port_mutex)
2145 {
2146 hmap_remove(&dp->ports, &port->node);
2147 seq_change(dp->port_seq);
2148
2149 reconfigure_datapath(dp);
2150
2151 port_destroy(port);
2152 }
2153
2154 static void
2155 answer_port_query(const struct dp_netdev_port *port,
2156 struct dpif_port *dpif_port)
2157 {
2158 dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2159 dpif_port->type = xstrdup(port->type);
2160 dpif_port->port_no = port->port_no;
2161 }
2162
2163 static int
2164 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2165 struct dpif_port *dpif_port)
2166 {
2167 struct dp_netdev *dp = get_dp_netdev(dpif);
2168 struct dp_netdev_port *port;
2169 int error;
2170
2171 ovs_mutex_lock(&dp->port_mutex);
2172 error = get_port_by_number(dp, port_no, &port);
2173 if (!error && dpif_port) {
2174 answer_port_query(port, dpif_port);
2175 }
2176 ovs_mutex_unlock(&dp->port_mutex);
2177
2178 return error;
2179 }
2180
2181 static int
2182 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2183 struct dpif_port *dpif_port)
2184 {
2185 struct dp_netdev *dp = get_dp_netdev(dpif);
2186 struct dp_netdev_port *port;
2187 int error;
2188
2189 ovs_mutex_lock(&dp->port_mutex);
2190 error = get_port_by_name(dp, devname, &port);
2191 if (!error && dpif_port) {
2192 answer_port_query(port, dpif_port);
2193 }
2194 ovs_mutex_unlock(&dp->port_mutex);
2195
2196 return error;
2197 }
2198
2199 static void
2200 dp_netdev_flow_free(struct dp_netdev_flow *flow)
2201 {
2202 dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2203 free(flow->dp_extra_info);
2204 free(flow);
2205 }
2206
2207 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2208 {
2209 if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2210 ovsrcu_postpone(dp_netdev_flow_free, flow);
2211 }
2212 }
2213
2214 static uint32_t
2215 dp_netdev_flow_hash(const ovs_u128 *ufid)
2216 {
2217 return ufid->u32[0];
2218 }
2219
2220 static inline struct dpcls *
2221 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2222 odp_port_t in_port)
2223 {
2224 struct dpcls *cls;
2225 uint32_t hash = hash_port_no(in_port);
2226 CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2227 if (cls->in_port == in_port) {
2228 /* Port classifier exists already */
2229 return cls;
2230 }
2231 }
2232 return NULL;
2233 }
2234
2235 static inline struct dpcls *
2236 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2237 odp_port_t in_port)
2238 OVS_REQUIRES(pmd->flow_mutex)
2239 {
2240 struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2241 uint32_t hash = hash_port_no(in_port);
2242
2243 if (!cls) {
2244 /* Create new classifier for in_port */
2245 cls = xmalloc(sizeof(*cls));
2246 dpcls_init(cls);
2247 cls->in_port = in_port;
2248 cmap_insert(&pmd->classifiers, &cls->node, hash);
2249 VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2250 }
2251 return cls;
2252 }
2253
2254 #define MAX_FLOW_MARK (UINT32_MAX - 1)
2255 #define INVALID_FLOW_MARK 0
2256 /* A zero flow mark tells the HW to remove the mark.  A packet marked with
2257  * a zero mark is received in SW without any mark at all, so zero cannot be
2258  * used as a valid mark.
2259  */
2260
2261 struct megaflow_to_mark_data {
2262 const struct cmap_node node;
2263 ovs_u128 mega_ufid;
2264 uint32_t mark;
2265 };
2266
2267 struct flow_mark {
2268 struct cmap megaflow_to_mark;
2269 struct cmap mark_to_flow;
2270 struct id_pool *pool;
2271 };
2272
2273 static struct flow_mark flow_mark = {
2274 .megaflow_to_mark = CMAP_INITIALIZER,
2275 .mark_to_flow = CMAP_INITIALIZER,
2276 };
2277
2278 static uint32_t
2279 flow_mark_alloc(void)
2280 {
2281 uint32_t mark;
2282
2283 if (!flow_mark.pool) {
2284         /* Pool hasn't been initialized yet; do it here. */
2285 flow_mark.pool = id_pool_create(1, MAX_FLOW_MARK);
2286 }
2287
2288 if (id_pool_alloc_id(flow_mark.pool, &mark)) {
2289 return mark;
2290 }
2291
2292 return INVALID_FLOW_MARK;
2293 }
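
/* Sketch of the resulting mark space (restating the allocator above): marks
 * are drawn from an id-pool covering [1, MAX_FLOW_MARK], so INVALID_FLOW_MARK
 * (0) can never be handed out and is safe to use as the "no mark" sentinel:
 *
 *     uint32_t mark = flow_mark_alloc();
 *
 *     if (mark == INVALID_FLOW_MARK) {
 *         // Pool exhausted or unavailable: skip hardware offload.
 *     }
 */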
2294
2295 static void
2296 flow_mark_free(uint32_t mark)
2297 {
2298 id_pool_free_id(flow_mark.pool, mark);
2299 }
2300
2301 /* Associate a megaflow with a mark; this is a 1:1 mapping. */
2302 static void
2303 megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t mark)
2304 {
2305 size_t hash = dp_netdev_flow_hash(mega_ufid);
2306 struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
2307
2308 data->mega_ufid = *mega_ufid;
2309 data->mark = mark;
2310
2311 cmap_insert(&flow_mark.megaflow_to_mark,
2312 CONST_CAST(struct cmap_node *, &data->node), hash);
2313 }
2314
2315 /* Disassociate a megaflow from its mark. */
2316 static void
2317 megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid)
2318 {
2319 size_t hash = dp_netdev_flow_hash(mega_ufid);
2320 struct megaflow_to_mark_data *data;
2321
2322 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2323 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2324 cmap_remove(&flow_mark.megaflow_to_mark,
2325 CONST_CAST(struct cmap_node *, &data->node), hash);
2326 ovsrcu_postpone(free, data);
2327 return;
2328 }
2329 }
2330
2331 VLOG_WARN("Masked ufid "UUID_FMT" is not associated with a mark?\n",
2332 UUID_ARGS((struct uuid *)mega_ufid));
2333 }
2334
2335 static inline uint32_t
2336 megaflow_to_mark_find(const ovs_u128 *mega_ufid)
2337 {
2338 size_t hash = dp_netdev_flow_hash(mega_ufid);
2339 struct megaflow_to_mark_data *data;
2340
2341 CMAP_FOR_EACH_WITH_HASH (data, node, hash, &flow_mark.megaflow_to_mark) {
2342 if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
2343 return data->mark;
2344 }
2345 }
2346
2347 VLOG_DBG("Mark id for ufid "UUID_FMT" was not found\n",
2348 UUID_ARGS((struct uuid *)mega_ufid));
2349 return INVALID_FLOW_MARK;
2350 }
2351
2352 /* Associate a mark with a flow; this is a 1:N mapping. */
2353 static void
2354 mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow *flow)
2355 {
2356 dp_netdev_flow_ref(flow);
2357
2358 cmap_insert(&flow_mark.mark_to_flow,
2359 CONST_CAST(struct cmap_node *, &flow->mark_node),
2360 hash_int(mark, 0));
2361 flow->mark = mark;
2362
2363 VLOG_DBG("Associated dp_netdev flow %p with mark %u mega_ufid "UUID_FMT,
2364 flow, mark, UUID_ARGS((struct uuid *) &flow->mega_ufid));
2365 }
2366
2367 static bool
2368 flow_mark_has_no_ref(uint32_t mark)
2369 {
2370 struct dp_netdev_flow *flow;
2371
2372 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2373 &flow_mark.mark_to_flow) {
2374 if (flow->mark == mark) {
2375 return false;
2376 }
2377 }
2378
2379 return true;
2380 }
2381
2382 static int
2383 mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
2384 struct dp_netdev_flow *flow)
2385 {
2386 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2387 struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
2388 &flow->mark_node);
2389 uint32_t mark = flow->mark;
2390 int ret = 0;
2391
2392 /* INVALID_FLOW_MARK may mean that the flow has been disassociated or
2393 * never associated. */
2394 if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) {
2395 return EINVAL;
2396 }
2397
2398 cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
2399 flow->mark = INVALID_FLOW_MARK;
2400
2401 /*
2402      * If no flow references the mark any more, remove the flow from
2403      * hardware and free the mark.
2404 */
2405 if (flow_mark_has_no_ref(mark)) {
2406 struct netdev *port;
2407 odp_port_t in_port = flow->flow.in_port.odp_port;
2408
2409 port = netdev_ports_get(in_port, dpif_type_str);
2410 if (port) {
2411 /* Taking a global 'port_mutex' to fulfill thread safety
2412 * restrictions for the netdev-offload-dpdk module. */
2413 ovs_mutex_lock(&pmd->dp->port_mutex);
2414 ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
2415 ovs_mutex_unlock(&pmd->dp->port_mutex);
2416 netdev_close(port);
2417 }
2418
2419 flow_mark_free(mark);
2420 VLOG_DBG("Freed flow mark %u mega_ufid "UUID_FMT, mark,
2421 UUID_ARGS((struct uuid *) &flow->mega_ufid));
2422
2423 megaflow_to_mark_disassociate(&flow->mega_ufid);
2424 }
2425 dp_netdev_flow_unref(flow);
2426
2427 return ret;
2428 }
2429
2430 static void
2431 flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
2432 {
2433 struct dp_netdev_flow *flow;
2434
2435 CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
2436 if (flow->pmd_id == pmd->core_id) {
2437 queue_netdev_flow_del(pmd, flow);
2438 }
2439 }
2440 }
2441
2442 static struct dp_netdev_flow *
2443 mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
2444 const uint32_t mark)
2445 {
2446 struct dp_netdev_flow *flow;
2447
2448 CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
2449 &flow_mark.mark_to_flow) {
2450 if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
2451 flow->dead == false) {
2452 return flow;
2453 }
2454 }
2455
2456 return NULL;
2457 }
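
/* Receive-side sketch (assumption: the NIC delivers the mark in the packet
 * metadata and dp_packet_has_flow_mark() exposes it, as the DPDK datapath
 * does):
 *
 *     uint32_t mark;
 *
 *     if (dp_packet_has_flow_mark(packet, &mark)) {
 *         struct dp_netdev_flow *flow = mark_to_flow_find(pmd, mark);
 *
 *         if (flow) {
 *             // The flow was recovered directly from the mark, so the
 *             // EMC/SMC/dpcls lookups can be skipped for this packet.
 *         }
 *     }
 */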
2458
2459 static struct dp_flow_offload_item *
2460 dp_netdev_alloc_flow_offload(struct dp_netdev_pmd_thread *pmd,
2461 struct dp_netdev_flow *flow,
2462 int op)
2463 {
2464 struct dp_flow_offload_item *offload;
2465
2466 offload = xzalloc(sizeof(*offload));
2467 offload->pmd = pmd;
2468 offload->flow = flow;
2469 offload->op = op;
2470
2471 dp_netdev_flow_ref(flow);
2472 dp_netdev_pmd_try_ref(pmd);
2473
2474 return offload;
2475 }
2476
2477 static void
2478 dp_netdev_free_flow_offload(struct dp_flow_offload_item *offload)
2479 {
2480 dp_netdev_pmd_unref(offload->pmd);
2481 dp_netdev_flow_unref(offload->flow);
2482
2483 free(offload->actions);
2484 free(offload);
2485 }
2486
2487 static void
2488 dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
2489 {
2490 ovs_mutex_lock(&dp_flow_offload.mutex);
2491 ovs_list_push_back(&dp_flow_offload.list, &offload->node);
2492 xpthread_cond_signal(&dp_flow_offload.cond);
2493 ovs_mutex_unlock(&dp_flow_offload.mutex);
2494 }
2495
2496 static int
2497 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
2498 {
2499 return mark_to_flow_disassociate(offload->pmd, offload->flow);
2500 }
2501
2502 /*
2503 * There are two flow offload operations here: addition and modification.
2504 *
2505 * For flow addition, this function does:
2506 * - allocate a new flow mark id
2507 * - perform hardware flow offload
2508 * - associate the flow mark with flow and mega flow
2509 *
2510  * For flow modification, both the flow mark and the associations are still
2511  * valid, so only the hardware offload step is needed.
2512 */
2513 static int
2514 dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
2515 {
2516 struct dp_netdev_pmd_thread *pmd = offload->pmd;
2517 struct dp_netdev_flow *flow = offload->flow;
2518 odp_port_t in_port = flow->flow.in_port.odp_port;
2519 const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
2520 bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2521 struct offload_info info;
2522 struct netdev *port;
2523 uint32_t mark;
2524 int ret;
2525
2526 if (flow->dead) {
2527 return -1;
2528 }
2529
2530 if (modification) {
2531 mark = flow->mark;
2532 ovs_assert(mark != INVALID_FLOW_MARK);
2533 } else {
2534 /*
2535 * If a mega flow has already been offloaded (from other PMD
2536 * instances), do not offload it again.
2537 */
2538 mark = megaflow_to_mark_find(&flow->mega_ufid);
2539 if (mark != INVALID_FLOW_MARK) {
2540 VLOG_DBG("Flow has already been offloaded with mark %u\n", mark);
2541 if (flow->mark != INVALID_FLOW_MARK) {
2542 ovs_assert(flow->mark == mark);
2543 } else {
2544 mark_to_flow_associate(mark, flow);
2545 }
2546 return 0;
2547 }
2548
2549 mark = flow_mark_alloc();
2550 if (mark == INVALID_FLOW_MARK) {
2551 VLOG_ERR("Failed to allocate flow mark!\n");
2552 return -1;
2553 }
2554 }
2555 info.flow_mark = mark;
2556
2557 port = netdev_ports_get(in_port, dpif_type_str);
2558 if (!port || netdev_vport_is_vport_class(port->netdev_class)) {
2559 netdev_close(port);
2560 goto err_free;
2561 }
2562 /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
2563 * the netdev-offload-dpdk module. */
2564 ovs_mutex_lock(&pmd->dp->port_mutex);
2565 ret = netdev_flow_put(port, &offload->match,
2566 CONST_CAST(struct nlattr *, offload->actions),
2567 offload->actions_len, &flow->mega_ufid, &info,
2568 NULL);
2569 ovs_mutex_unlock(&pmd->dp->port_mutex);
2570 netdev_close(port);
2571
2572 if (ret) {
2573 goto err_free;
2574 }
2575
2576 if (!modification) {
2577 megaflow_to_mark_associate(&flow->mega_ufid, mark);
2578 mark_to_flow_associate(mark, flow);
2579 }
2580 return 0;
2581
2582 err_free:
2583 if (!modification) {
2584 flow_mark_free(mark);
2585 } else {
2586 mark_to_flow_disassociate(pmd, flow);
2587 }
2588 return -1;
2589 }
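
/* Put-path summary (restating the function above): for an ADD the sequence is
 * flow_mark_alloc() -> netdev_flow_put() -> megaflow_to_mark_associate() +
 * mark_to_flow_associate(); for a MOD the existing mark is reused and only
 * netdev_flow_put() runs again.  The error handling mirrors that split:
 *
 *     if (netdev_flow_put(...)) {
 *         if (!modification) {
 *             flow_mark_free(mark);            // brand-new mark, just drop it
 *         } else {
 *             mark_to_flow_disassociate(pmd, flow);   // roll back the link
 *         }
 *     }
 */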
2590
2591 static void *
2592 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
2593 {
2594 struct dp_flow_offload_item *offload;
2595 struct ovs_list *list;
2596 const char *op;
2597 int ret;
2598
2599 for (;;) {
2600 ovs_mutex_lock(&dp_flow_offload.mutex);
2601 if (ovs_list_is_empty(&dp_flow_offload.list)) {
2602 ovsrcu_quiesce_start();
2603 ovs_mutex_cond_wait(&dp_flow_offload.cond,
2604 &dp_flow_offload.mutex);
2605 ovsrcu_quiesce_end();
2606 }
2607 list = ovs_list_pop_front(&dp_flow_offload.list);
2608 offload = CONTAINER_OF(list, struct dp_flow_offload_item, node);
2609 ovs_mutex_unlock(&dp_flow_offload.mutex);
2610
2611 switch (offload->op) {
2612 case DP_NETDEV_FLOW_OFFLOAD_OP_ADD:
2613 op = "add";
2614 ret = dp_netdev_flow_offload_put(offload);
2615 break;
2616 case DP_NETDEV_FLOW_OFFLOAD_OP_MOD:
2617 op = "modify";
2618 ret = dp_netdev_flow_offload_put(offload);
2619 break;
2620 case DP_NETDEV_FLOW_OFFLOAD_OP_DEL:
2621 op = "delete";
2622 ret = dp_netdev_flow_offload_del(offload);
2623 break;
2624 default:
2625 OVS_NOT_REACHED();
2626 }
2627
2628 VLOG_DBG("%s to %s netdev flow "UUID_FMT,
2629                  ret == 0 ? "succeeded" : "failed", op,
2630 UUID_ARGS((struct uuid *) &offload->flow->mega_ufid));
2631 dp_netdev_free_flow_offload(offload);
2632 ovsrcu_quiesce();
2633 }
2634
2635 return NULL;
2636 }
2637
2638 static void
2639 queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
2640 struct dp_netdev_flow *flow)
2641 {
2642 struct dp_flow_offload_item *offload;
2643
2644 if (ovsthread_once_start(&offload_thread_once)) {
2645 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2646 ovs_thread_create("dp_netdev_flow_offload",
2647 dp_netdev_flow_offload_main, NULL);
2648 ovsthread_once_done(&offload_thread_once);
2649 }
2650
2651 offload = dp_netdev_alloc_flow_offload(pmd, flow,
2652 DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
2653 dp_netdev_append_flow_offload(offload);
2654 }
2655
2656 static void
2657 queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
2658 struct dp_netdev_flow *flow, struct match *match,
2659 const struct nlattr *actions, size_t actions_len)
2660 {
2661 struct dp_flow_offload_item *offload;
2662 int op;
2663
2664 if (!netdev_is_flow_api_enabled()) {
2665 return;
2666 }
2667
2668 if (ovsthread_once_start(&offload_thread_once)) {
2669 xpthread_cond_init(&dp_flow_offload.cond, NULL);
2670 ovs_thread_create("dp_netdev_flow_offload",
2671 dp_netdev_flow_offload_main, NULL);
2672 ovsthread_once_done(&offload_thread_once);
2673 }
2674
2675 if (flow->mark != INVALID_FLOW_MARK) {
2676 op = DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
2677 } else {
2678 op = DP_NETDEV_FLOW_OFFLOAD_OP_ADD;
2679 }
2680 offload = dp_netdev_alloc_flow_offload(pmd, flow, op);
2681 offload->match = *match;
2682 offload->actions = xmalloc(actions_len);
2683 memcpy(offload->actions, actions, actions_len);
2684 offload->actions_len = actions_len;
2685
2686 dp_netdev_append_flow_offload(offload);
2687 }
2688
2689 static void
2690 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2691 struct dp_netdev_flow *flow)
2692 OVS_REQUIRES(pmd->flow_mutex)
2693 {
2694 struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2695 struct dpcls *cls;
2696 odp_port_t in_port = flow->flow.in_port.odp_port;
2697
2698 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2699 ovs_assert(cls != NULL);
2700 dpcls_remove(cls, &flow->cr);
2701 cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2702 if (flow->mark != INVALID_FLOW_MARK) {
2703 queue_netdev_flow_del(pmd, flow);
2704 }
2705 flow->dead = true;
2706
2707 dp_netdev_flow_unref(flow);
2708 }
2709
2710 static void
2711 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2712 {
2713 struct dp_netdev_flow *netdev_flow;
2714
2715 ovs_mutex_lock(&pmd->flow_mutex);
2716 CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2717 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2718 }
2719 ovs_mutex_unlock(&pmd->flow_mutex);
2720 }
2721
2722 static int
2723 dpif_netdev_flow_flush(struct dpif *dpif)
2724 {
2725 struct dp_netdev *dp = get_dp_netdev(dpif);
2726 struct dp_netdev_pmd_thread *pmd;
2727
2728 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2729 dp_netdev_pmd_flow_flush(pmd);
2730 }
2731
2732 return 0;
2733 }
2734
2735 struct dp_netdev_port_state {
2736 struct hmap_position position;
2737 char *name;
2738 };
2739
2740 static int
2741 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2742 {
2743 *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2744 return 0;
2745 }
2746
2747 static int
2748 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2749 struct dpif_port *dpif_port)
2750 {
2751 struct dp_netdev_port_state *state = state_;
2752 struct dp_netdev *dp = get_dp_netdev(dpif);
2753 struct hmap_node *node;
2754 int retval;
2755
2756 ovs_mutex_lock(&dp->port_mutex);
2757 node = hmap_at_position(&dp->ports, &state->position);
2758 if (node) {
2759 struct dp_netdev_port *port;
2760
2761 port = CONTAINER_OF(node, struct dp_netdev_port, node);
2762
2763 free(state->name);
2764 state->name = xstrdup(netdev_get_name(port->netdev));
2765 dpif_port->name = state->name;
2766 dpif_port->type = port->type;
2767 dpif_port->port_no = port->port_no;
2768
2769 retval = 0;
2770 } else {
2771 retval = EOF;
2772 }
2773 ovs_mutex_unlock(&dp->port_mutex);
2774
2775 return retval;
2776 }
2777
2778 static int
2779 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2780 {
2781 struct dp_netdev_port_state *state = state_;
2782 free(state->name);
2783 free(state);
2784 return 0;
2785 }
2786
2787 static int
2788 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2789 {
2790 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2791 uint64_t new_port_seq;
2792 int error;
2793
2794 new_port_seq = seq_read(dpif->dp->port_seq);
2795 if (dpif->last_port_seq != new_port_seq) {
2796 dpif->last_port_seq = new_port_seq;
2797 error = ENOBUFS;
2798 } else {
2799 error = EAGAIN;
2800 }
2801
2802 return error;
2803 }
2804
2805 static void
2806 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2807 {
2808 struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2809
2810 seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2811 }
2812
2813 static struct dp_netdev_flow *
2814 dp_netdev_flow_cast(const struct dpcls_rule *cr)
2815 {
2816 return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2817 }
2818
2819 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2820 {
2821 return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2822 }
2823
2824 /* netdev_flow_key utilities.
2825 *
2826 * netdev_flow_key is basically a miniflow. We use these functions
2827 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2828 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2829 *
2830 * - Since we are dealing exclusively with miniflows created by
2831 * miniflow_extract(), if the map is different the miniflow is different.
2832 * Therefore we can be faster by comparing the map and the miniflow in a
2833 * single memcmp().
2834 * - These functions can be inlined by the compiler. */
2835
2836 /* Given the number of bits set in miniflow's maps, returns the size of the
2837 * 'netdev_flow_key.mf' */
2838 static inline size_t
2839 netdev_flow_key_size(size_t flow_u64s)
2840 {
2841 return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
2842 }
2843
2844 static inline bool
2845 netdev_flow_key_equal(const struct netdev_flow_key *a,
2846 const struct netdev_flow_key *b)
2847 {
2848     /* 'b->len' may not be set yet. */
2849 return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2850 }
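
/* Worked example (assumption: MINIFLOW_VALUES_SIZE(n) is n * 8 bytes): a key
 * whose miniflow has 5 populated u64 units has
 *
 *     len = sizeof(struct miniflow) + 5 * 8;
 *
 * so the single memcmp() above covers both the flowmap and the packed values
 * in one pass, which is the point of keeping keys in miniflow form. */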
2851
2852 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
2853 * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
2854 * generated by miniflow_extract. */
2855 static inline bool
2856 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
2857 const struct miniflow *mf)
2858 {
2859 return !memcmp(&key->mf, mf, key->len);
2860 }
2861
2862 static inline void
2863 netdev_flow_key_clone(struct netdev_flow_key *dst,
2864 const struct netdev_flow_key *src)
2865 {
2866 memcpy(dst, src,
2867 offsetof(struct netdev_flow_key, mf) + src->len);
2868 }
2869
2870 /* Initialize a netdev_flow_key 'mask' from 'match'. */
2871 static inline void
2872 netdev_flow_mask_init(struct netdev_flow_key *mask,
2873 const struct match *match)
2874 {
2875 uint64_t *dst = miniflow_values(&mask->mf);
2876 struct flowmap fmap;
2877 uint32_t hash = 0;
2878 size_t idx;
2879
2880 /* Only check masks that make sense for the flow. */
2881 flow_wc_map(&match->flow, &fmap);
2882 flowmap_init(&mask->mf.map);
2883
2884 FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2885 uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2886
2887 if (mask_u64) {
2888 flowmap_set(&mask->mf.map, idx, 1);
2889 *dst++ = mask_u64;
2890 hash = hash_add64(hash, mask_u64);
2891 }
2892 }
2893
2894 map_t map;
2895
2896 FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2897 hash = hash_add64(hash, map);
2898 }
2899
2900 size_t n = dst - miniflow_get_values(&mask->mf);
2901
2902 mask->hash = hash_finish(hash, n * 8);
2903 mask->len = netdev_flow_key_size(n);
2904 }
2905
2906 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2907 static inline void
2908 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2909 const struct flow *flow,
2910 const struct netdev_flow_key *mask)
2911 {
2912 uint64_t *dst_u64 = miniflow_values(&dst->mf);
2913 const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2914 uint32_t hash = 0;
2915 uint64_t value;
2916
2917 dst->len = mask->len;
2918 dst->mf = mask->mf; /* Copy maps. */
2919
2920 FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2921 *dst_u64 = value & *mask_u64++;
2922 hash = hash_add64(hash, *dst_u64++);
2923 }
2924 dst->hash = hash_finish(hash,
2925 (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2926 }
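
/* Sketch of the relationship set up by the two initializers above: for every
 * u64 unit present in the subtable mask,
 *
 *     masked_key.value[i] = flow.value[i] & mask.value[i];
 *
 * and 'masked_key.hash' is computed over those masked values only.  A packet
 * can therefore be compared against a subtable by masking and hashing its
 * miniflow with the same mask, which is what dpcls lookup relies on. */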
2927
2928 static inline bool
2929 emc_entry_alive(struct emc_entry *ce)
2930 {
2931 return ce->flow && !ce->flow->dead;
2932 }
2933
2934 static void
2935 emc_clear_entry(struct emc_entry *ce)
2936 {
2937 if (ce->flow) {
2938 dp_netdev_flow_unref(ce->flow);
2939 ce->flow = NULL;
2940 }
2941 }
2942
2943 static inline void
2944 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2945 const struct netdev_flow_key *key)
2946 {
2947 if (ce->flow != flow) {
2948 if (ce->flow) {
2949 dp_netdev_flow_unref(ce->flow);
2950 }
2951
2952 if (dp_netdev_flow_ref(flow)) {
2953 ce->flow = flow;
2954 } else {
2955 ce->flow = NULL;
2956 }
2957 }
2958 if (key) {
2959 netdev_flow_key_clone(&ce->key, key);
2960 }
2961 }
2962
2963 static inline void
2964 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2965 struct dp_netdev_flow *flow)
2966 {
2967 struct emc_entry *to_be_replaced = NULL;
2968 struct emc_entry *current_entry;
2969
2970 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2971 if (netdev_flow_key_equal(&current_entry->key, key)) {
2972 /* We found the entry with the 'mf' miniflow */
2973 emc_change_entry(current_entry, flow, NULL);
2974 return;
2975 }
2976
2977         /* Replacement policy: put the flow in an empty (not alive) entry or,
2978          * failing that, in the entry with the lowest key hash. */
2979 if (!to_be_replaced
2980 || (emc_entry_alive(to_be_replaced)
2981 && !emc_entry_alive(current_entry))
2982 || current_entry->key.hash < to_be_replaced->key.hash) {
2983 to_be_replaced = current_entry;
2984 }
2985 }
2986     /* We didn't find the miniflow in the cache.
2987      * The 'to_be_replaced' entry is where the new flow will be stored. */
2988
2989 emc_change_entry(to_be_replaced, flow, key);
2990 }
2991
2992 static inline void
2993 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2994 const struct netdev_flow_key *key,
2995 struct dp_netdev_flow *flow)
2996 {
2997     /* Insert an entry into the EMC based on probability value 'min'.  By
2998      * default the value is UINT32_MAX / 100, which yields an insertion
2999      * probability of 1/100, i.e., 1%. */
3000
3001 uint32_t min = pmd->ctx.emc_insert_min;
3002
3003 if (min && random_uint32() <= min) {
3004 emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
3005 }
3006 }
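
/* Numeric example (default configuration, assuming the inverse probability is
 * taken from the emc-insert-inv-prob setting): an inverse probability of 100
 * gives min = UINT32_MAX / 100, and random_uint32() is uniform over
 * [0, UINT32_MAX], so the test above accepts roughly 1 in 100 candidates:
 *
 *     if (min && random_uint32() <= min) {
 *         emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
 *     }
 *
 * An inverse probability of 1 makes min == UINT32_MAX, restoring
 * unconditional insertion, and 0 disables EMC insertion entirely. */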
3007
3008 static inline struct dp_netdev_flow *
3009 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
3010 {
3011 struct emc_entry *current_entry;
3012
3013 EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
3014 if (current_entry->key.hash == key->hash
3015 && emc_entry_alive(current_entry)
3016 && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
3017
3018 /* We found the entry with the 'key->mf' miniflow */
3019 return current_entry->flow;
3020 }
3021 }
3022
3023 return NULL;
3024 }
3025
3026 static inline const struct cmap_node *
3027 smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
3028 {
3029 struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
3030 struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
3031 uint16_t sig = hash >> 16;
3032 uint16_t index = UINT16_MAX;
3033
3034 for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3035 if (bucket->sig[i] == sig) {
3036 index = bucket->flow_idx[i];
3037 break;
3038 }
3039 }
3040 if (index != UINT16_MAX) {
3041 return cmap_find_by_index(&pmd->flow_table, index);
3042 }
3043 return NULL;
3044 }
3045
3046 static void
3047 smc_clear_entry(struct smc_bucket *b, int idx)
3048 {
3049 b->flow_idx[idx] = UINT16_MAX;
3050 }
3051
3052 /* Insert the flow_table index into the SMC.  Insertion may fail when 1) the
3053  * SMC is turned off, or 2) the flow_table index is larger than a uint16_t
3054  * can hold.  If an SMC entry with the same signature already exists, its
3055  * index is updated.  Otherwise an empty entry is taken if one is available;
3056  * if there is neither a matching signature nor an empty entry, a random
3057  * entry in the hashed bucket is overwritten. */
3058 static inline void
3059 smc_insert(struct dp_netdev_pmd_thread *pmd,
3060 const struct netdev_flow_key *key,
3061 uint32_t hash)
3062 {
3063 struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
3064 struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
3065 uint16_t index;
3066 uint32_t cmap_index;
3067 bool smc_enable_db;
3068 int i;
3069
3070 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
3071 if (!smc_enable_db) {
3072 return;
3073 }
3074
3075 cmap_index = cmap_find_index(&pmd->flow_table, hash);
3076 index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
3077
3078 /* If the index is larger than SMC can handle (uint16_t), we don't
3079 * insert */
3080 if (index == UINT16_MAX) {
3081 return;
3082 }
3083
3084     /* If an entry with the same signature exists, update its index. */
3085 uint16_t sig = key->hash >> 16;
3086 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3087 if (bucket->sig[i] == sig) {
3088 bucket->flow_idx[i] = index;
3089 return;
3090 }
3091 }
3092 /* If there is an empty entry, occupy it. */
3093 for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
3094 if (bucket->flow_idx[i] == UINT16_MAX) {
3095 bucket->sig[i] = sig;
3096 bucket->flow_idx[i] = index;
3097 return;
3098 }
3099 }
3100 /* Otherwise, pick a random entry. */
3101 i = random_uint32() % SMC_ENTRY_PER_BUCKET;
3102 bucket->sig[i] = sig;
3103 bucket->flow_idx[i] = index;
3104 }
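
/* Worked example of the bucket layout (restating the code above): for a key
 * hash of 0xABCD1234, the bucket is buckets[0xABCD1234 & SMC_MASK] and the
 * 16-bit signature stored next to the flow index is the top half of the hash:
 *
 *     uint32_t hash = 0xABCD1234;
 *     uint16_t sig  = hash >> 16;        // 0xABCD
 *
 * A later smc_entry_get() with the same hash re-derives 'sig', scans the
 * SMC_ENTRY_PER_BUCKET slots for it, and maps the stored 16-bit index back to
 * a flow with cmap_find_by_index(). */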
3105
3106 static struct dp_netdev_flow *
3107 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3108 const struct netdev_flow_key *key,
3109 int *lookup_num_p)
3110 {
3111 struct dpcls *cls;
3112 struct dpcls_rule *rule;
3113 odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3114 in_port.odp_port));
3115 struct dp_netdev_flow *netdev_flow = NULL;
3116
3117 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3118 if (OVS_LIKELY(cls)) {
3119 dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3120 netdev_flow = dp_netdev_flow_cast(rule);
3121 }
3122 return netdev_flow;
3123 }
3124
3125 static struct dp_netdev_flow *
3126 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3127 const ovs_u128 *ufidp, const struct nlattr *key,
3128 size_t key_len)
3129 {
3130 struct dp_netdev_flow *netdev_flow;
3131 struct flow flow;
3132 ovs_u128 ufid;
3133
3134 /* If a UFID is not provided, determine one based on the key. */
3135 if (!ufidp && key && key_len
3136 && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3137 odp_flow_key_hash(&flow, sizeof flow, &ufid);
3138 ufidp = &ufid;
3139 }
3140
3141 if (ufidp) {
3142 CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3143 &pmd->flow_table) {
3144 if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3145 return netdev_flow;
3146 }
3147 }
3148 }
3149
3150 return NULL;
3151 }
3152
3153 static bool
3154 dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp,
3155 const struct dp_netdev_flow *netdev_flow,
3156 struct dpif_flow_stats *stats,
3157 struct dpif_flow_attrs *attrs)
3158 {
3159 uint64_t act_buf[1024 / 8];
3160 struct nlattr *actions;
3161 struct netdev *netdev;
3162 struct match match;
3163 struct ofpbuf buf;
3164
3165 int ret = 0;
3166
3167 if (!netdev_is_flow_api_enabled()) {
3168 return false;
3169 }
3170
3171 netdev = netdev_ports_get(netdev_flow->flow.in_port.odp_port,
3172 dpif_normalize_type(dp->class->type));
3173 if (!netdev) {
3174 return false;
3175 }
3176 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
3177 /* Taking a global 'port_mutex' to fulfill thread safety
3178 * restrictions for the netdev-offload-dpdk module. */
3179 ovs_mutex_lock(&dp->port_mutex);
3180 ret = netdev_flow_get(netdev, &match, &actions, &netdev_flow->mega_ufid,
3181 stats, attrs, &buf);
3182 ovs_mutex_unlock(&dp->port_mutex);
3183 netdev_close(netdev);
3184 if (ret) {
3185 return false;
3186 }
3187
3188 return true;
3189 }
3190
3191 static void
3192 get_dpif_flow_status(const struct dp_netdev *dp,
3193 const struct dp_netdev_flow *netdev_flow_,
3194 struct dpif_flow_stats *stats,
3195 struct dpif_flow_attrs *attrs)
3196 {
3197 struct dpif_flow_stats offload_stats;
3198 struct dpif_flow_attrs offload_attrs;
3199 struct dp_netdev_flow *netdev_flow;
3200 unsigned long long n;
3201 long long used;
3202 uint16_t flags;
3203
3204 netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3205
3206 atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3207 stats->n_packets = n;
3208 atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3209 stats->n_bytes = n;
3210 atomic_read_relaxed(&netdev_flow->stats.used, &used);
3211 stats->used = used;
3212 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3213 stats->tcp_flags = flags;
3214
3215 if (dpif_netdev_get_flow_offload_status(dp, netdev_flow,
3216 &offload_stats, &offload_attrs)) {
3217 stats->n_packets += offload_stats.n_packets;
3218 stats->n_bytes += offload_stats.n_bytes;
3219 stats->used = MAX(stats->used, offload_stats.used);
3220 stats->tcp_flags |= offload_stats.tcp_flags;
3221 if (attrs) {
3222 attrs->offloaded = offload_attrs.offloaded;
3223 attrs->dp_layer = offload_attrs.dp_layer;
3224 }
3225 } else if (attrs) {
3226 attrs->offloaded = false;
3227 attrs->dp_layer = "ovs";
3228 }
3229 }
3230
3231 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3232 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3233 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3234 * protect them. */
3235 static void
3236 dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3237 const struct dp_netdev_flow *netdev_flow,
3238 struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3239 struct dpif_flow *flow, bool terse)
3240 {
3241 if (terse) {
3242 memset(flow, 0, sizeof *flow);
3243 } else {
3244 struct flow_wildcards wc;
3245 struct dp_netdev_actions *actions;
3246 size_t offset;
3247 struct odp_flow_key_parms odp_parms = {
3248 .flow = &netdev_flow->flow,
3249 .mask = &wc.masks,
3250 .support = dp_netdev_support,
3251 };
3252
3253 miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3254         /* in_port is exact-matched, but we have left it out of the mask for
3255          * optimization reasons.  Add in_port back to the mask. */
3256 wc.masks.in_port.odp_port = ODPP_NONE;
3257
3258 /* Key */
3259 offset = key_buf->size;
3260 flow->key = ofpbuf_tail(key_buf);
3261 odp_flow_key_from_flow(&odp_parms, key_buf);
3262 flow->key_len = key_buf->size - offset;
3263
3264 /* Mask */
3265 offset = mask_buf->size;
3266 flow->mask = ofpbuf_tail(mask_buf);
3267 odp_parms.key_buf = key_buf;
3268 odp_flow_key_from_mask(&odp_parms, mask_buf);
3269 flow->mask_len = mask_buf->size - offset;
3270
3271 /* Actions */
3272 actions = dp_netdev_flow_get_actions(netdev_flow);
3273 flow->actions = actions->actions;
3274 flow->actions_len = actions->size;
3275 }
3276
3277 flow->ufid = netdev_flow->ufid;
3278 flow->ufid_present = true;
3279 flow->pmd_id = netdev_flow->pmd_id;
3280
3281 get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3282 flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3283 }
3284
3285 static int
3286 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3287 const struct nlattr *mask_key,
3288 uint32_t mask_key_len, const struct flow *flow,
3289 struct flow_wildcards *wc, bool probe)
3290 {
3291 enum odp_key_fitness fitness;
3292
3293 fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3294 if (fitness) {
3295 if (!probe) {
3296 /* This should not happen: it indicates that
3297 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3298 * disagree on the acceptable form of a mask. Log the problem
3299 * as an error, with enough details to enable debugging. */
3300 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3301
3302 if (!VLOG_DROP_ERR(&rl)) {
3303 struct ds s;
3304
3305 ds_init(&s);
3306 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3307 true);
3308 VLOG_ERR("internal error parsing flow mask %s (%s)",
3309 ds_cstr(&s), odp_key_fitness_to_string(fitness));
3310 ds_destroy(&s);
3311 }
3312 }
3313
3314 return EINVAL;
3315 }
3316
3317 return 0;
3318 }
3319
3320 static int
3321 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3322 struct flow *flow, bool probe)
3323 {
3324 if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3325 if (!probe) {
3326 /* This should not happen: it indicates that
3327 * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3328 * the acceptable form of a flow. Log the problem as an error,
3329 * with enough details to enable debugging. */
3330 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3331
3332 if (!VLOG_DROP_ERR(&rl)) {
3333 struct ds s;
3334
3335 ds_init(&s);
3336 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
3337 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3338 ds_destroy(&s);
3339 }
3340 }
3341
3342 return EINVAL;
3343 }
3344
3345 if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3346 return EINVAL;
3347 }
3348
3349 return 0;
3350 }
3351
3352 static int
3353 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3354 {
3355 struct dp_netdev *dp = get_dp_netdev(dpif);
3356 struct dp_netdev_flow *netdev_flow;
3357 struct dp_netdev_pmd_thread *pmd;
3358 struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3359 struct hmapx_node *node;
3360 int error = EINVAL;
3361
3362 if (get->pmd_id == PMD_ID_NULL) {
3363 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3364 if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3365 dp_netdev_pmd_unref(pmd);
3366 }
3367 }
3368 } else {
3369 pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3370 if (!pmd) {
3371 goto out;
3372 }
3373 hmapx_add(&to_find, pmd);
3374 }
3375
3376 if (!hmapx_count(&to_find)) {
3377 goto out;
3378 }
3379
3380 HMAPX_FOR_EACH (node, &to_find) {
3381 pmd = (struct dp_netdev_pmd_thread *) node->data;
3382 netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3383 get->key_len);
3384 if (netdev_flow) {
3385 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3386 get->buffer, get->flow, false);
3387 error = 0;
3388 break;
3389 } else {
3390 error = ENOENT;
3391 }
3392 }
3393
3394 HMAPX_FOR_EACH (node, &to_find) {
3395 pmd = (struct dp_netdev_pmd_thread *) node->data;
3396 dp_netdev_pmd_unref(pmd);
3397 }
3398 out:
3399 hmapx_destroy(&to_find);
3400 return error;
3401 }
3402
3403 static void
3404 dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3405 {
3406 struct flow masked_flow;
3407 size_t i;
3408
3409 for (i = 0; i < sizeof(struct flow); i++) {
3410 ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3411 ((uint8_t *)&match->wc)[i];
3412 }
3413 odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3414 }
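
/* Note on the loop above (assumption: 'masks' is the first member of struct
 * flow_wildcards, which is why byte-indexing '&match->wc' works): the mega
 * UFID is a hash of the byte-wise AND of the flow with its wildcard mask,
 *
 *     masked_flow[i] = flow[i] & wc.masks[i];    // for every byte i
 *     odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
 *
 * so all flows that match the same megaflow (same values under the same mask)
 * share a mega UFID, which is the key used by the offload maps above. */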
3415
3416 static struct dp_netdev_flow *
3417 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3418 struct match *match, const ovs_u128 *ufid,
3419 const struct nlattr *actions, size_t actions_len)
3420 OVS_REQUIRES(pmd->flow_mutex)
3421 {
3422 struct ds extra_info = DS_EMPTY_INITIALIZER;
3423 struct dp_netdev_flow *flow;
3424 struct netdev_flow_key mask;
3425 struct dpcls *cls;
3426 size_t unit;
3427
3428 /* Make sure in_port is exact matched before we read it. */
3429 ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3430 odp_port_t in_port = match->flow.in_port.odp_port;
3431
3432 /* As we select the dpcls based on the port number, each netdev flow
3433 * belonging to the same dpcls will have the same odp_port value.
3434 * For performance reasons we wildcard odp_port here in the mask. In the
3435 * typical case dp_hash is also wildcarded, and the resulting 8-byte
3436 * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3437 * will not be part of the subtable mask.
3438 * This will speed up the hash computation during dpcls_lookup() because
3439 * there is one less call to hash_add64() in this case. */
3440 match->wc.masks.in_port.odp_port = 0;
3441 netdev_flow_mask_init(&mask, match);
3442 match->wc.masks.in_port.odp_port = ODPP_NONE;
3443
3444 /* Make sure wc does not have metadata. */
3445 ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3446 && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3447
3448 /* Do not allocate extra space. */
3449 flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3450 memset(&flow->stats, 0, sizeof flow->stats);
3451 flow->dead = false;
3452 flow->batch = NULL;
3453 flow->mark = INVALID_FLOW_MARK;
3454 *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3455 *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3456 *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3457 ovs_refcount_init(&flow->ref_cnt);
3458 ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3459
3460 dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3461 netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3462
3463 /* Select dpcls for in_port. Relies on in_port to be exact match. */
3464 cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3465 dpcls_insert(cls, &flow->cr, &mask);
3466
3467 ds_put_cstr(&extra_info, "miniflow_bits(");
3468 FLOWMAP_FOR_EACH_UNIT (unit) {
3469 if (unit) {
3470 ds_put_char(&extra_info, ',');
3471 }
3472 ds_put_format(&extra_info, "%d",
3473 count_1bits(flow->cr.mask->mf.map.bits[unit]));
3474 }
3475 ds_put_char(&extra_info, ')');
3476 flow->dp_extra_info = ds_steal_cstr(&extra_info);
3477 ds_destroy(&extra_info);
3478
3479 cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3480 dp_netdev_flow_hash(&flow->ufid));
3481
3482 queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
3483
3484 if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
3485 struct ds ds = DS_EMPTY_INITIALIZER;
3486 struct ofpbuf key_buf, mask_buf;
3487 struct odp_flow_key_parms odp_parms = {
3488 .flow = &match->flow,
3489 .mask = &match->wc.masks,
3490 .support = dp_netdev_support,
3491 };
3492
3493 ofpbuf_init(&key_buf, 0);
3494 ofpbuf_init(&mask_buf, 0);
3495
3496 odp_flow_key_from_flow(&odp_parms, &key_buf);
3497 odp_parms.key_buf = &key_buf;
3498 odp_flow_key_from_mask(&odp_parms, &mask_buf);
3499
3500 ds_put_cstr(&ds, "flow_add: ");
3501 odp_format_ufid(ufid, &ds);
3502 ds_put_cstr(&ds, " mega_");
3503 odp_format_ufid(&flow->mega_ufid, &ds);
3504 ds_put_cstr(&ds, " ");
3505 odp_flow_format(key_buf.data, key_buf.size,
3506 mask_buf.data, mask_buf.size,
3507 NULL, &ds, false);
3508 ds_put_cstr(&ds, ", actions:");
3509 format_odp_actions(&ds, actions, actions_len, NULL);
3510
3511 VLOG_DBG("%s", ds_cstr(&ds));
3512
3513 ofpbuf_uninit(&key_buf);
3514 ofpbuf_uninit(&mask_buf);
3515
3516 /* Add a printout of the actual match installed. */
3517 struct match m;
3518 ds_clear(&ds);
3519 ds_put_cstr(&ds, "flow match: ");
3520 miniflow_expand(&flow->cr.flow.mf, &m.flow);
3521 miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
3522 memset(&m.tun_md, 0, sizeof m.tun_md);
3523 match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
3524
3525 VLOG_DBG("%s", ds_cstr(&ds));
3526
3527 ds_destroy(&ds);
3528 }
3529
3530 return flow;
3531 }
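
/* Sketch of the effect of the in_port trick above: with in_port wildcarded in
 * 'mask', the {dp_hash, in_port} u64 chunk is usually all zero, so
 * netdev_flow_mask_init() leaves it out of the subtable mask and
 * dpcls_lookup() hashes one u64 less per packet.  Correctness is preserved
 * because the dpcls instance itself is already selected by in_port:
 *
 *     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
 *     dpcls_lookup(cls, &key, &rule, 1, NULL);
 */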
3532
3533 static int
3534 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3535 struct netdev_flow_key *key,
3536 struct match *match,
3537 ovs_u128 *ufid,
3538 const struct dpif_flow_put *put,
3539 struct dpif_flow_stats *stats)
3540 {
3541 struct dp_netdev_flow *netdev_flow;
3542 int error = 0;
3543
3544 if (stats) {
3545 memset(stats, 0, sizeof *stats);
3546 }
3547
3548 ovs_mutex_lock(&pmd->flow_mutex);
3549 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3550 if (!netdev_flow) {
3551 if (put->flags & DPIF_FP_CREATE) {
3552 dp_netdev_flow_add(pmd, match, ufid, put->actions,
3553 put->actions_len);
3554 } else {
3555 error = ENOENT;
3556 }
3557 } else {
3558 if (put->flags & DPIF_FP_MODIFY) {
3559 struct dp_netdev_actions *new_actions;
3560 struct dp_netdev_actions *old_actions;
3561
3562 new_actions = dp_netdev_actions_create(put->actions,
3563 put->actions_len);
3564
3565 old_actions = dp_netdev_flow_get_actions(netdev_flow);
3566 ovsrcu_set(&netdev_flow->actions, new_actions);
3567
3568 queue_netdev_flow_put(pmd, netdev_flow, match,
3569 put->actions, put->actions_len);
3570
3571 if (stats) {
3572 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3573 }
3574 if (put->flags & DPIF_FP_ZERO_STATS) {
3575             /* XXX: The userspace datapath uses thread-local statistics
3576              * (for flows), which should be updated only by the owning
3577              * thread.  Since we cannot write to the stats memory here,
3578              * we choose not to support this flag.  Please note:
3579              * - This feature is currently used only by dpctl commands with
3580              *   option --clear.
3581              * - Should the need arise, this operation can be implemented
3582              *   by keeping a base value (to be updated here) for each
3583              *   counter, and subtracting it before outputting the stats. */
3584 error = EOPNOTSUPP;
3585 }
3586
3587 ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3588 } else if (put->flags & DPIF_FP_CREATE) {
3589 error = EEXIST;
3590 } else {
3591 /* Overlapping flow. */
3592 error = EINVAL;
3593 }
3594 }
3595 ovs_mutex_unlock(&pmd->flow_mutex);
3596 return error;
3597 }
3598
3599 static int
3600 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3601 {
3602 struct dp_netdev *dp = get_dp_netdev(dpif);
3603 struct netdev_flow_key key, mask;
3604 struct dp_netdev_pmd_thread *pmd;
3605 struct match match;
3606 ovs_u128 ufid;
3607 int error;
3608 bool probe = put->flags & DPIF_FP_PROBE;
3609
3610 if (put->stats) {
3611 memset(put->stats, 0, sizeof *put->stats);
3612 }
3613 error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3614 probe);
3615 if (error) {
3616 return error;
3617 }
3618 error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3619 put->mask, put->mask_len,
3620 &match.flow, &match.wc, probe);
3621 if (error) {
3622 return error;
3623 }
3624
3625 if (put->ufid) {
3626 ufid = *put->ufid;
3627 } else {
3628 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3629 }
3630
3631 /* The Netlink encoding of datapath flow keys cannot express
3632 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3633 * tag is interpreted as exact match on the fact that there is no
3634 * VLAN. Unless we refactor a lot of code that translates between
3635 * Netlink and struct flow representations, we have to do the same
3636 * here. This must be in sync with 'match' in handle_packet_upcall(). */
3637 if (!match.wc.masks.vlans[0].tci) {
3638 match.wc.masks.vlans[0].tci = htons(0xffff);
3639 }
3640
3641 /* Must produce a netdev_flow_key for lookup.
3642 * Use the same method as employed to create the key when adding
3643 * the flow to the dpcls to make sure they match. */
3644 netdev_flow_mask_init(&mask, &match);
3645 netdev_flow_key_init_masked(&key, &match.flow, &mask);
3646
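/* PMD_ID_NULL means the operation applies to every PMD thread: the flow is
 * put on each of them and, on success, the per-thread statistics are
 * aggregated into 'put->stats' (packet and byte counts are summed, 'used'
 * takes the most recent timestamp and 'tcp_flags' the union of the flags). */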
3647 if (put->pmd_id == PMD_ID_NULL) {
3648 if (cmap_count(&dp->poll_threads) == 0) {
3649 return EINVAL;
3650 }
3651 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3652 struct dpif_flow_stats pmd_stats;
3653 int pmd_error;
3654
3655 pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3656 &pmd_stats);
3657 if (pmd_error) {
3658 error = pmd_error;
3659 } else if (put->stats) {
3660 put->stats->n_packets += pmd_stats.n_packets;
3661 put->stats->n_bytes += pmd_stats.n_bytes;
3662 put->stats->used = MAX(put->stats->used, pmd_stats.used);
3663 put->stats->tcp_flags |= pmd_stats.tcp_flags;
3664 }
3665 }
3666 } else {
3667 pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3668 if (!pmd) {
3669 return EINVAL;
3670 }
3671 error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3672 dp_netdev_pmd_unref(pmd);
3673 }
3674
3675 return error;
3676 }
3677
3678 static int
3679 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3680 struct dpif_flow_stats *stats,
3681 const struct dpif_flow_del *del)
3682 {
3683 struct dp_netdev_flow *netdev_flow;
3684 int error = 0;
3685
3686 ovs_mutex_lock(&pmd->flow_mutex);
3687 netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3688 del->key_len);
3689 if (netdev_flow) {
3690 if (stats) {
3691 get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3692 }
3693 dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3694 } else {
3695 error = ENOENT;
3696 }
3697 ovs_mutex_unlock(&pmd->flow_mutex);
3698
3699 return error;
3700 }
3701
3702 static int
3703 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3704 {
3705 struct dp_netdev *dp = get_dp_netdev(dpif);
3706 struct dp_netdev_pmd_thread *pmd;
3707 int error = 0;
3708
3709 if (del->stats) {
3710 memset(del->stats, 0, sizeof *del->stats);
3711 }
3712
3713 if (del->pmd_id == PMD_ID_NULL) {
3714 if (cmap_count(&dp->poll_threads) == 0) {
3715 return EINVAL;
3716 }
3717 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3718 struct dpif_flow_stats pmd_stats;
3719 int pmd_error;
3720
3721 pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3722 if (pmd_error) {
3723 error = pmd_error;
3724 } else if (del->stats) {
3725 del->stats->n_packets += pmd_stats.n_packets;
3726 del->stats->n_bytes += pmd_stats.n_bytes;
3727 del->stats->used = MAX(del->stats->used, pmd_stats.used);
3728 del->stats->tcp_flags |= pmd_stats.tcp_flags;
3729 }
3730 }
3731 } else {
3732 pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3733 if (!pmd) {
3734 return EINVAL;
3735 }
3736 error = flow_del_on_pmd(pmd, del->stats, del);
3737 dp_netdev_pmd_unref(pmd);
3738 }
3739
3740
3741 return error;
3742 }
3743
3744 struct dpif_netdev_flow_dump {
3745 struct dpif_flow_dump up;
3746 struct cmap_position poll_thread_pos;
3747 struct cmap_position flow_pos;
3748 struct dp_netdev_pmd_thread *cur_pmd;
3749 int status;
3750 struct ovs_mutex mutex;
3751 };
3752
3753 static struct dpif_netdev_flow_dump *
3754 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3755 {
3756 return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3757 }
3758
3759 static struct dpif_flow_dump *
3760 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3761 struct dpif_flow_dump_types *types OVS_UNUSED)
3762 {
3763 struct dpif_netdev_flow_dump *dump;
3764
3765 dump = xzalloc(sizeof *dump);
3766 dpif_flow_dump_init(&dump->up, dpif_);
3767 dump->up.terse = terse;
3768 ovs_mutex_init(&dump->mutex);
3769
3770 return &dump->up;
3771 }
3772
3773 static int
3774 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3775 {
3776 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3777
3778 ovs_mutex_destroy(&dump->mutex);
3779 free(dump);
3780 return 0;
3781 }
3782
3783 struct dpif_netdev_flow_dump_thread {
3784 struct dpif_flow_dump_thread up;
3785 struct dpif_netdev_flow_dump *dump;
3786 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3787 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3788 };
3789
3790 static struct dpif_netdev_flow_dump_thread *
3791 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3792 {
3793 return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3794 }
3795
3796 static struct dpif_flow_dump_thread *
3797 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3798 {
3799 struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3800 struct dpif_netdev_flow_dump_thread *thread;
3801
3802 thread = xmalloc(sizeof *thread);
3803 dpif_flow_dump_thread_init(&thread->up, &dump->up);
3804 thread->dump = dump;
3805 return &thread->up;
3806 }
3807
3808 static void
3809 dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3810 {
3811 struct dpif_netdev_flow_dump_thread *thread
3812 = dpif_netdev_flow_dump_thread_cast(thread_);
3813
3814 free(thread);
3815 }
3816
3817 static int
3818 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3819 struct dpif_flow *flows, int max_flows)
3820 {
3821 struct dpif_netdev_flow_dump_thread *thread
3822 = dpif_netdev_flow_dump_thread_cast(thread_);
3823 struct dpif_netdev_flow_dump *dump = thread->dump;
3824 struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3825 struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
3826 struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3827 int n_flows = 0;
3828 int i;
3829
3830 ovs_mutex_lock(&dump->mutex);
3831 if (!dump->status) {
3832 struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3833 int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3834
3835 /* The first call to dump_next() extracts the first pmd thread.
3836 * If there is no pmd thread, return immediately. */
3837 if (!pmd) {
3838 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3839 if (!pmd) {
3840 ovs_mutex_unlock(&dump->mutex);
3841 return n_flows;
3843 }
3844 }
3845
3846 do {
3847 for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3848 struct cmap_node *node;
3849
3850 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3851 if (!node) {
3852 break;
3853 }
3854 netdev_flows[n_flows] = CONTAINER_OF(node,
3855 struct dp_netdev_flow,
3856 node);
3857 }
3858 /* When we finish dumping the current pmd thread, move on to
3859 * the next one. */
3860 if (n_flows < flow_limit) {
3861 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3862 dp_netdev_pmd_unref(pmd);
3863 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3864 if (!pmd) {
3865 dump->status = EOF;
3866 break;
3867 }
3868 }
3869 /* Keep the reference for the next call. */
3870 dump->cur_pmd = pmd;
3871
3872 /* If the current dump is empty, do not exit the loop, since the
3873 * remaining pmds could have flows to be dumped. Just dump again
3874 * on the new 'pmd'. */
3875 } while (!n_flows);
3876 }
3877 ovs_mutex_unlock(&dump->mutex);
3878
3879 for (i = 0; i < n_flows; i++) {
3880 struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3881 struct odputil_keybuf *keybuf = &thread->keybuf[i];
3882 struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3883 struct dpif_flow *f = &flows[i];
3884 struct ofpbuf key, mask;
3885
3886 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3887 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3888 dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
3889 dump->up.terse);
3890 }
3891
3892 return n_flows;
3893 }
3894
3895 static int
3896 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3897 OVS_NO_THREAD_SAFETY_ANALYSIS
3898 {
3899 struct dp_netdev *dp = get_dp_netdev(dpif);
3900 struct dp_netdev_pmd_thread *pmd;
3901 struct dp_packet_batch pp;
3902
3903 if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3904 dp_packet_size(execute->packet) > UINT16_MAX) {
3905 return EINVAL;
3906 }
3907
3908 /* Try to find the 'pmd'. If NULL is returned, that means
3909 * the current thread is a non-pmd thread and should use
3910 * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3911 pmd = ovsthread_getspecific(dp->per_pmd_key);
3912 if (!pmd) {
3913 pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3914 if (!pmd) {
3915 return EBUSY;
3916 }
3917 }
3918
3919 if (execute->probe) {
3920 /* If this is part of a probe, drop the packet, since executing
3921 * the action may actually cause spurious packets to be sent into
3922 * the network. */
3923 if (pmd->core_id == NON_PMD_CORE_ID) {
3924 dp_netdev_pmd_unref(pmd);
3925 }
3926 return 0;
3927 }
3928
3929 /* If the current thread is a non-pmd thread, acquire
3930 * the 'non_pmd_mutex'. */
3931 if (pmd->core_id == NON_PMD_CORE_ID) {
3932 ovs_mutex_lock(&dp->non_pmd_mutex);
3933 }
3934
3935 /* Update current time in PMD context. We don't care about EMC insertion
3936 * probability, because we are on a slow path. */
3937 pmd_thread_ctx_time_update(pmd);
3938
3939 /* The action processing expects the RSS hash to be valid, because
3940 * it's always initialized at the beginning of datapath processing.
3941 * In this case, though, 'execute->packet' may not have gone through
3942 * the datapath at all; it may have been generated by the upper layer
3943 * (OpenFlow packet-out, BFD frame, ...). */
3944 if (!dp_packet_rss_valid(execute->packet)) {
3945 dp_packet_set_rss_hash(execute->packet,
3946 flow_hash_5tuple(execute->flow, 0));
3947 }
3948
3949 dp_packet_batch_init_packet(&pp, execute->packet);
3950 pp.do_not_steal = true;
3951 dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3952 execute->actions, execute->actions_len);
3953 dp_netdev_pmd_flush_output_packets(pmd, true);
3954
3955 if (pmd->core_id == NON_PMD_CORE_ID) {
3956 ovs_mutex_unlock(&dp->non_pmd_mutex);
3957 dp_netdev_pmd_unref(pmd);
3958 }
3959
3960 return 0;
3961 }
3962
3963 static void
3964 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
3965 enum dpif_offload_type offload_type OVS_UNUSED)
3966 {
3967 size_t i;
3968
3969 for (i = 0; i < n_ops; i++) {
3970 struct dpif_op *op = ops[i];
3971
3972 switch (op->type) {
3973 case DPIF_OP_FLOW_PUT:
3974 op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3975 break;
3976
3977 case DPIF_OP_FLOW_DEL:
3978 op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3979 break;
3980
3981 case DPIF_OP_EXECUTE:
3982 op->error = dpif_netdev_execute(dpif, &op->execute);
3983 break;
3984
3985 case DPIF_OP_FLOW_GET:
3986 op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3987 break;
3988 }
3989 }
3990 }
3991
3992 /* Enable or Disable PMD auto load balancing. */
3993 static void
3994 set_pmd_auto_lb(struct dp_netdev *dp)
3995 {
3996 unsigned int cnt = 0;
3997 struct dp_netdev_pmd_thread *pmd;
3998 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3999
4000 bool enable_alb = false;
4001 bool multi_rxq = false;
4002 bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
4003
4004 /* Ensure that there are at least 2 non-isolated PMDs and that
4005 * one of them is polling more than one rxq. */
4006 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4007 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4008 continue;
4009 }
4010
4011 if (hmap_count(&pmd->poll_list) > 1) {
4012 multi_rxq = true;
4013 }
4014 if (cnt && multi_rxq) {
4015 enable_alb = true;
4016 break;
4017 }
4018 cnt++;
4019 }
4020
4021 /* Enable auto LB only if it is requested and cycle-based assignment is in use. */
4022 enable_alb = enable_alb && pmd_rxq_assign_cyc &&
4023 pmd_alb->auto_lb_requested;
4024
4025 if (pmd_alb->is_enabled != enable_alb) {
4026 pmd_alb->is_enabled = enable_alb;
4027 if (pmd_alb->is_enabled) {
4028 VLOG_INFO("PMD auto load balance is enabled "
4029 "(with rebalance interval:%"PRIu64" msec)",
4030 pmd_alb->rebalance_intvl);
4031 } else {
4032 pmd_alb->rebalance_poll_timer = 0;
4033 VLOG_INFO("PMD auto load balance is disabled");
4034 }
4035 }
4036
4037 }
4038
4039 /* Applies datapath configuration from the database. Some of the changes are
4040 * actually applied in dpif_netdev_run(). */
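/* Illustrative examples (not exhaustive): the options read below live in the
 * 'other_config' column of the Open_vSwitch table and can be set with
 * commands such as
 *     ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x6
 *     ovs-vsctl set Open_vSwitch . other_config:emc-insert-inv-prob=20
 *     ovs-vsctl set Open_vSwitch . other_config:pmd-auto-lb=true
 * The key names match the smap_get*() calls in this function. */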
4041 static int
4042 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
4043 {
4044 struct dp_netdev *dp = get_dp_netdev(dpif);
4045 const char *cmask = smap_get(other_config, "pmd-cpu-mask");
4046 const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4047 "cycles");
4048 unsigned long long insert_prob =
4049 smap_get_ullong(other_config, "emc-insert-inv-prob",
4050 DEFAULT_EM_FLOW_INSERT_INV_PROB);
4051 uint32_t insert_min, cur_min;
4052 uint32_t tx_flush_interval, cur_tx_flush_interval;
4053 uint64_t rebalance_intvl;
4054
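/* 'tx-flush-interval' is the maximum time, in microseconds, that output
 * packets may sit in a per-port output batch before being flushed; the
 * default of 0 (DEFAULT_TX_FLUSH_INTERVAL) means packets are sent
 * immediately. */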
4055 tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4056 DEFAULT_TX_FLUSH_INTERVAL);
4057 atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4058 if (tx_flush_interval != cur_tx_flush_interval) {
4059 atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4060 VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4061 tx_flush_interval);
4062 }
4063
4064 if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4065 free(dp->pmd_cmask);
4066 dp->pmd_cmask = nullable_xstrdup(cmask);
4067 dp_netdev_request_reconfigure(dp);
4068 }
4069
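/* 'emc-insert-inv-prob' is the inverse probability N of EMC insertion:
 * flows are inserted into the EMC with probability ~1/N, implemented by
 * comparing against 'insert_min' = UINT32_MAX / N.  For example, N = 100
 * gives roughly a 1% insertion probability, while N = 0 disables EMC
 * insertion entirely. */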
4070 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4071 if (insert_prob <= UINT32_MAX) {
4072 insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4073 } else {
4074 insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4075 insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4076 }
4077
4078 if (insert_min != cur_min) {
4079 atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4080 if (insert_min == 0) {
4081 VLOG_INFO("EMC insertion probability changed to zero");
4082 } else {
4083 VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4084 insert_prob, (100 / (float)insert_prob));
4085 }
4086 }
4087
4088 bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4089 bool cur_perf_enabled;
4090 atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4091 if (perf_enabled != cur_perf_enabled) {
4092 atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4093 if (perf_enabled) {
4094 VLOG_INFO("PMD performance metrics collection enabled");
4095 } else {
4096 VLOG_INFO("PMD performance metrics collection disabled");
4097 }
4098 }
4099
4100 bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4101 bool cur_smc;
4102 atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4103 if (smc_enable != cur_smc) {
4104 atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4105 if (smc_enable) {
4106 VLOG_INFO("SMC cache is enabled");
4107 } else {
4108 VLOG_INFO("SMC cache is disabled");
4109 }
4110 }
4111
4112 bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles");
4113 if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) {
4114 VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. "
4115 "Defaulting to 'cycles'.");
4116 pmd_rxq_assign_cyc = true;
4117 pmd_rxq_assign = "cycles";
4118 }
4119 if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) {
4120 dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc;
4121 VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4122 pmd_rxq_assign);
4123 dp_netdev_request_reconfigure(dp);
4124 }
4125
4126 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4127 pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
4128 false);
4129
4130 rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
4131 ALB_PMD_REBALANCE_POLL_INTERVAL);
4132
4133 /* Input is in min, convert it to msec. */
4134 rebalance_intvl =
4135 rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
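/* For example, pmd-auto-lb-rebal-interval=5 becomes 5 * 60000 = 300000 msec,
 * and a configured value of 0 falls back to the one minute default. */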
4136
4137 if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4138 pmd_alb->rebalance_intvl = rebalance_intvl;
4139 }
4140
4141 set_pmd_auto_lb(dp);
4142 return 0;
4143 }
4144
4145 /* Parses affinity list and returns result in 'core_ids'. */
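/* The list is a comma-separated set of "rxq:core" pairs; for example, the
 * (illustrative) string "0:3,1:7" pins rx queue 0 to core 3 and rx queue 1
 * to core 7.  Pairs whose rxq id is >= 'n_rxq' are silently ignored. */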
4146 static int
4147 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4148 {
4149 unsigned i;
4150 char *list, *copy, *key, *value;
4151 int error = 0;
4152
4153 for (i = 0; i < n_rxq; i++) {
4154 core_ids[i] = OVS_CORE_UNSPEC;
4155 }
4156
4157 if (!affinity_list) {
4158 return 0;
4159 }
4160
4161 list = copy = xstrdup(affinity_list);
4162
4163 while (ofputil_parse_key_value(&list, &key, &value)) {
4164 int rxq_id, core_id;
4165
4166 if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4167 || !str_to_int(value, 0, &core_id) || core_id < 0) {
4168 error = EINVAL;
4169 break;
4170 }
4171
4172 if (rxq_id < n_rxq) {
4173 core_ids[rxq_id] = core_id;
4174 }
4175 }
4176
4177 free(copy);
4178 return error;
4179 }
4180
4181 /* Parses 'affinity_list' and applies configuration if it is valid. */
4182 static int
4183 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4184 const char *affinity_list)
4185 {
4186 unsigned *core_ids, i;
4187 int error = 0;
4188
4189 core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4190 if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4191 error = EINVAL;
4192 goto exit;
4193 }
4194
4195 for (i = 0; i < port->n_rxq; i++) {
4196 port->rxqs[i].core_id = core_ids[i];
4197 }
4198
4199 exit:
4200 free(core_ids);
4201 return error;
4202 }
4203
4204 /* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4205 * of the given PMD thread. */
4206 static bool
4207 dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4208 struct dp_netdev_port *port)
4209 OVS_EXCLUDED(pmd->port_mutex)
4210 {
4211 struct rxq_poll *poll;
4212 bool found = false;
4213
4214 ovs_mutex_lock(&pmd->port_mutex);
4215 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4216 if (port == poll->rxq->port) {
4217 found = true;
4218 break;
4219 }
4220 }
4221 ovs_mutex_unlock(&pmd->port_mutex);
4222 return found;
4223 }
4224
4225 /* Updates port configuration from the database. The changes are actually
4226 * applied in dpif_netdev_run(). */
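/* Illustrative example: the per-interface options handled here can be set
 * with commands such as
 *     ovs-vsctl set Interface dpdk0 other_config:emc-enable=false
 *     ovs-vsctl set Interface dpdk0 other_config:pmd-rxq-affinity="0:3,1:7"
 * where "dpdk0" is a hypothetical port name. */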
4227 static int
4228 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4229 const struct smap *cfg)
4230 {
4231 struct dp_netdev *dp = get_dp_netdev(dpif);
4232 struct dp_netdev_port *port;
4233 int error = 0;
4234 const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4235 bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4236
4237 ovs_mutex_lock(&dp->port_mutex);
4238 error = get_port_by_number(dp, port_no, &port);
4239 if (error) {
4240 goto unlock;
4241 }
4242
4243 if (emc_enabled != port->emc_enabled) {
4244 struct dp_netdev_pmd_thread *pmd;
4245 struct ds ds = DS_EMPTY_INITIALIZER;
4246 uint32_t cur_min, insert_prob;
4247
4248 port->emc_enabled = emc_enabled;
4249 /* Mark for reload all the threads that poll this port and request
4250 * a reconfiguration to actually reload the threads. */
4251 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4252 if (dpif_netdev_pmd_polls_port(pmd, port)) {
4253 pmd->need_reload = true;
4254 }
4255 }
4256 dp_netdev_request_reconfigure(dp);
4257
4258 ds_put_format(&ds, "%s: EMC has been %s.",
4259 netdev_get_name(port->netdev),
4260 (emc_enabled) ? "enabled" : "disabled");
4261 if (emc_enabled) {
4262 ds_put_cstr(&ds, " Current insertion probability is ");
4263 atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4264 if (!cur_min) {
4265 ds_put_cstr(&ds, "zero.");
4266 } else {
4267 insert_prob = UINT32_MAX / cur_min;
4268 ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4269 insert_prob, 100 / (float) insert_prob);
4270 }
4271 }
4272 VLOG_INFO("%s", ds_cstr(&ds));
4273 ds_destroy(&ds);
4274 }
4275
4276 /* Check for Rxq affinity changes. */
4277 if (!netdev_is_pmd(port->netdev)
4278 || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4279 goto unlock;
4280 }
4281
4282 error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4283 if (error) {
4284 goto unlock;
4285 }
4286 free(port->rxq_affinity_list);
4287 port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4288
4289 dp_netdev_request_reconfigure(dp);
4290 unlock:
4291 ovs_mutex_unlock(&dp->port_mutex);
4292 return error;
4293 }
4294
4295 static int
4296 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4297 uint32_t queue_id, uint32_t *priority)
4298 {
4299 *priority = queue_id;
4300 return 0;
4301 }
4302
4303 \f
4304 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4305 * a copy of the 'size' bytes of 'actions' input parameters. */
4306 struct dp_netdev_actions *
4307 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4308 {
4309 struct dp_netdev_actions *netdev_actions;
4310
4311 netdev_actions = xmalloc(sizeof *netdev_actions + size);
4312 memcpy(netdev_actions->actions, actions, size);
4313 netdev_actions->size = size;
4314
4315 return netdev_actions;
4316 }
4317
4318 struct dp_netdev_actions *
4319 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4320 {
4321 return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4322 }
4323
4324 static void
4325 dp_netdev_actions_free(struct dp_netdev_actions *actions)
4326 {
4327 free(actions);
4328 }
4329 \f
4330 static void
4331 dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4332 enum rxq_cycles_counter_type type,
4333 unsigned long long cycles)
4334 {
4335 atomic_store_relaxed(&rx->cycles[type], cycles);
4336 }
4337
4338 static void
4339 dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4340 enum rxq_cycles_counter_type type,
4341 unsigned long long cycles)
4342 {
4343 non_atomic_ullong_add(&rx->cycles[type], cycles);
4344 }
4345
4346 static uint64_t
4347 dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4348 enum rxq_cycles_counter_type type)
4349 {
4350 unsigned long long processing_cycles;
4351 atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4352 return processing_cycles;
4353 }
4354
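/* Stores the latest measured interval for 'rx' in a fixed-size circular
 * buffer: 'intrvl_idx' keeps incrementing and the slot is chosen modulo
 * PMD_RXQ_INTERVAL_MAX, so once the buffer is full the oldest sample is
 * overwritten. */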
4355 static void
4356 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4357 unsigned long long cycles)
4358 {
4359 unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
4360 atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4361 }
4362
4363 static uint64_t
4364 dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4365 {
4366 unsigned long long processing_cycles;
4367 atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4368 return processing_cycles;
4369 }
4370
4371 #if ATOMIC_ALWAYS_LOCK_FREE_8B
4372 static inline bool
4373 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4374 {
4375 bool pmd_perf_enabled;
4376 atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4377 return pmd_perf_enabled;
4378 }
4379 #else
4380 /* If stores and reads of 64-bit integers are not atomic, the full PMD
4381 * performance metrics are not available as locked access to 64 bit
4382 * integers would be prohibitively expensive. */
4383 static inline bool
4384 pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4385 {
4386 return false;
4387 }
4388 #endif
4389
4390 static int
4391 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4392 struct tx_port *p)
4393 {
4394 int i;
4395 int tx_qid;
4396 int output_cnt;
4397 bool dynamic_txqs;
4398 struct cycle_timer timer;
4399 uint64_t cycles;
4400 uint32_t tx_flush_interval;
4401
4402 cycle_timer_start(&pmd->perf_stats, &timer);
4403
4404 dynamic_txqs = p->port->dynamic_txqs;
4405 if (dynamic_txqs) {
4406 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4407 } else {
4408 tx_qid = pmd->static_tx_qid;
4409 }
4410
4411 output_cnt = dp_packet_batch_size(&p->output_pkts);
4412 ovs_assert(output_cnt > 0);
4413
4414 netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
4415 dp_packet_batch_init(&p->output_pkts);
4416
4417 /* Update time of the next flush. */
4418 atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4419 p->flush_time = pmd->ctx.now + tx_flush_interval;
4420
4421 ovs_assert(pmd->n_output_batches > 0);
4422 pmd->n_output_batches--;
4423
4424 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4425 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4426
4427 /* Distribute send cycles evenly among transmitted packets and assign to
4428 * their respective rx queues. */
4429 cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4430 for (i = 0; i < output_cnt; i++) {
4431 if (p->output_pkts_rxqs[i]) {
4432 dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4433 RXQ_CYCLES_PROC_CURR, cycles);
4434 }
4435 }
4436
4437 return output_cnt;
4438 }
4439
4440 static int
4441 dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4442 bool force)
4443 {
4444 struct tx_port *p;
4445 int output_cnt = 0;
4446
4447 if (!pmd->n_output_batches) {
4448 return 0;
4449 }
4450
4451 HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4452 if (!dp_packet_batch_is_empty(&p->output_pkts)
4453 && (force || pmd->ctx.now >= p->flush_time)) {
4454 output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4455 }
4456 }
4457 return output_cnt;
4458 }
4459
4460 static int
4461 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4462 struct dp_netdev_rxq *rxq,
4463 odp_port_t port_no)
4464 {
4465 struct pmd_perf_stats *s = &pmd->perf_stats;
4466 struct dp_packet_batch batch;
4467 struct cycle_timer timer;
4468 int error;
4469 int batch_cnt = 0;
4470 int rem_qlen = 0, *qlen_p = NULL;
4471 uint64_t cycles;
4472
4473 /* Measure duration for polling and processing rx burst. */
4474 cycle_timer_start(&pmd->perf_stats, &timer);
4475
4476 pmd->ctx.last_rxq = rxq;
4477 dp_packet_batch_init(&batch);
4478
4479 /* Fetch the rx queue length only for vhostuser ports. */
4480 if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4481 qlen_p = &rem_qlen;
4482 }
4483
4484 error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4485 if (!error) {
4486 /* At least one packet received. */
4487 *recirc_depth_get() = 0;
4488 pmd_thread_ctx_time_update(pmd);
4489 batch_cnt = dp_packet_batch_size(&batch);
4490 if (pmd_perf_metrics_enabled(pmd)) {
4491 /* Update batch histogram. */
4492 s->current.batches++;
4493 histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4494 /* Update the maximum vhost rx queue fill level. */
4495 if (rxq->is_vhost && rem_qlen >= 0) {
4496 uint32_t qfill = batch_cnt + rem_qlen;
4497 if (qfill > s->current.max_vhost_qfill) {
4498 s->current.max_vhost_qfill = qfill;
4499 }
4500 }
4501 }
4502 /* Process packet batch. */
4503 dp_netdev_input(pmd, &batch, port_no);
4504
4505 /* Assign processing cycles to rx queue. */
4506 cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4507 dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4508
4509 dp_netdev_pmd_flush_output_packets(pmd, false);
4510 } else {
4511 /* Discard cycles. */
4512 cycle_timer_stop(&pmd->perf_stats, &timer);
4513 if (error != EAGAIN && error != EOPNOTSUPP) {
4514 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4515
4516 VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4517 netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4518 }
4519 }
4520
4521 pmd->ctx.last_rxq = NULL;
4522
4523 return batch_cnt;
4524 }
4525
4526 static struct tx_port *
4527 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4528 {
4529 struct tx_port *tx;
4530
4531 HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4532 if (tx->port->port_no == port_no) {
4533 return tx;
4534 }
4535 }
4536
4537 return NULL;
4538 }
4539
4540 static struct tx_bond *
4541 tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4542 {
4543 uint32_t hash = hash_bond_id(bond_id);
4544 struct tx_bond *tx;
4545
4546 CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4547 if (tx->bond_id == bond_id) {
4548 return tx;
4549 }
4550 }
4551 return NULL;
4552 }
4553
4554 static int
4555 port_reconfigure(struct dp_netdev_port *port)
4556 {
4557 struct netdev *netdev = port->netdev;
4558 int i, err;
4559
4560 /* Closes the existing 'rxq's. */
4561 for (i = 0; i < port->n_rxq; i++) {
4562 netdev_rxq_close(port->rxqs[i].rx);
4563 port->rxqs[i].rx = NULL;
4564 }
4565 unsigned last_nrxq = port->n_rxq;
4566 port->n_rxq = 0;
4567
4568 /* Allows 'netdev' to apply the pending configuration changes. */
4569 if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4570 err = netdev_reconfigure(netdev);
4571 if (err && (err != EOPNOTSUPP)) {
4572 VLOG_ERR("Failed to set interface %s new configuration",
4573 netdev_get_name(netdev));
4574 return err;
4575 }
4576 }
4577 /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4578 port->rxqs = xrealloc(port->rxqs,
4579 sizeof *port->rxqs * netdev_n_rxq(netdev));
4580 /* Realloc 'used' counters for tx queues. */
4581 free(port->txq_used);
4582 port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4583
4584 for (i = 0; i < netdev_n_rxq(netdev); i++) {
4585 bool new_queue = i >= last_nrxq;
4586 if (new_queue) {
4587 memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4588 }
4589
4590 port->rxqs[i].port = port;
4591 port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4592
4593 err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4594 if (err) {
4595 return err;
4596 }
4597 port->n_rxq++;
4598 }
4599
4600 /* Parse affinity list to apply configuration for new queues. */
4601 dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4602
4603 /* If reconfiguration was successful, mark it as such, so we can use it. */
4604 port->need_reconfigure = false;
4605
4606 return 0;
4607 }
4608
4609 struct rr_numa_list {
4610 struct hmap numas; /* Contains 'struct rr_numa' */
4611 };
4612
4613 struct rr_numa {
4614 struct hmap_node node;
4615
4616 int numa_id;
4617
4618 /* Non-isolated pmds on numa node 'numa_id'. */
4619 struct dp_netdev_pmd_thread **pmds;
4620 int n_pmds;
4621
4622 int cur_index;
4623 bool idx_inc;
4624 };
4625
4626 static struct rr_numa *
4627 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
4628 {
4629 struct rr_numa *numa;
4630
4631 HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
4632 if (numa->numa_id == numa_id) {
4633 return numa;
4634 }
4635 }
4636
4637 return NULL;
4638 }
4639
4640 /* Returns the next node in numa list following 'numa' in round-robin fashion.
4641 * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
4642 * Returns NULL if 'rr' numa list is empty. */
4643 static struct rr_numa *
4644 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
4645 {
4646 struct hmap_node *node = NULL;
4647
4648 if (numa) {
4649 node = hmap_next(&rr->numas, &numa->node);
4650 }
4651 if (!node) {
4652 node = hmap_first(&rr->numas);
4653 }
4654
4655 return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
4656 }
4657
4658 static void
4659 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
4660 {
4661 struct dp_netdev_pmd_thread *pmd;
4662 struct rr_numa *numa;
4663
4664 hmap_init(&rr->numas);
4665
4666 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4667 if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
4668 continue;
4669 }
4670
4671 numa = rr_numa_list_lookup(rr, pmd->numa_id);
4672 if (!numa) {
4673 numa = xzalloc(sizeof *numa);
4674 numa->numa_id = pmd->numa_id;
4675 hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
4676 }
4677 numa->n_pmds++;
4678 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4679 numa->pmds[numa->n_pmds - 1] = pmd;
4680 /* At least one pmd, so initialise 'cur_index' and 'idx_inc'. */
4681 numa->cur_index = 0;
4682 numa->idx_inc = true;
4683 }
4684 }
4685
4686 /*
4687 * Returns the next pmd from the numa node.
4688 *
4689 * If 'updown' is 'true' it will alternate between selecting the next pmd in
4690 * either an up or down walk, switching between up/down when the first or last
4691 * core is reached. e.g. 1,2,3,3,2,1,1,2...
4692 *
4693 * If 'updown' is 'false' it will select the next pmd wrapping around when last
4694 * core reached. e.g. 1,2,3,1,2,3,1,2...
4695 */
4696 static struct dp_netdev_pmd_thread *
4697 rr_numa_get_pmd(struct rr_numa *numa, bool updown)
4698 {
4699 int numa_idx = numa->cur_index;
4700
4701 if (numa->idx_inc == true) {
4702 /* Incrementing through list of pmds. */
4703 if (numa->cur_index == numa->n_pmds-1) {
4704 /* Reached the last pmd. */
4705 if (updown) {
4706 numa->idx_inc = false;
4707 } else {
4708 numa->cur_index = 0;
4709 }
4710 } else {
4711 numa->cur_index++;
4712 }
4713 } else {
4714 /* Decrementing through list of pmds. */
4715 if (numa->cur_index == 0) {
4716 /* Reached the first pmd. */
4717 numa->idx_inc = true;
4718 } else {
4719 numa->cur_index--;
4720 }
4721 }
4722 return numa->pmds[numa_idx];
4723 }
4724
4725 static void
4726 rr_numa_list_destroy(struct rr_numa_list *rr)
4727 {
4728 struct rr_numa *numa;
4729
4730 HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
4731 free(numa->pmds);
4732 free(numa);
4733 }
4734 hmap_destroy(&rr->numas);
4735 }
4736
4737 /* Compares Rx queues by the processing cycles they consume (descending order). */
4738 static int
4739 compare_rxq_cycles(const void *a, const void *b)
4740 {
4741 struct dp_netdev_rxq *qa;
4742 struct dp_netdev_rxq *qb;
4743 uint64_t cycles_qa, cycles_qb;
4744
4745 qa = *(struct dp_netdev_rxq **) a;
4746 qb = *(struct dp_netdev_rxq **) b;
4747
4748 cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4749 cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4750
4751 if (cycles_qa != cycles_qb) {
4752 return (cycles_qa < cycles_qb) ? 1 : -1;
4753 } else {
4754 /* Cycles are the same so tiebreak on port/queue id.
4755 * Tiebreaking (as opposed to returning 0) ensures consistent
4756 * sort results across multiple OSes. */
4757 uint32_t port_qa = odp_to_u32(qa->port->port_no);
4758 uint32_t port_qb = odp_to_u32(qb->port->port_no);
4759 if (port_qa != port_qb) {
4760 return port_qa > port_qb ? 1 : -1;
4761 } else {
4762 return netdev_rxq_get_queue_id(qa->rx)
4763 - netdev_rxq_get_queue_id(qb->rx);
4764 }
4765 }
4766 }
4767
4768 /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
4769 * queues and marks the pmds as isolated. Otherwise, assign non isolated
4770 * pmds to unpinned queues.
4771 *
4772 * The function doesn't touch the pmd threads, it just stores the assignment
4773 * in the 'pmd' member of each rxq. */
4774 static void
4775 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
4776 {
4777 struct dp_netdev_port *port;
4778 struct rr_numa_list rr;
4779 struct rr_numa *non_local_numa = NULL;
4780 struct dp_netdev_rxq ** rxqs = NULL;
4781 int n_rxqs = 0;
4782 struct rr_numa *numa = NULL;
4783 int numa_id;
4784 bool assign_cyc = dp->pmd_rxq_assign_cyc;
4785
4786 HMAP_FOR_EACH (port, node, &dp->ports) {
4787 if (!netdev_is_pmd(port->netdev)) {
4788 continue;
4789 }
4790
4791 for (int qid = 0; qid < port->n_rxq; qid++) {
4792 struct dp_netdev_rxq *q = &port->rxqs[qid];
4793
4794 if (pinned && q->core_id != OVS_CORE_UNSPEC) {
4795 struct dp_netdev_pmd_thread *pmd;
4796
4797 pmd = dp_netdev_get_pmd(dp, q->core_id);
4798 if (!pmd) {
4799 VLOG_WARN("There is no PMD thread on core %d. Queue "
4800 "%d on port \'%s\' will not be polled.",
4801 q->core_id, qid, netdev_get_name(port->netdev));
4802 } else {
4803 q->pmd = pmd;
4804 pmd->isolated = true;
4805 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4806 "rx queue %d.", pmd->core_id, pmd->numa_id,
4807 netdev_rxq_get_name(q->rx),
4808 netdev_rxq_get_queue_id(q->rx));
4809 dp_netdev_pmd_unref(pmd);
4810 }
4811 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
4812 uint64_t cycle_hist = 0;
4813
4814 if (n_rxqs == 0) {
4815 rxqs = xmalloc(sizeof *rxqs);
4816 } else {
4817 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
4818 }
4819
4820 if (assign_cyc) {
4821 /* Sum the queue intervals and store the cycle history. */
4822 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
4823 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
4824 }
4825 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
4826 cycle_hist);
4827 }
4828 /* Store the queue. */
4829 rxqs[n_rxqs++] = q;
4830 }
4831 }
4832 }
4833
4834 if (n_rxqs > 1 && assign_cyc) {
4835 /* Sort the queues in order of the processing cycles
4836 * they consumed during their last pmd interval. */
4837 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
4838 }
4839
4840 rr_numa_list_populate(dp, &rr);
4841 /* Assign the sorted queues to pmds in round robin. */
4842 for (int i = 0; i < n_rxqs; i++) {
4843 numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
4844 numa = rr_numa_list_lookup(&rr, numa_id);
4845 if (!numa) {
4846 /* There are no pmds on the queue's local NUMA node.
4847 Round robin on the NUMA nodes that do have pmds. */
4848 non_local_numa = rr_numa_list_next(&rr, non_local_numa);
4849 if (!non_local_numa) {
4850 VLOG_ERR("There is no available (non-isolated) pmd "
4851 "thread for port \'%s\' queue %d. This queue "
4852 "will not be polled. Is pmd-cpu-mask set to "
4853 "zero? Or are all PMDs isolated to other "
4854 "queues?", netdev_rxq_get_name(rxqs[i]->rx),
4855 netdev_rxq_get_queue_id(rxqs[i]->rx));
4856 continue;
4857 }
4858 rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
4859 VLOG_WARN("There's no available (non-isolated) pmd thread "
4860 "on numa node %d. Queue %d on port \'%s\' will "
4861 "be assigned to the pmd on core %d "
4862 "(numa node %d). Expect reduced performance.",
4863 numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
4864 netdev_rxq_get_name(rxqs[i]->rx),
4865 rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
4866 } else {
4867 rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
4868 if (assign_cyc) {
4869 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4870 "rx queue %d "
4871 "(measured processing cycles %"PRIu64").",
4872 rxqs[i]->pmd->core_id, numa_id,
4873 netdev_rxq_get_name(rxqs[i]->rx),
4874 netdev_rxq_get_queue_id(rxqs[i]->rx),
4875 dp_netdev_rxq_get_cycles(rxqs[i],
4876 RXQ_CYCLES_PROC_HIST));
4877 } else {
4878 VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
4879 "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
4880 netdev_rxq_get_name(rxqs[i]->rx),
4881 netdev_rxq_get_queue_id(rxqs[i]->rx));
4882 }
4883 }
4884 }
4885
4886 rr_numa_list_destroy(&rr);
4887 free(rxqs);
4888 }
4889
4890 static void
4891 reload_affected_pmds(struct dp_netdev *dp)
4892 {
4893 struct dp_netdev_pmd_thread *pmd;
4894
4895 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4896 if (pmd->need_reload) {
4897 flow_mark_flush(pmd);
4898 dp_netdev_reload_pmd__(pmd);
4899 }
4900 }
4901
4902 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4903 if (pmd->need_reload) {
4904 if (pmd->core_id != NON_PMD_CORE_ID) {
4905 bool reload;
4906
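/* Busy-wait until the PMD thread clears 'reload'; the acquire ordering
 * makes the updates it performed while reloading visible to this thread. */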
4907 do {
4908 atomic_read_explicit(&pmd->reload, &reload,
4909 memory_order_acquire);
4910 } while (reload);
4911 }
4912 pmd->need_reload = false;
4913 }
4914 }
4915 }
4916
4917 static void
4918 reconfigure_pmd_threads(struct dp_netdev *dp)
4919 OVS_REQUIRES(dp->port_mutex)
4920 {
4921 struct dp_netdev_pmd_thread *pmd;
4922 struct ovs_numa_dump *pmd_cores;
4923 struct ovs_numa_info_core *core;
4924 struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
4925 struct hmapx_node *node;
4926 bool changed = false;
4927 bool need_to_adjust_static_tx_qids = false;
4928
4929 /* The pmd threads should be started only if there's a pmd port in the
4930 * datapath. If the user didn't provide any "pmd-cpu-mask", we start
4931 * NR_PMD_THREADS per numa node. */
4932 if (!has_pmd_port(dp)) {
4933 pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
4934 } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
4935 pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
4936 } else {
4937 pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
4938 }
4939
4940 /* We need to adjust 'static_tx_qid's only if we're reducing number of
4941 * PMD threads. Otherwise, new threads will allocate all the freed ids. */
4942 if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
4943 /* Adjustment is required to keep 'static_tx_qid's sequential and
4944 * avoid possible issues, for example, imbalanced tx queue usage
4945 * and unnecessary locking caused by remapping on netdev level. */
4946 need_to_adjust_static_tx_qids = true;
4947 }
4948
4949 /* Check for unwanted pmd threads */
4950 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4951 if (pmd->core_id == NON_PMD_CORE_ID) {
4952 continue;
4953 }
4954 if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
4955 pmd->core_id)) {
4956 hmapx_add(&to_delete, pmd);
4957 } else if (need_to_adjust_static_tx_qids) {
4958 atomic_store_relaxed(&pmd->reload_tx_qid, true);
4959 pmd->need_reload = true;
4960 }
4961 }
4962
4963 HMAPX_FOR_EACH (node, &to_delete) {
4964 pmd = (struct dp_netdev_pmd_thread *) node->data;
4965 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
4966 pmd->numa_id, pmd->core_id);
4967 dp_netdev_del_pmd(dp, pmd);
4968 }
4969 changed = !hmapx_is_empty(&to_delete);
4970 hmapx_destroy(&to_delete);
4971
4972 if (need_to_adjust_static_tx_qids) {
4973 /* 'static_tx_qid's are not sequential now.
4974 * Reload remaining threads to fix this. */
4975 reload_affected_pmds(dp);
4976 }
4977
4978 /* Check for required new pmd threads */
4979 FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
4980 pmd = dp_netdev_get_pmd(dp, core->core_id);
4981 if (!pmd) {
4982 struct ds name = DS_EMPTY_INITIALIZER;
4983
4984 pmd = xzalloc(sizeof *pmd);
4985 dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
4986
4987 ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
4988 pmd->thread = ovs_thread_create(ds_cstr(&name),
4989 pmd_thread_main, pmd);
4990 ds_destroy(&name);
4991
4992 VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
4993 pmd->numa_id, pmd->core_id);
4994 changed = true;
4995 } else {
4996 dp_netdev_pmd_unref(pmd);
4997 }
4998 }
4999
5000 if (changed) {
5001 struct ovs_numa_info_numa *numa;
5002
5003 /* Log the number of pmd threads per numa node. */
5004 FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5005 VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5006 numa->n_cores, numa->numa_id);
5007 }
5008 }
5009
5010 ovs_numa_dump_destroy(pmd_cores);
5011 }
5012
5013 static void
5014 pmd_remove_stale_ports(struct dp_netdev *dp,
5015 struct dp_netdev_pmd_thread *pmd)
5016 OVS_EXCLUDED(pmd->port_mutex)
5017 OVS_REQUIRES(dp->port_mutex)
5018 {
5019 struct rxq_poll *poll, *poll_next;
5020 struct tx_port *tx, *tx_next;
5021
5022 ovs_mutex_lock(&pmd->port_mutex);
5023 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5024 struct dp_netdev_port *port = poll->rxq->port;
5025
5026 if (port->need_reconfigure
5027 || !hmap_contains(&dp->ports, &port->node)) {
5028 dp_netdev_del_rxq_from_pmd(pmd, poll);
5029 }
5030 }
5031 HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
5032 struct dp_netdev_port *port = tx->port;
5033
5034 if (port->need_reconfigure
5035 || !hmap_contains(&dp->ports, &port->node)) {
5036 dp_netdev_del_port_tx_from_pmd(pmd, tx);
5037 }
5038 }
5039 ovs_mutex_unlock(&pmd->port_mutex);
5040 }
5041
5042 /* Must be called each time a port is added/removed or the cmask changes.
5043 * This creates and destroys pmd threads, reconfigures ports, opens their
5044 * rxqs and assigns all rxqs/txqs to pmd threads. */
5045 static void
5046 reconfigure_datapath(struct dp_netdev *dp)
5047 OVS_REQUIRES(dp->port_mutex)
5048 {
5049 struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
5050 struct dp_netdev_pmd_thread *pmd;
5051 struct dp_netdev_port *port;
5052 int wanted_txqs;
5053
5054 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5055
5056 /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5057 * on the system and the user configuration. */
5058 reconfigure_pmd_threads(dp);
5059
5060 wanted_txqs = cmap_count(&dp->poll_threads);
5061
5062 /* The number of pmd threads might have changed, or a port can be new:
5063 * adjust the txqs. */
5064 HMAP_FOR_EACH (port, node, &dp->ports) {
5065 netdev_set_tx_multiq(port->netdev, wanted_txqs);
5066 }
5067
5068 /* Step 2: Remove from the pmd threads ports that have been removed or
5069 * need reconfiguration. */
5070
5071 /* Check for all the ports that need reconfiguration. We cache this in
5072 * 'port->need_reconfigure', because netdev_is_reconf_required() can
5073 * change at any time.
5074 * Also mark for reconfiguration all ports which will likely change their
5075 * 'dynamic_txqs' parameter. It's required to stop using them before
5076 * changing this setting and it's simpler to mark ports here and allow
5077 * 'pmd_remove_stale_ports' to remove them from threads. There will be
5078 * no actual reconfiguration in 'port_reconfigure' because it's
5079 * unnecessary. */
5080 HMAP_FOR_EACH (port, node, &dp->ports) {
5081 if (netdev_is_reconf_required(port->netdev)
5082 || (port->dynamic_txqs
5083 != (netdev_n_txq(port->netdev) < wanted_txqs))) {
5084 port->need_reconfigure = true;
5085 }
5086 }
5087
5088 /* Remove from the pmd threads all the ports that have been deleted or
5089 * need reconfiguration. */
5090 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5091 pmd_remove_stale_ports(dp, pmd);
5092 }
5093
5094 /* Reload affected pmd threads. We must wait for the pmd threads before
5095 * reconfiguring the ports, because a port cannot be reconfigured while
5096 * it's being used. */
5097 reload_affected_pmds(dp);
5098
5099 /* Step 3: Reconfigure ports. */
5100
5101 /* We only reconfigure the ports that we determined above, because they're
5102 * not being used by any pmd thread at the moment. If a port fails to
5103 * reconfigure, we remove it from the datapath. */
5104 struct dp_netdev_port *next_port;
5105 HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
5106 int err;
5107
5108 if (!port->need_reconfigure) {
5109 continue;
5110 }
5111
5112 err = port_reconfigure(port);
5113 if (err) {
5114 hmap_remove(&dp->ports, &port->node);
5115 seq_change(dp->port_seq);
5116 port_destroy(port);
5117 } else {
5118 port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
5119 }
5120 }
5121
5122 /* Step 4: Compute new rxq scheduling. We don't touch the pmd threads
5123 * for now, we just update the 'pmd' pointer in each rxq to point to the
5124 * wanted thread according to the scheduling policy. */
5125
5126 /* Reset all the pmd threads to non isolated. */
5127 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5128 pmd->isolated = false;
5129 }
5130
5131 /* Reset all the queues to unassigned */
5132 HMAP_FOR_EACH (port, node, &dp->ports) {
5133 for (int i = 0; i < port->n_rxq; i++) {
5134 port->rxqs[i].pmd = NULL;
5135 }
5136 }
5137
5138 /* Add pinned queues and mark pmd threads isolated. */
5139 rxq_scheduling(dp, true);
5140
5141 /* Add non-pinned queues. */
5142 rxq_scheduling(dp, false);
5143
5144 /* Step 5: Remove queues not compliant with new scheduling. */
5145
5146 /* Count all the threads that will have at least one queue to poll. */
5147 HMAP_FOR_EACH (port, node, &dp->ports) {
5148 for (int qid = 0; qid < port->n_rxq; qid++) {
5149 struct dp_netdev_rxq *q = &port->rxqs[qid];
5150
5151 if (q->pmd) {
5152 hmapx_add(&busy_threads, q->pmd);
5153 }
5154 }
5155 }
5156
5157 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5158 struct rxq_poll *poll, *poll_next;
5159
5160 ovs_mutex_lock(&pmd->port_mutex);
5161 HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
5162 if (poll->rxq->pmd != pmd) {
5163 dp_netdev_del_rxq_from_pmd(pmd, poll);
5164
5165 /* This pmd might sleep after this step if it has no rxq
5166 * remaining. Tell it to busy wait for new assignment if it
5167 * has at least one scheduled queue. */
5168 if (hmap_count(&pmd->poll_list) == 0 &&
5169 hmapx_contains(&busy_threads, pmd)) {
5170 atomic_store_relaxed(&pmd->wait_for_reload, true);
5171 }
5172 }
5173 }
5174 ovs_mutex_unlock(&pmd->port_mutex);
5175 }
5176
5177 hmapx_destroy(&busy_threads);
5178
5179 /* Reload affected pmd threads. We must wait for the pmd threads to remove
5180 * the old queues before re-adding them, otherwise a queue can be polled by
5181 * two threads at the same time. */
5182 reload_affected_pmds(dp);
5183
5184 /* Step 6: Add queues from scheduling, if they're not there already. */
5185 HMAP_FOR_EACH (port, node, &dp->ports) {
5186 if (!netdev_is_pmd(port->netdev)) {
5187 continue;
5188 }
5189
5190 for (int qid = 0; qid < port->n_rxq; qid++) {
5191 struct dp_netdev_rxq *q = &port->rxqs[qid];
5192
5193 if (q->pmd) {
5194 ovs_mutex_lock(&q->pmd->port_mutex);
5195 dp_netdev_add_rxq_to_pmd(q->pmd, q);
5196 ovs_mutex_unlock(&q->pmd->port_mutex);
5197 }
5198 }
5199 }
5200
5201 /* Add every port and bond to the tx port and bond caches of
5202 * every pmd thread, if it's not there already and if this pmd
5203 * has at least one rxq to poll.
5204 */
5205 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5206 ovs_mutex_lock(&pmd->port_mutex);
5207 if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5208 struct tx_bond *bond;
5209
5210 HMAP_FOR_EACH (port, node, &dp->ports) {
5211 dp_netdev_add_port_tx_to_pmd(pmd, port);
5212 }
5213
5214 CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5215 dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5216 }
5217 }
5218 ovs_mutex_unlock(&pmd->port_mutex);
5219 }
5220
5221 /* Reload affected pmd threads. */
5222 reload_affected_pmds(dp);
5223
5224 /* Check if PMD Auto LB is to be enabled */
5225 set_pmd_auto_lb(dp);
5226 }
5227
5228 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5229 static bool
5230 ports_require_restart(const struct dp_netdev *dp)
5231 OVS_REQUIRES(dp->port_mutex)
5232 {
5233 struct dp_netdev_port *port;
5234
5235 HMAP_FOR_EACH (port, node, &dp->ports) {
5236 if (netdev_is_reconf_required(port->netdev)) {
5237 return true;
5238 }
5239 }
5240
5241 return false;
5242 }
5243
5244 /* Calculates variance in the values stored in array 'a'. 'n' is the number
5245 * of elements in the array to be considered when calculating the variance.
5246 * Usage example: data array 'a' contains the processing load of each pmd and
5247 * 'n' is the number of PMDs. It returns the variance in the processing load
5248 * of the PMDs. */
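/* Worked example (illustrative): a[] = {10, 20, 30} gives mean = 20 and
 * variance = (100 + 0 + 100) / 3 = 66 with the integer arithmetic used
 * here. */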
5249 static uint64_t
5250 variance(uint64_t a[], int n)
5251 {
5252 /* Compute mean (average of elements). */
5253 uint64_t sum = 0;
5254 uint64_t mean = 0;
5255 uint64_t sqDiff = 0;
5256
5257 if (!n) {
5258 return 0;
5259 }
5260
5261 for (int i = 0; i < n; i++) {
5262 sum += a[i];
5263 }
5264
5265 if (sum) {
5266 mean = sum / n;
5267
5268 /* Compute sum squared differences with mean. */
5269 for (int i = 0; i < n; i++) {
5270 sqDiff += (a[i] - mean)*(a[i] - mean);
5271 }
5272 }
5273 return (sqDiff ? (sqDiff / n) : 0);
5274 }
5275
5276
5277 /* Returns the variance in the PMDs usage as part of dry run of rxqs
5278 * assignment to PMDs. */
5279 static bool
5280 get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
5281 uint32_t num_pmds, uint64_t *predicted_variance)
5282 OVS_REQUIRES(dp->port_mutex)
5283 {
5284 struct dp_netdev_port *port;
5285 struct dp_netdev_pmd_thread *pmd;
5286 struct dp_netdev_rxq **rxqs = NULL;
5287 struct rr_numa *numa = NULL;
5288 struct rr_numa_list rr;
5289 int n_rxqs = 0;
5290 bool ret = false;
5291 uint64_t *pmd_usage;
5292
5293 if (!predicted_variance) {
5294 return ret;
5295 }
5296
5297 pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5298
5299 HMAP_FOR_EACH (port, node, &dp->ports) {
5300 if (!netdev_is_pmd(port->netdev)) {
5301 continue;
5302 }
5303
5304 for (int qid = 0; qid < port->n_rxq; qid++) {
5305 struct dp_netdev_rxq *q = &port->rxqs[qid];
5306 uint64_t cycle_hist = 0;
5307
5308 if (q->pmd->isolated) {
5309 continue;
5310 }
5311
5312 if (n_rxqs == 0) {
5313 rxqs = xmalloc(sizeof *rxqs);
5314 } else {
5315 rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
5316 }
5317
5318 /* Sum the queue intervals and store the cycle history. */
5319 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5320 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
5321 }
5322 dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
5323 cycle_hist);
5324 /* Store the queue. */
5325 rxqs[n_rxqs++] = q;
5326 }
5327 }
5328 if (n_rxqs > 1) {
5329 /* Sort the queues in order of the processing cycles
5330 * they consumed during their last pmd interval. */
5331 qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5332 }
5333 rr_numa_list_populate(dp, &rr);
5334
5335 for (int i = 0; i < n_rxqs; i++) {
5336 int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
5337 numa = rr_numa_list_lookup(&rr, numa_id);
5338 if (!numa) {
5339 /* Abort if cross-NUMA polling would be required. */
5340 VLOG_DBG("PMD auto lb dry run."
5341 " Aborting due to cross-numa polling.");
5342 goto cleanup;
5343 }
5344
5345 pmd = rr_numa_get_pmd(numa, true);
5346 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
5347 "to be assigned port \'%s\' rx queue %d "
5348 "(measured processing cycles %"PRIu64").",
5349 pmd->core_id, numa_id,
5350 netdev_rxq_get_name(rxqs[i]->rx),
5351 netdev_rxq_get_queue_id(rxqs[i]->rx),
5352 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
5353
5354 for (int id = 0; id < num_pmds; id++) {
5355 if (pmd->core_id == core_list[id]) {
5356 /* Add the processing cycles of rxq to pmd polling it. */
5357 pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
5358 RXQ_CYCLES_PROC_HIST);
5359 }
5360 }
5361 }
5362
5363 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5364 uint64_t total_cycles = 0;
5365
5366 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5367 continue;
5368 }
5369
5370 /* Get the total pmd cycles for an interval. */
5371 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5372 /* Estimate the cycles to cover all intervals. */
5373 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5374 for (int id = 0; id < num_pmds; id++) {
5375 if (pmd->core_id == core_list[id]) {
5376 if (pmd_usage[id]) {
5377 pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
5378 }
5379 VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
5380 "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
5381 }
5382 }
5383 }
5384 *predicted_variance = variance(pmd_usage, num_pmds);
5385 ret = true;
5386
5387 cleanup:
5388 rr_numa_list_destroy(&rr);
5389 free(rxqs);
5390 free(pmd_usage);
5391 return ret;
5392 }
5393
5394 /* Does the dry run of Rxq assignment to PMDs and returns true if it gives
5395 * better distribution of load on PMDs. */
5396 static bool
5397 pmd_rebalance_dry_run(struct dp_netdev *dp)
5398 OVS_REQUIRES(dp->port_mutex)
5399 {
5400 struct dp_netdev_pmd_thread *pmd;
5401 uint64_t *curr_pmd_usage;
5402
5403 uint64_t curr_variance;
5404 uint64_t new_variance;
5405 uint64_t improvement = 0;
5406 uint32_t num_pmds;
5407 uint32_t *pmd_corelist;
5408 struct rxq_poll *poll;
5409 bool ret;
5410
5411 num_pmds = cmap_count(&dp->poll_threads);
5412
5413 if (num_pmds > 1) {
5414 curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
5415 pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
5416 } else {
5417 return false;
5418 }
5419
5420 num_pmds = 0;
5421 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5422 uint64_t total_cycles = 0;
5423 uint64_t total_proc = 0;
5424
5425 if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
5426 continue;
5427 }
5428
5429 /* Get the total pmd cycles for an interval. */
5430 atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
5431 /* Estimate the cycles to cover all intervals. */
5432 total_cycles *= PMD_RXQ_INTERVAL_MAX;
5433
5434 ovs_mutex_lock(&pmd->port_mutex);
5435 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5436 for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
5437 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
5438 }
5439 }
5440 ovs_mutex_unlock(&pmd->port_mutex);
5441
5442 if (total_proc) {
5443 curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
5444 }
5445
5446 VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
5447 pmd->core_id, curr_pmd_usage[num_pmds]);
5448
5449 if (atomic_count_get(&pmd->pmd_overloaded)) {
5450 atomic_count_set(&pmd->pmd_overloaded, 0);
5451 }
5452
5453 pmd_corelist[num_pmds] = pmd->core_id;
5454 num_pmds++;
5455 }
5456
5457 curr_variance = variance(curr_pmd_usage, num_pmds);
5458 ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
5459
5460 if (ret) {
5461 VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
5462 " Predicted PMD variance: %"PRIu64"",
5463 curr_variance, new_variance);
5464
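/* 'improvement' is the relative reduction of the load variance, in percent.
 * Worked example with hypothetical numbers: curr_variance = 400 and
 * new_variance = 260 give ((400 - 260) * 100) / 400 = 35, which is above
 * ALB_ACCEPTABLE_IMPROVEMENT (25), so the dry run would be considered
 * worthwhile. */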
5465 if (new_variance < curr_variance) {
5466 improvement =
5467 ((curr_variance - new_variance) * 100) / curr_variance;
5468 }
5469 if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
5470 ret = false;
5471 }
5472 }
5473
5474 free(curr_pmd_usage);
5475 free(pmd_corelist);
5476 return ret;
5477 }
5478
5479
5480 /* Return true if needs to revalidate datapath flows. */
5481 static bool
5482 dpif_netdev_run(struct dpif *dpif)
5483 {
5484 struct dp_netdev_port *port;
5485 struct dp_netdev *dp = get_dp_netdev(dpif);
5486 struct dp_netdev_pmd_thread *non_pmd;
5487 uint64_t new_tnl_seq;
5488 bool need_to_flush = true;
5489 bool pmd_rebalance = false;
5490 long long int now = time_msec();
5491 struct dp_netdev_pmd_thread *pmd;
5492
5493 ovs_mutex_lock(&dp->port_mutex);
5494 non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5495 if (non_pmd) {
5496 ovs_mutex_lock(&dp->non_pmd_mutex);
5497 HMAP_FOR_EACH (port, node, &dp->ports) {
5498 if (!netdev_is_pmd(port->netdev)) {
5499 int i;
5500
5501 if (port->emc_enabled) {
5502 atomic_read_relaxed(&dp->emc_insert_min,
5503 &non_pmd->ctx.emc_insert_min);
5504 } else {
5505 non_pmd->ctx.emc_insert_min = 0;
5506 }
5507
5508 for (i = 0; i < port->n_rxq; i++) {
5509
5510 if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5511 continue;
5512 }
5513
5514 if (dp_netdev_process_rxq_port(non_pmd,
5515 &port->rxqs[i],
5516 port->port_no)) {
5517 need_to_flush = false;
5518 }
5519 }
5520 }
5521 }
5522 if (need_to_flush) {
5523 /* We didn't receive anything in the process loop.
5524 * Check if we need to send something.
5525 * There were no time updates during the current iteration. */
5526 pmd_thread_ctx_time_update(non_pmd);
5527 dp_netdev_pmd_flush_output_packets(non_pmd, false);
5528 }
5529
5530 dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5531 ovs_mutex_unlock(&dp->non_pmd_mutex);
5532
5533 dp_netdev_pmd_unref(non_pmd);
5534 }
5535
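/* PMD auto load balance: polled at most once per 'rebalance_intvl'.  A
 * datapath reconfiguration is only requested if at least one PMD stayed
 * overloaded for a whole measurement window, no reconfiguration is already
 * pending, and a dry run predicts a sufficiently lower load variance. */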
5536 struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5537 if (pmd_alb->is_enabled) {
5538 if (!pmd_alb->rebalance_poll_timer) {
5539 pmd_alb->rebalance_poll_timer = now;
5540 } else if ((pmd_alb->rebalance_poll_timer +
5541 pmd_alb->rebalance_intvl) < now) {
5542 pmd_alb->rebalance_poll_timer = now;
5543 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5544 if (atomic_count_get(&pmd->pmd_overloaded) >=
5545 PMD_RXQ_INTERVAL_MAX) {
5546 pmd_rebalance = true;
5547 break;
5548 }
5549 }
5550
5551 if (pmd_rebalance &&
5552 !dp_netdev_is_reconf_required(dp) &&
5553 !ports_require_restart(dp) &&
5554 pmd_rebalance_dry_run(dp)) {
5555 VLOG_INFO("PMD auto lb dry run."
5556 " requesting datapath reconfigure.");
5557 dp_netdev_request_reconfigure(dp);
5558 }
5559 }
5560 }
5561
5562 if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5563 reconfigure_datapath(dp);
5564 }
5565 ovs_mutex_unlock(&dp->port_mutex);
5566
5567 tnl_neigh_cache_run();
5568 tnl_port_map_run();
5569 new_tnl_seq = seq_read(tnl_conf_seq);
5570
5571 if (dp->last_tnl_conf_seq != new_tnl_seq) {
5572 dp->last_tnl_conf_seq = new_tnl_seq;
5573 return true;
5574 }
5575 return false;
5576 }
5577
5578 static void
5579 dpif_netdev_wait(struct dpif *dpif)
5580 {
5581 struct dp_netdev_port *port;
5582 struct dp_netdev *dp = get_dp_netdev(dpif);
5583
5584 ovs_mutex_lock(&dp_netdev_mutex);
5585 ovs_mutex_lock(&dp->port_mutex);
5586 HMAP_FOR_EACH (port, node, &dp->ports) {
5587 netdev_wait_reconf_required(port->netdev);
5588 if (!netdev_is_pmd(port->netdev)) {
5589 int i;
5590
5591 for (i = 0; i < port->n_rxq; i++) {
5592 netdev_rxq_wait(port->rxqs[i].rx);
5593 }
5594 }
5595 }
5596 ovs_mutex_unlock(&dp->port_mutex);
5597 ovs_mutex_unlock(&dp_netdev_mutex);
5598 seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5599 }
5600
5601 static void
5602 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5603 {
5604 struct tx_port *tx_port_cached;
5605
5606 /* Flush all the queued packets. */
5607 dp_netdev_pmd_flush_output_packets(pmd, true);
5608 /* Free all used tx queue ids. */
5609 dpif_netdev_xps_revalidate_pmd(pmd, true);
5610
5611 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5612 free(tx_port_cached);
5613 }
5614 HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5615 free(tx_port_cached);
5616 }
5617 }
5618
5619 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
5620 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
5621 * device, otherwise to 'pmd->send_port_cache' if the port has at least
5622 * one txq. */
5623 static void
5624 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
5625 OVS_REQUIRES(pmd->port_mutex)
5626 {
5627 struct tx_port *tx_port, *tx_port_cached;
5628
5629 pmd_free_cached_ports(pmd);
5630 hmap_shrink(&pmd->send_port_cache);
5631 hmap_shrink(&pmd->tnl_port_cache);
5632
5633 HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
5634 if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
5635 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5636 hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
5637 hash_port_no(tx_port_cached->port->port_no));
5638 }
5639
5640 if (netdev_n_txq(tx_port->port->netdev)) {
5641 tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
5642 hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
5643 hash_port_no(tx_port_cached->port->port_no));
5644 }
5645 }
5646 }
5647
5648 static void
5649 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5650 {
5651 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5652 if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
5653 VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
5654 ", numa_id %d.", pmd->core_id, pmd->numa_id);
5655 }
5656 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5657
5658 VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
5659 ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
5660 }
5661
5662 static void
5663 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
5664 {
5665 ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
5666 id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
5667 ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
5668 }
5669
5670 static int
5671 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
5672 struct polled_queue **ppoll_list)
5673 {
5674 struct polled_queue *poll_list = *ppoll_list;
5675 struct rxq_poll *poll;
5676 int i;
5677
5678 ovs_mutex_lock(&pmd->port_mutex);
5679 poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
5680 * sizeof *poll_list);
5681
5682 i = 0;
5683 HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
5684 poll_list[i].rxq = poll->rxq;
5685 poll_list[i].port_no = poll->rxq->port->port_no;
5686 poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
5687 poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
5688 poll_list[i].change_seq =
5689 netdev_get_change_seq(poll->rxq->port->netdev);
5690 i++;
5691 }
5692
5693 pmd_load_cached_ports(pmd);
5694
5695 ovs_mutex_unlock(&pmd->port_mutex);
5696
5697 *ppoll_list = poll_list;
5698 return i;
5699 }
5700
5701 static void *
5702 pmd_thread_main(void *f_)
5703 {
5704 struct dp_netdev_pmd_thread *pmd = f_;
5705 struct pmd_perf_stats *s = &pmd->perf_stats;
5706 unsigned int lc = 0;
5707 struct polled_queue *poll_list;
5708 bool wait_for_reload = false;
5709 bool reload_tx_qid;
5710 bool exiting;
5711 bool reload;
5712 int poll_cnt;
5713 int i;
5714 int process_packets = 0;
5715
5716 poll_list = NULL;
5717
5718 /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
5719 ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
5720 ovs_numa_thread_setaffinity_core(pmd->core_id);
5721 dpdk_set_lcore_id(pmd->core_id);
5722 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5723 dfc_cache_init(&pmd->flow_cache);
5724 pmd_alloc_static_tx_qid(pmd);
5725
5726 reload:
5727 atomic_count_init(&pmd->pmd_overloaded, 0);
5728
5729 /* List the port/core affinity. */
5730 for (i = 0; i < poll_cnt; i++) {
5731 VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
5732 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
5733 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
5734 /* Reset the rxq current cycles counter. */
5735 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
5736 }
5737
5738 if (!poll_cnt) {
5739 if (wait_for_reload) {
5740 /* Don't sleep, control thread will ask for a reload shortly. */
5741 do {
5742 atomic_read_explicit(&pmd->reload, &reload,
5743 memory_order_acquire);
5744 } while (!reload);
5745 } else {
5746 while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
5747 seq_wait(pmd->reload_seq, pmd->last_reload_seq);
5748 poll_block();
5749 }
5750 }
5751 }
5752
5753 pmd->intrvl_tsc_prev = 0;
5754 atomic_store_relaxed(&pmd->intrvl_cycles, 0);
5755 cycles_counter_update(s);
5756
5757 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5758
5759 /* Protect pmd stats from external clearing while polling. */
5760 ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
5761 for (;;) {
5762 uint64_t rx_packets = 0, tx_packets = 0;
5763
5764 pmd_perf_start_iteration(s);
5765
5766 for (i = 0; i < poll_cnt; i++) {
5767
5768 if (!poll_list[i].rxq_enabled) {
5769 continue;
5770 }
5771
5772 if (poll_list[i].emc_enabled) {
5773 atomic_read_relaxed(&pmd->dp->emc_insert_min,
5774 &pmd->ctx.emc_insert_min);
5775 } else {
5776 pmd->ctx.emc_insert_min = 0;
5777 }
5778
5779 process_packets =
5780 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
5781 poll_list[i].port_no);
5782 rx_packets += process_packets;
5783 }
5784
5785 if (!rx_packets) {
5786 /* We didn't receive anything in the process loop.
5787 * Check if we need to send something.
5788 * There were no time updates during the current iteration. */
5789 pmd_thread_ctx_time_update(pmd);
5790 tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false);
5791 }
5792
5793 /* Do RCU synchronization at fixed interval. This ensures that
5794 * synchronization would not be delayed long even at high load of
5795 * packet processing. */
5796 if (pmd->ctx.now > pmd->next_rcu_quiesce) {
5797 if (!ovsrcu_try_quiesce()) {
5798 pmd->next_rcu_quiesce =
5799 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5800 }
5801 }
5802
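/* Low-frequency housekeeping, roughly every 1024 iterations: flush coverage
 * counters, let the dpcls optimize its subtable ranking, try to quiesce
 * (falling back to a partial EMC sweep when quiescing is not possible) and
 * refresh the rxq enabled state of ports whose change_seq has moved. */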
5803 if (lc++ > 1024) {
5804 lc = 0;
5805
5806 coverage_try_clear();
5807 dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
5808 if (!ovsrcu_try_quiesce()) {
5809 emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
5810 pmd->next_rcu_quiesce =
5811 pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
5812 }
5813
5814 for (i = 0; i < poll_cnt; i++) {
5815 uint64_t current_seq =
5816 netdev_get_change_seq(poll_list[i].rxq->port->netdev);
5817 if (poll_list[i].change_seq != current_seq) {
5818 poll_list[i].change_seq = current_seq;
5819 poll_list[i].rxq_enabled =
5820 netdev_rxq_enabled(poll_list[i].rxq->rx);
5821 }
5822 }
5823 }
5824
5825 atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
5826 if (OVS_UNLIKELY(reload)) {
5827 break;
5828 }
5829
5830 pmd_perf_end_iteration(s, rx_packets, tx_packets,
5831 pmd_perf_metrics_enabled(pmd));
5832 }
5833 ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
5834
5835 poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
5836 atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
5837 atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
5838 atomic_read_relaxed(&pmd->exit, &exiting);
5839 /* Signal here to make sure the pmd finishes
5840 * reloading the updated configuration. */
5841 dp_netdev_pmd_reload_done(pmd);
5842
5843 if (reload_tx_qid) {
5844 pmd_free_static_tx_qid(pmd);
5845 pmd_alloc_static_tx_qid(pmd);
5846 }
5847
5848 if (!exiting) {
5849 goto reload;
5850 }
5851
5852 pmd_free_static_tx_qid(pmd);
5853 dfc_cache_uninit(&pmd->flow_cache);
5854 free(poll_list);
5855 pmd_free_cached_ports(pmd);
5856 return NULL;
5857 }
5858
5859 static void
5860 dp_netdev_disable_upcall(struct dp_netdev *dp)
5861 OVS_ACQUIRES(dp->upcall_rwlock)
5862 {
5863 fat_rwlock_wrlock(&dp->upcall_rwlock);
5864 }
5865
5866 \f
5867 /* Meters */
5868 static void
5869 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
5870 struct ofputil_meter_features *features)
5871 {
5872 features->max_meters = MAX_METERS;
5873 features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
5874 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
5875 features->max_bands = MAX_BANDS;
5876 features->max_color = 0;
5877 }
5878
5879 /* Applies the meter identified by 'meter_id' to 'packets_'. Packets
5880 * that exceed a band are dropped in-place. */
5881 static void
5882 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
5883 uint32_t meter_id, long long int now)
5884 {
5885 struct dp_meter *meter;
5886 struct dp_meter_band *band;
5887 struct dp_packet *packet;
5888 long long int long_delta_t; /* msec */
5889 uint32_t delta_t; /* msec */
5890 uint32_t delta_in_us; /* usec */
5891 const size_t cnt = dp_packet_batch_size(packets_);
5892 uint32_t bytes, volume;
5893 int exceeded_band[NETDEV_MAX_BURST];
5894 uint32_t exceeded_rate[NETDEV_MAX_BURST];
5895 int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
5896
5897 if (meter_id >= MAX_METERS) {
5898 return;
5899 }
5900
5901 meter_lock(dp, meter_id);
5902 meter = dp->meters[meter_id];
5903 if (!meter) {
5904 goto out;
5905 }
5906
5907 /* Initialize as negative values. */
5908 memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
5909 /* Initialize as zeroes. */
5910 memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
5911
5912 /* All packets will hit the meter at the same time. */
5913 long_delta_t = now / 1000 - meter->used / 1000; /* msec */
5914
5915 if (long_delta_t < 0) {
5916 /* This condition means that several threads are fighting for the
5917 meter lock, and the one that received its packets a bit later won.
5918 Assume that all racing threads received their packets at the same
5919 time to avoid overflow below. */
5920 long_delta_t = 0;
5921 delta_in_us = 0;
5922 } else {
5923 delta_in_us = (now - meter->used) % 1000;
5924 }
5925
5926 /* Make sure delta_t will not be too large, so that bucket will not
5927 * wrap around below. */
5928 delta_t = (long_delta_t > (long long int)meter->max_delta_t)
5929 ? meter->max_delta_t : (uint32_t)long_delta_t;
5930
5931 /* Update meter stats. */
5932 meter->used = now;
5933 meter->packet_count += cnt;
5934 bytes = 0;
5935 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5936 bytes += dp_packet_size(packet);
5937 }
5938 meter->byte_count += bytes;
5939
5940 /* Meters can operate in terms of packets per second or kilobits per
5941 * second. */
5942 if (meter->flags & OFPMF13_PKTPS) {
5943 /* Rate in packets/second, bucket 1/1000 packets. */
5944 /* msec * packets/sec = 1/1000 packets. */
5945 volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
5946 } else {
5947 /* Rate in kbps, bucket in bits. */
5948 /* msec * kbps = bits */
5949 volume = bytes * 8;
5950 }
5951
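/* Worked example with hypothetical numbers (kbps mode): a band rate of
 * 10000 kbps and delta_t = 2 ms credit the bucket with 20000 bits below,
 * while a batch of four 1500-byte packets drains volume = 4 * 1500 * 8 =
 * 48000 bits.  When a bucket cannot cover the whole volume, the band is
 * applied packet by packet further down. */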
5952 /* Update all bands and find the one hit with the highest rate for each
5953 * packet (if any). */
5954 for (int m = 0; m < meter->n_bands; ++m) {
5955 band = &meter->bands[m];
5956
5957 /* Update band's bucket. */
5958 band->bucket += delta_t * band->up.rate;
5959 band->bucket += delta_in_us * band->up.rate / 1000;
5960 if (band->bucket > band->up.burst_size) {
5961 band->bucket = band->up.burst_size;
5962 }
5963
5964 /* Drain the bucket for all the packets, if possible. */
5965 if (band->bucket >= volume) {
5966 band->bucket -= volume;
5967 } else {
5968 int band_exceeded_pkt;
5969
5970 /* Band limit hit, must process packet-by-packet. */
5971 if (meter->flags & OFPMF13_PKTPS) {
5972 band_exceeded_pkt = band->bucket / 1000;
5973 band->bucket %= 1000; /* Remainder stays in bucket. */
5974
5975 /* Update the exceeding band for each exceeding packet.
5976 * (Only one band will be fired by a packet, and that
5977 * can be different for each packet.) */
5978 for (int i = band_exceeded_pkt; i < cnt; i++) {
5979 if (band->up.rate > exceeded_rate[i]) {
5980 exceeded_rate[i] = band->up.rate;
5981 exceeded_band[i] = m;
5982 }
5983 }
5984 } else {
5985 /* Packet sizes differ, must process one-by-one. */
5986 band_exceeded_pkt = cnt;
5987 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
5988 uint32_t bits = dp_packet_size(packet) * 8;
5989
5990 if (band->bucket >= bits) {
5991 band->bucket -= bits;
5992 } else {
5993 if (i < band_exceeded_pkt) {
5994 band_exceeded_pkt = i;
5995 }
5996 /* Update the exceeding band for the exceeding packet.
5997 * (Only one band will be fired by a packet, and that
5998 * can be different for each packet.) */
5999 if (band->up.rate > exceeded_rate[i]) {
6000 exceeded_rate[i] = band->up.rate;
6001 exceeded_band[i] = m;
6002 }
6003 }
6004 }
6005 }
6006 /* Remember the first exceeding packet. */
6007 if (exceeded_pkt > band_exceeded_pkt) {
6008 exceeded_pkt = band_exceeded_pkt;
6009 }
6010 }
6011 }
6012
6013 /* Fire the highest rate band exceeded by each packet, and drop
6014 * packets if needed. */
6015 size_t j;
6016 DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6017 if (exceeded_band[j] >= 0) {
6018 /* Meter drop packet. */
6019 band = &meter->bands[exceeded_band[j]];
6020 band->packet_count += 1;
6021 band->byte_count += dp_packet_size(packet);
6022 COVERAGE_INC(datapath_drop_meter);
6023 dp_packet_delete(packet);
6024 } else {
6025 /* Meter accepts packet. */
6026 dp_packet_batch_refill(packets_, packet, j);
6027 }
6028 }
6029 out:
6030 meter_unlock(dp, meter_id);
6031 }
6032
6033 /* Meter set/get/del processing is still single-threaded. */
6034 static int
6035 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6036 struct ofputil_meter_config *config)
6037 {
6038 struct dp_netdev *dp = get_dp_netdev(dpif);
6039 uint32_t mid = meter_id.uint32;
6040 struct dp_meter *meter;
6041 int i;
6042
6043 if (mid >= MAX_METERS) {
6044 return EFBIG; /* Meter_id out of range. */
6045 }
6046
6047 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6048 return EBADF; /* Unsupported flags set */
6049 }
6050
6051 if (config->n_bands > MAX_BANDS) {
6052 return EINVAL;
6053 }
6054
6055 for (i = 0; i < config->n_bands; ++i) {
6056 switch (config->bands[i].type) {
6057 case OFPMBT13_DROP:
6058 break;
6059 default:
6060 return ENODEV; /* Unsupported band type */
6061 }
6062 }
6063
6064 /* Allocate meter */
6065 meter = xzalloc(sizeof *meter
6066 + config->n_bands * sizeof(struct dp_meter_band));
6067
6068 meter->flags = config->flags;
6069 meter->n_bands = config->n_bands;
6070 meter->max_delta_t = 0;
6071 meter->used = time_usec();
6072
6073 /* set up bands */
6074 for (i = 0; i < config->n_bands; ++i) {
6075 uint32_t band_max_delta_t;
6076
6077 /* Set burst size to a workable value if none specified. */
6078 if (config->bands[i].burst_size == 0) {
6079 config->bands[i].burst_size = config->bands[i].rate;
6080 }
6081
6082 meter->bands[i].up = config->bands[i];
6083 /* Convert burst size to the bucket units: */
6084 /* pkts => 1/1000 packets, kilobits => bits. */
6085 meter->bands[i].up.burst_size *= 1000;
6086 /* Initialize bucket to empty. */
6087 meter->bands[i].bucket = 0;
6088
6089 /* Figure out max delta_t that is enough to fill any bucket. */
6090 band_max_delta_t
6091 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
6092 if (band_max_delta_t > meter->max_delta_t) {
6093 meter->max_delta_t = band_max_delta_t;
6094 }
6095 }
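/* Worked example with hypothetical numbers: a kbps band configured with
 * rate = 1000 and burst_size = 500 ends up with a 500000-bit bucket limit,
 * so band_max_delta_t = 500000 / 1000 = 500 ms.  'max_delta_t' caps the
 * credit applied in dp_netdev_run_meter() after an idle period. */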
6096
6097 meter_lock(dp, mid);
6098 dp_delete_meter(dp, mid); /* Free existing meter, if any */
6099 dp->meters[mid] = meter;
6100 meter_unlock(dp, mid);
6101
6102 return 0;
6103 }
6104
6105 static int
6106 dpif_netdev_meter_get(const struct dpif *dpif,
6107 ofproto_meter_id meter_id_,
6108 struct ofputil_meter_stats *stats, uint16_t n_bands)
6109 {
6110 const struct dp_netdev *dp = get_dp_netdev(dpif);
6111 uint32_t meter_id = meter_id_.uint32;
6112 int retval = 0;
6113
6114 if (meter_id >= MAX_METERS) {
6115 return EFBIG;
6116 }
6117
6118 meter_lock(dp, meter_id);
6119 const struct dp_meter *meter = dp->meters[meter_id];
6120 if (!meter) {
6121 retval = ENOENT;
6122 goto done;
6123 }
6124 if (stats) {
6125 int i = 0;
6126
6127 stats->packet_in_count = meter->packet_count;
6128 stats->byte_in_count = meter->byte_count;
6129
6130 for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6131 stats->bands[i].packet_count = meter->bands[i].packet_count;
6132 stats->bands[i].byte_count = meter->bands[i].byte_count;
6133 }
6134
6135 stats->n_bands = i;
6136 }
6137
6138 done:
6139 meter_unlock(dp, meter_id);
6140 return retval;
6141 }
6142
6143 static int
6144 dpif_netdev_meter_del(struct dpif *dpif,
6145 ofproto_meter_id meter_id_,
6146 struct ofputil_meter_stats *stats, uint16_t n_bands)
6147 {
6148 struct dp_netdev *dp = get_dp_netdev(dpif);
6149 int error;
6150
6151 error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6152 if (!error) {
6153 uint32_t meter_id = meter_id_.uint32;
6154
6155 meter_lock(dp, meter_id);
6156 dp_delete_meter(dp, meter_id);
6157 meter_unlock(dp, meter_id);
6158 }
6159 return error;
6160 }
6161
6162 \f
6163 static void
6164 dpif_netdev_disable_upcall(struct dpif *dpif)
6165 OVS_NO_THREAD_SAFETY_ANALYSIS
6166 {
6167 struct dp_netdev *dp = get_dp_netdev(dpif);
6168 dp_netdev_disable_upcall(dp);
6169 }
6170
6171 static void
6172 dp_netdev_enable_upcall(struct dp_netdev *dp)
6173 OVS_RELEASES(dp->upcall_rwlock)
6174 {
6175 fat_rwlock_unlock(&dp->upcall_rwlock);
6176 }
6177
6178 static void
6179 dpif_netdev_enable_upcall(struct dpif *dpif)
6180 OVS_NO_THREAD_SAFETY_ANALYSIS
6181 {
6182 struct dp_netdev *dp = get_dp_netdev(dpif);
6183 dp_netdev_enable_upcall(dp);
6184 }
6185
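/* Marks the currently requested PMD reload as completed.  The release store
 * on 'pmd->reload' pairs with acquire loads of the flag elsewhere, so the
 * cleared flags and the refreshed 'last_reload_seq' are visible before the
 * reload is considered finished. */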
6186 static void
6187 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6188 {
6189 atomic_store_relaxed(&pmd->wait_for_reload, false);
6190 atomic_store_relaxed(&pmd->reload_tx_qid, false);
6191 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6192 atomic_store_explicit(&pmd->reload, false, memory_order_release);
6193 }
6194
6195 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'. Returns
6196 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
6197 * 'core_id' is NON_PMD_CORE_ID).
6198 *
6199 * Caller must unref the returned reference. */
6200 static struct dp_netdev_pmd_thread *
6201 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6202 {
6203 struct dp_netdev_pmd_thread *pmd;
6204 const struct cmap_node *pnode;
6205
6206 pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
6207 if (!pnode) {
6208 return NULL;
6209 }
6210 pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
6211
6212 return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6213 }
6214
6215 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6216 static void
6217 dp_netdev_set_nonpmd(struct dp_netdev *dp)
6218 OVS_REQUIRES(dp->port_mutex)
6219 {
6220 struct dp_netdev_pmd_thread *non_pmd;
6221
6222 non_pmd = xzalloc(sizeof *non_pmd);
6223 dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6224 }
6225
6226 /* Caller must have valid pointer to 'pmd'. */
6227 static bool
6228 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6229 {
6230 return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6231 }
6232
6233 static void
6234 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6235 {
6236 if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6237 ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6238 }
6239 }
6240
6241 /* Given cmap position 'pos', tries to ref the next node. If try_ref()
6242 * fails, keeps checking for next node until reaching the end of cmap.
6243 *
6244 * Caller must unref the returned reference. */
6245 static struct dp_netdev_pmd_thread *
6246 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6247 {
6248 struct dp_netdev_pmd_thread *next;
6249
6250 do {
6251 struct cmap_node *node;
6252
6253 node = cmap_next_position(&dp->poll_threads, pos);
6254 next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6255 : NULL;
6256 } while (next && !dp_netdev_pmd_try_ref(next));
6257
6258 return next;
6259 }
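/* Illustrative usage sketch (hypothetical caller, not part of this file):
 * visit every PMD thread while holding a reference to each one in turn.
 *
 *     struct cmap_position pos;
 *     struct dp_netdev_pmd_thread *pmd;
 *
 *     memset(&pos, 0, sizeof pos);
 *     while ((pmd = dp_netdev_pmd_get_next(dp, &pos)) != NULL) {
 *         ...use 'pmd'...
 *         dp_netdev_pmd_unref(pmd);
 *     }
 */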
6260
6261 /* Configures the 'pmd' based on the input argument. */
6262 static void
6263 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6264 unsigned core_id, int numa_id)
6265 {
6266 pmd->dp = dp;
6267 pmd->core_id = core_id;
6268 pmd->numa_id = numa_id;
6269 pmd->need_reload = false;
6270 pmd->n_output_batches = 0;
6271
6272 ovs_refcount_init(&pmd->ref_cnt);
6273 atomic_init(&pmd->exit, false);
6274 pmd->reload_seq = seq_create();
6275 pmd->last_reload_seq = seq_read(pmd->reload_seq);
6276 atomic_init(&pmd->reload, false);
6277 ovs_mutex_init(&pmd->flow_mutex);
6278 ovs_mutex_init(&pmd->port_mutex);
6279 ovs_mutex_init(&pmd->bond_mutex);
6280 cmap_init(&pmd->flow_table);
6281 cmap_init(&pmd->classifiers);
6282 pmd->ctx.last_rxq = NULL;
6283 pmd_thread_ctx_time_update(pmd);
6284 pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6285 pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6286 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
6287 hmap_init(&pmd->poll_list);
6288 hmap_init(&pmd->tx_ports);
6289 hmap_init(&pmd->tnl_port_cache);
6290 hmap_init(&pmd->send_port_cache);
6291 cmap_init(&pmd->tx_bonds);
6292 /* Init the 'flow_cache' since there is no
6293 * actual thread created for NON_PMD_CORE_ID. */
6294 if (core_id == NON_PMD_CORE_ID) {
6295 dfc_cache_init(&pmd->flow_cache);
6296 pmd_alloc_static_tx_qid(pmd);
6297 }
6298 pmd_perf_stats_init(&pmd->perf_stats);
6299 cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6300 hash_int(core_id, 0));
6301 }
6302
6303 static void
6304 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6305 {
6306 struct dpcls *cls;
6307
6308 dp_netdev_pmd_flow_flush(pmd);
6309 hmap_destroy(&pmd->send_port_cache);
6310 hmap_destroy(&pmd->tnl_port_cache);
6311 hmap_destroy(&pmd->tx_ports);
6312 cmap_destroy(&pmd->tx_bonds);
6313 hmap_destroy(&pmd->poll_list);
6314 /* All flows (including their dpcls_rules) have already been deleted. */
6315 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6316 dpcls_destroy(cls);
6317 ovsrcu_postpone(free, cls);
6318 }
6319 cmap_destroy(&pmd->classifiers);
6320 cmap_destroy(&pmd->flow_table);
6321 ovs_mutex_destroy(&pmd->flow_mutex);
6322 seq_destroy(pmd->reload_seq);
6323 ovs_mutex_destroy(&pmd->port_mutex);
6324 ovs_mutex_destroy(&pmd->bond_mutex);
6325 free(pmd);
6326 }
6327
6328 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
6329 * and unrefs the struct. */
6330 static void
6331 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6332 {
6333 /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6334 * but extra cleanup is necessary */
6335 if (pmd->core_id == NON_PMD_CORE_ID) {
6336 ovs_mutex_lock(&dp->non_pmd_mutex);
6337 dfc_cache_uninit(&pmd->flow_cache);
6338 pmd_free_cached_ports(pmd);
6339 pmd_free_static_tx_qid(pmd);
6340 ovs_mutex_unlock(&dp->non_pmd_mutex);
6341 } else {
6342 atomic_store_relaxed(&pmd->exit, true);
6343 dp_netdev_reload_pmd__(pmd);
6344 xpthread_join(pmd->thread, NULL);
6345 }
6346
6347 dp_netdev_pmd_clear_ports(pmd);
6348
6349 /* Purges the 'pmd''s flows after stopping the thread, but before
6350 * destroying the flows, so that the flow stats can be collected. */
6351 if (dp->dp_purge_cb) {
6352 dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6353 }
6354 cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6355 dp_netdev_pmd_unref(pmd);
6356 }
6357
6358 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6359 * thread. */
6360 static void
6361 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6362 {
6363 struct dp_netdev_pmd_thread *pmd;
6364 struct dp_netdev_pmd_thread **pmd_list;
6365 size_t k = 0, n_pmds;
6366
6367 n_pmds = cmap_count(&dp->poll_threads);
6368 pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6369
6370 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6371 if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6372 continue;
6373 }
6374 /* We cannot call dp_netdev_del_pmd(), since it alters
6375 * 'dp->poll_threads' (while we're iterating it) and it
6376 * might quiesce. */
6377 ovs_assert(k < n_pmds);
6378 pmd_list[k++] = pmd;
6379 }
6380
6381 for (size_t i = 0; i < k; i++) {
6382 dp_netdev_del_pmd(dp, pmd_list[i]);
6383 }
6384 free(pmd_list);
6385 }
6386
6387 /* Deletes all rx queues from pmd->poll_list and all the ports from
6388 * pmd->tx_ports. */
6389 static void
6390 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6391 {
6392 struct rxq_poll *poll;
6393 struct tx_port *port;
6394 struct tx_bond *tx;
6395
6396 ovs_mutex_lock(&pmd->port_mutex);
6397 HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6398 free(poll);
6399 }
6400 HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6401 free(port);
6402 }
6403 ovs_mutex_unlock(&pmd->port_mutex);
6404
6405 ovs_mutex_lock(&pmd->bond_mutex);
6406 CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6407 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6408 ovsrcu_postpone(free, tx);
6409 }
6410 ovs_mutex_unlock(&pmd->bond_mutex);
6411 }
6412
6413 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6414 static void
6415 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6416 struct dp_netdev_rxq *rxq)
6417 OVS_REQUIRES(pmd->port_mutex)
6418 {
6419 int qid = netdev_rxq_get_queue_id(rxq->rx);
6420 uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6421 struct rxq_poll *poll;
6422
6423 HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6424 if (poll->rxq == rxq) {
6425 /* 'rxq' is already polled by this thread. Do nothing. */
6426 return;
6427 }
6428 }
6429
6430 poll = xmalloc(sizeof *poll);
6431 poll->rxq = rxq;
6432 hmap_insert(&pmd->poll_list, &poll->node, hash);
6433
6434 pmd->need_reload = true;
6435 }
6436
6437 /* Delete 'poll' from poll_list of PMD thread. */
6438 static void
6439 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6440 struct rxq_poll *poll)
6441 OVS_REQUIRES(pmd->port_mutex)
6442 {
6443 hmap_remove(&pmd->poll_list, &poll->node);
6444 free(poll);
6445
6446 pmd->need_reload = true;
6447 }
6448
6449 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6450 * changes to take effect. */
6451 static void
6452 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6453 struct dp_netdev_port *port)
6454 OVS_REQUIRES(pmd->port_mutex)
6455 {
6456 struct tx_port *tx;
6457
6458 tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6459 if (tx) {
6460 /* 'port' is already in this thread's tx cache. Do nothing. */
6461 return;
6462 }
6463
6464 tx = xzalloc(sizeof *tx);
6465
6466 tx->port = port;
6467 tx->qid = -1;
6468 tx->flush_time = 0LL;
6469 dp_packet_batch_init(&tx->output_pkts);
6470
6471 hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6472 pmd->need_reload = true;
6473 }
6474
6475 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6476 * changes to take effect. */
6477 static void
6478 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6479 struct tx_port *tx)
6480 OVS_REQUIRES(pmd->port_mutex)
6481 {
6482 hmap_remove(&pmd->tx_ports, &tx->node);
6483 free(tx);
6484 pmd->need_reload = true;
6485 }
6486
6487 /* Add bond to the tx bond cmap of 'pmd'. */
6488 static void
6489 dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6490 struct tx_bond *bond, bool update)
6491 OVS_EXCLUDED(pmd->bond_mutex)
6492 {
6493 struct tx_bond *tx;
6494
6495 ovs_mutex_lock(&pmd->bond_mutex);
6496 tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6497
6498 if (tx && !update) {
6499 /* It's not an update and the entry already exists. Do nothing. */
6500 goto unlock;
6501 }
6502
6503 if (tx) {
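/* Readers traverse 'pmd->tx_bonds' under RCU, so the entry cannot be
 * modified in place: build a new copy, carry over the per-bucket stats
 * and swap it in with cmap_replace(), freeing the old entry only after
 * a grace period. */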
6504 struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6505
6506 /* Copy the stats for each bucket. */
6507 for (int i = 0; i < BOND_BUCKETS; i++) {
6508 uint64_t n_packets, n_bytes;
6509
6510 atomic_read_relaxed(&tx->slave_buckets[i].n_packets, &n_packets);
6511 atomic_read_relaxed(&tx->slave_buckets[i].n_bytes, &n_bytes);
6512 atomic_init(&new_tx->slave_buckets[i].n_packets, n_packets);
6513 atomic_init(&new_tx->slave_buckets[i].n_bytes, n_bytes);
6514 }
6515 cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
6516 hash_bond_id(bond->bond_id));
6517 ovsrcu_postpone(free, tx);
6518 } else {
6519 tx = xmemdup(bond, sizeof *bond);
6520 cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
6521 }
6522 unlock:
6523 ovs_mutex_unlock(&pmd->bond_mutex);
6524 }
6525
6526 /* Delete bond from the tx bond cmap of 'pmd'. */
6527 static void
6528 dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6529 uint32_t bond_id)
6530 OVS_EXCLUDED(pmd->bond_mutex)
6531 {
6532 struct tx_bond *tx;
6533
6534 ovs_mutex_lock(&pmd->bond_mutex);
6535 tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
6536 if (tx) {
6537 cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6538 ovsrcu_postpone(free, tx);
6539 }
6540 ovs_mutex_unlock(&pmd->bond_mutex);
6541 }
6542 \f
6543 static char *
6544 dpif_netdev_get_datapath_version(void)
6545 {
6546 return xstrdup("<built-in>");
6547 }
6548
6549 static void
6550 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
6551 uint16_t tcp_flags, long long now)
6552 {
6553 uint16_t flags;
6554
6555 atomic_store_relaxed(&netdev_flow->stats.used, now);
6556 non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
6557 non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
6558 atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
6559 flags |= tcp_flags;
6560 atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
6561 }
6562
6563 static int
6564 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
6565 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
6566 enum dpif_upcall_type type, const struct nlattr *userdata,
6567 struct ofpbuf *actions, struct ofpbuf *put_actions)
6568 {
6569 struct dp_netdev *dp = pmd->dp;
6570
6571 if (OVS_UNLIKELY(!dp->upcall_cb)) {
6572 return ENODEV;
6573 }
6574
6575 if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
6576 struct ds ds = DS_EMPTY_INITIALIZER;
6577 char *packet_str;
6578 struct ofpbuf key;
6579 struct odp_flow_key_parms odp_parms = {
6580 .flow = flow,
6581 .mask = wc ? &wc->masks : NULL,
6582 .support = dp_netdev_support,
6583 };
6584
6585 ofpbuf_init(&key, 0);
6586 odp_flow_key_from_flow(&odp_parms, &key);
6587 packet_str = ofp_dp_packet_to_string(packet_);
6588
6589 odp_flow_key_format(key.data, key.size, &ds);
6590
6591 VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
6592 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
6593
6594 ofpbuf_uninit(&key);
6595 free(packet_str);
6596
6597 ds_destroy(&ds);
6598 }
6599
6600 return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
6601 actions, wc, put_actions, dp->upcall_aux);
6602 }
6603
6604 static inline uint32_t
6605 dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet,
6606 const struct miniflow *mf)
6607 {
6608 uint32_t hash;
6609
6610 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6611 hash = dp_packet_get_rss_hash(packet);
6612 } else {
6613 hash = miniflow_hash_5tuple(mf, 0);
6614 dp_packet_set_rss_hash(packet, hash);
6615 }
6616
6617 return hash;
6618 }
6619
6620 static inline uint32_t
6621 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
6622 const struct miniflow *mf)
6623 {
6624 uint32_t hash, recirc_depth;
6625
6626 if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
6627 hash = dp_packet_get_rss_hash(packet);
6628 } else {
6629 hash = miniflow_hash_5tuple(mf, 0);
6630 dp_packet_set_rss_hash(packet, hash);
6631 }
6632
6633 /* The RSS hash must account for the recirculation depth to avoid
6634 * collisions in the exact match cache. */
6635 recirc_depth = *recirc_depth_get_unsafe();
6636 if (OVS_UNLIKELY(recirc_depth)) {
6637 hash = hash_finish(hash, recirc_depth);
6638 }
6639 return hash;
6640 }
6641
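/* Packets that hit the same datapath flow within one input batch are grouped
 * into a 'packet_batch_per_flow' so that the flow's actions are executed once
 * per batch instead of once per packet. */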
6642 struct packet_batch_per_flow {
6643 unsigned int byte_count;
6644 uint16_t tcp_flags;
6645 struct dp_netdev_flow *flow;
6646
6647 struct dp_packet_batch array;
6648 };
6649
6650 static inline void
6651 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
6652 struct dp_packet *packet,
6653 uint16_t tcp_flags)
6654 {
6655 batch->byte_count += dp_packet_size(packet);
6656 batch->tcp_flags |= tcp_flags;
6657 dp_packet_batch_add(&batch->array, packet);
6658 }
6659
6660 static inline void
6661 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
6662 struct dp_netdev_flow *flow)
6663 {
6664 flow->batch = batch;
6665
6666 batch->flow = flow;
6667 dp_packet_batch_init(&batch->array);
6668 batch->byte_count = 0;
6669 batch->tcp_flags = 0;
6670 }
6671
6672 static inline void
6673 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
6674 struct dp_netdev_pmd_thread *pmd)
6675 {
6676 struct dp_netdev_actions *actions;
6677 struct dp_netdev_flow *flow = batch->flow;
6678
6679 dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
6680 batch->byte_count,
6681 batch->tcp_flags, pmd->ctx.now / 1000);
6682
6683 actions = dp_netdev_flow_get_actions(flow);
6684
6685 dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
6686 actions->actions, actions->size);
6687 }
6688
6689 static inline void
6690 dp_netdev_queue_batches(struct dp_packet *pkt,
6691 struct dp_netdev_flow *flow, uint16_t tcp_flags,
6692 struct packet_batch_per_flow *batches,
6693 size_t *n_batches)
6694 {
6695 struct packet_batch_per_flow *batch = flow->batch;
6696
6697 if (OVS_UNLIKELY(!batch)) {
6698 batch = &batches[(*n_batches)++];
6699 packet_batch_per_flow_init(batch, flow);
6700 }
6701
6702 packet_batch_per_flow_update(batch, pkt, tcp_flags);
6703 }
6704
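/* Records 'packet' and its matched 'flow' at position 'index' of 'flow_map',
 * so that per-flow batching can later be performed in the original receive
 * order. */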
6705 static inline void
6706 packet_enqueue_to_flow_map(struct dp_packet *packet,
6707 struct dp_netdev_flow *flow,
6708 uint16_t tcp_flags,
6709 struct dp_packet_flow_map *flow_map,
6710 size_t index)
6711 {
6712 struct dp_packet_flow_map *map = &flow_map[index];
6713 map->flow = flow;
6714 map->packet = packet;
6715 map->tcp_flags = tcp_flags;
6716 }
6717
6718 /* SMC lookup function for a batch of packets.
6719 * By batching SMC lookups, we can use prefetching
6720 * to hide memory access latency.
6721 */
6722 static inline void
6723 smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
6724 struct netdev_flow_key *keys,
6725 struct netdev_flow_key **missed_keys,
6726 struct dp_packet_batch *packets_,
6727 const int cnt,
6728 struct dp_packet_flow_map *flow_map,
6729 uint8_t *index_map)
6730 {
6731 int i;
6732 struct dp_packet *packet;
6733 size_t n_smc_hit = 0, n_missed = 0;
6734 struct dfc_cache *cache = &pmd->flow_cache;
6735 struct smc_cache *smc_cache = &cache->smc_cache;
6736 const struct cmap_node *flow_node;
6737 int recv_idx;
6738 uint16_t tcp_flags;
6739
6740 /* Prefetch buckets for all packets */
6741 for (i = 0; i < cnt; i++) {
6742 OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
6743 }
6744
6745 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6746 struct dp_netdev_flow *flow = NULL;
6747 flow_node = smc_entry_get(pmd, keys[i].hash);
6748 bool hit = false;
6749 /* Get the original order of this packet in received batch. */
6750 recv_idx = index_map[i];
6751
6752 if (OVS_LIKELY(flow_node != NULL)) {
6753 CMAP_NODE_FOR_EACH (flow, node, flow_node) {
6754 /* Since we don't have a per-port megaflow to check the port
6755 * number, we need to verify that the input ports match. */
6756 if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
6757 flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
6758 tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
6759
6760 /* SMC hit and EMC miss; insert the flow into the EMC. */
6761 keys[i].len =
6762 netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
6763 emc_probabilistic_insert(pmd, &keys[i], flow);
6764 /* Add these packets into the flow map in the same order
6765 * as received.
6766 */
6767 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6768 flow_map, recv_idx);
6769 n_smc_hit++;
6770 hit = true;
6771 break;
6772 }
6773 }
6774 if (hit) {
6775 continue;
6776 }
6777 }
6778
6779 /* SMC missed. Group missed packets together at
6780 * the beginning of the 'packets' array. */
6781 dp_packet_batch_refill(packets_, packet, i);
6782
6783 /* Preserve the order of packet for flow batching. */
6784 index_map[n_missed] = recv_idx;
6785
6786 /* Put missed keys into the pointer array returned to the caller. */
6787 missed_keys[n_missed++] = &keys[i];
6788 }
6789
6790 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
6791 }
6792
6793 /* Try to process all ('cnt') of the 'packets' using only the datapath flow cache
6794 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
6795 * miniflow is copied into 'keys' and the packet pointer is moved to the
6796 * beginning of the 'packets' array. The pointers of missed keys are put in the
6797 * missed_keys pointer array for future processing.
6798 *
6799 * The function returns the number of packets that need to be processed in the
6800 * 'packets' array (they have been moved to the beginning of the vector).
6801 *
6802 * For performance reasons a caller may choose not to initialize the metadata
6803 * in 'packets_'. If 'md_is_valid' is false, the metadata in 'packets'
6804 * is not valid and must be initialized by this function using 'port_no'.
6805 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
6806 * will be ignored.
6807 */
6808 static inline size_t
6809 dfc_processing(struct dp_netdev_pmd_thread *pmd,
6810 struct dp_packet_batch *packets_,
6811 struct netdev_flow_key *keys,
6812 struct netdev_flow_key **missed_keys,
6813 struct packet_batch_per_flow batches[], size_t *n_batches,
6814 struct dp_packet_flow_map *flow_map,
6815 size_t *n_flows, uint8_t *index_map,
6816 bool md_is_valid, odp_port_t port_no)
6817 {
6818 struct netdev_flow_key *key = &keys[0];
6819 size_t n_missed = 0, n_emc_hit = 0;
6820 struct dfc_cache *cache = &pmd->flow_cache;
6821 struct dp_packet *packet;
6822 const size_t cnt = dp_packet_batch_size(packets_);
6823 uint32_t cur_min = pmd->ctx.emc_insert_min;
6824 int i;
6825 uint16_t tcp_flags;
6826 bool smc_enable_db;
6827 size_t map_cnt = 0;
6828 bool batch_enable = true;
6829
6830 atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
6831 pmd_perf_update_counter(&pmd->perf_stats,
6832 md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
6833 cnt);
6834
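/* Lookup order for each packet: a flow mark installed by hardware offload
 * (only at recirculation depth 0), then the exact match cache and, for EMC
 * misses, a batched SMC lookup below when enabled; anything still unmatched
 * is left for fast_path_processing(). */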
6835 DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
6836 struct dp_netdev_flow *flow;
6837 uint32_t mark;
6838
6839 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
6840 dp_packet_delete(packet);
6841 COVERAGE_INC(datapath_drop_rx_invalid_packet);
6842 continue;
6843 }
6844
6845 if (i != cnt - 1) {
6846 struct dp_packet **packets = packets_->packets;
6847 /* Prefetch next packet data and metadata. */
6848 OVS_PREFETCH(dp_packet_data(packets[i+1]));
6849 pkt_metadata_prefetch_init(&packets[i+1]->md);
6850 }
6851
6852 if (!md_is_valid) {
6853 pkt_metadata_init(&packet->md, port_no);
6854 }
6855
6856 if ((*recirc_depth_get() == 0) &&
6857 dp_packet_has_flow_mark(packet, &mark)) {
6858 flow = mark_to_flow_find(pmd, mark);
6859 if (OVS_LIKELY(flow)) {
6860 tcp_flags = parse_tcp_flags(packet);
6861 if (OVS_LIKELY(batch_enable)) {
6862 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6863 n_batches);
6864 } else {
6865 /* Flow batching should be performed only after fast-path
6866 * processing is also completed for packets with emc miss
6867 * or else it will result in reordering of packets with
6868 * same datapath flows. */
6869 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6870 flow_map, map_cnt++);
6871 }
6872 continue;
6873 }
6874 }
6875
6876 miniflow_extract(packet, &key->mf);
6877 key->len = 0; /* Not computed yet. */
6878 key->hash =
6879 (md_is_valid == false)
6880 ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
6881 : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
6882
6883 /* If EMC is disabled, skip emc_lookup. */
6884 flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
6885 if (OVS_LIKELY(flow)) {
6886 tcp_flags = miniflow_get_tcp_flags(&key->mf);
6887 n_emc_hit++;
6888 if (OVS_LIKELY(batch_enable)) {
6889 dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
6890 n_batches);
6891 } else {
6892 /* Flow batching should be performed only after fast-path
6893 * processing is also completed for packets with emc miss
6894 * or else it will result in reordering of packets with
6895 * same datapath flows. */
6896 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
6897 flow_map, map_cnt++);
6898 }
6899 } else {
6900 /* Exact match cache missed. Group missed packets together at
6901 * the beginning of the 'packets' array. */
6902 dp_packet_batch_refill(packets_, packet, i);
6903
6904 /* Preserve the order of packet for flow batching. */
6905 index_map[n_missed] = map_cnt;
6906 flow_map[map_cnt++].flow = NULL;
6907
6908 /* 'key[n_missed]' contains the key of the current packet and it
6909 * will be passed to SMC lookup. The next key should be extracted
6910 * to 'keys[n_missed + 1]'.
6911 * We also maintain a pointer array to the keys that missed both SMC and EMC,
6912 * which will be returned to the caller for future processing. */
6913 missed_keys[n_missed] = key;
6914 key = &keys[++n_missed];
6915
6916 /* Skip batching for subsequent packets to avoid reordering. */
6917 batch_enable = false;
6918 }
6919 }
6920 /* Count of packets which are not flow batched. */
6921 *n_flows = map_cnt;
6922
6923 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
6924
6925 if (!smc_enable_db) {
6926 return dp_packet_batch_size(packets_);
6927 }
6928
6929 /* Packets that missed the EMC do a batch lookup in the SMC, if enabled. */
6930 smc_lookup_batch(pmd, keys, missed_keys, packets_,
6931 n_missed, flow_map, index_map);
6932
6933 return dp_packet_batch_size(packets_);
6934 }
6935
6936 static inline int
6937 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
6938 struct dp_packet *packet,
6939 const struct netdev_flow_key *key,
6940 struct ofpbuf *actions, struct ofpbuf *put_actions)
6941 {
6942 struct ofpbuf *add_actions;
6943 struct dp_packet_batch b;
6944 struct match match;
6945 ovs_u128 ufid;
6946 int error;
6947 uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
6948
6949 match.tun_md.valid = false;
6950 miniflow_expand(&key->mf, &match.flow);
6951 memset(&match.wc, 0, sizeof match.wc);
6952
6953 ofpbuf_clear(actions);
6954 ofpbuf_clear(put_actions);
6955
6956 odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
6957 error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
6958 &ufid, DPIF_UC_MISS, NULL, actions,
6959 put_actions);
6960 if (OVS_UNLIKELY(error && error != ENOSPC)) {
6961 dp_packet_delete(packet);
6962 COVERAGE_INC(datapath_drop_upcall_error);
6963 return error;
6964 }
6965
6966 /* The Netlink encoding of datapath flow keys cannot express
6967 * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
6968 * tag is interpreted as exact match on the fact that there is no
6969 * VLAN. Unless we refactor a lot of code that translates between
6970 * Netlink and struct flow representations, we have to do the same
6971 * here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
6972 if (!match.wc.masks.vlans[0].tci) {
6973 match.wc.masks.vlans[0].tci = htons(0xffff);
6974 }
6975
6976 /* We can't allow the packet batching in the next loop to execute
6977 * the actions. Otherwise, if there are any slow path actions,
6978 * we'll send the packet up twice. */
6979 dp_packet_batch_init_packet(&b, packet);
6980 dp_netdev_execute_actions(pmd, &b, true, &match.flow,
6981 actions->data, actions->size);
6982
6983 add_actions = put_actions->size ? put_actions : actions;
6984 if (OVS_LIKELY(error != ENOSPC)) {
6985 struct dp_netdev_flow *netdev_flow;
6986
6987 /* XXX: There's a race window where a flow covering this packet
6988 * could have already been installed since we last did the flow
6989 * lookup before upcall. This could be solved by moving the
6990 * mutex lock outside the loop, but that's an awful long time
6991 * to be locking revalidators out of making flow modifications. */
6992 ovs_mutex_lock(&pmd->flow_mutex);
6993 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
6994 if (OVS_LIKELY(!netdev_flow)) {
6995 netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
6996 add_actions->data,
6997 add_actions->size);
6998 }
6999 ovs_mutex_unlock(&pmd->flow_mutex);
7000 uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
7001 smc_insert(pmd, key, hash);
7002 emc_probabilistic_insert(pmd, key, netdev_flow);
7003 }
7004 if (pmd_perf_metrics_enabled(pmd)) {
7005 /* Update upcall stats. */
7006 cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
7007 struct pmd_perf_stats *s = &pmd->perf_stats;
7008 s->current.upcalls++;
7009 s->current.upcall_cycles += cycles;
7010 histogram_add_sample(&s->cycles_per_upcall, cycles);
7011 }
7012 return error;
7013 }
7014
7015 static inline void
7016 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
7017 struct dp_packet_batch *packets_,
7018 struct netdev_flow_key **keys,
7019 struct dp_packet_flow_map *flow_map,
7020 uint8_t *index_map,
7021 odp_port_t in_port)
7022 {
7023 const size_t cnt = dp_packet_batch_size(packets_);
7024 #if !defined(__CHECKER__) && !defined(_WIN32)
7025 const size_t PKT_ARRAY_SIZE = cnt;
7026 #else
7027 /* Sparse or MSVC doesn't like variable length array. */
7028 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7029 #endif
7030 struct dp_packet *packet;
7031 struct dpcls *cls;
7032 struct dpcls_rule *rules[PKT_ARRAY_SIZE];
7033 struct dp_netdev *dp = pmd->dp;
7034 int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
7035 int lookup_cnt = 0, add_lookup_cnt;
7036 bool any_miss;
7037
7038 for (size_t i = 0; i < cnt; i++) {
7039 /* Key length is needed in all cases; the hash is computed on demand. */
7040 keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
7041 }
7042 /* Get the classifier for the in_port */
7043 cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7044 if (OVS_LIKELY(cls)) {
7045 any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7046 rules, cnt, &lookup_cnt);
7047 } else {
7048 any_miss = true;
7049 memset(rules, 0, sizeof(rules));
7050 }
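/* Misses go to the slow path under a non-blocking read lock on the upcall
 * callback; if the lock cannot be taken (upcalls are disabled), the missed
 * packets are dropped and counted below. */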
7051 if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7052 uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7053 struct ofpbuf actions, put_actions;
7054
7055 ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7056 ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7057
7058 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7059 struct dp_netdev_flow *netdev_flow;
7060
7061 if (OVS_LIKELY(rules[i])) {
7062 continue;
7063 }
7064
7065 /* It's possible that an earlier slow path execution installed
7066 * a rule covering this flow. In this case, it's a lot cheaper
7067 * to catch it here than execute a miss. */
7068 netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
7069 &add_lookup_cnt);
7070 if (netdev_flow) {
7071 lookup_cnt += add_lookup_cnt;
7072 rules[i] = &netdev_flow->cr;
7073 continue;
7074 }
7075
7076 int error = handle_packet_upcall(pmd, packet, keys[i],
7077 &actions, &put_actions);
7078
7079 if (OVS_UNLIKELY(error)) {
7080 upcall_fail_cnt++;
7081 } else {
7082 upcall_ok_cnt++;
7083 }
7084 }
7085
7086 ofpbuf_uninit(&actions);
7087 ofpbuf_uninit(&put_actions);
7088 fat_rwlock_unlock(&dp->upcall_rwlock);
7089 } else if (OVS_UNLIKELY(any_miss)) {
7090 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7091 if (OVS_UNLIKELY(!rules[i])) {
7092 dp_packet_delete(packet);
7093 COVERAGE_INC(datapath_drop_lock_error);
7094 upcall_fail_cnt++;
7095 }
7096 }
7097 }
7098
7099 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7100 struct dp_netdev_flow *flow;
7101 /* Get the original order of this packet in received batch. */
7102 int recv_idx = index_map[i];
7103 uint16_t tcp_flags;
7104
7105 if (OVS_UNLIKELY(!rules[i])) {
7106 continue;
7107 }
7108
7109 flow = dp_netdev_flow_cast(rules[i]);
7110 uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
7111 smc_insert(pmd, keys[i], hash);
7112
7113 emc_probabilistic_insert(pmd, keys[i], flow);
7114 /* Add these packets into the flow map in the same order
7115 * as received.
7116 */
7117 tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7118 packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7119 flow_map, recv_idx);
7120 }
7121
7122 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7123 cnt - upcall_ok_cnt - upcall_fail_cnt);
7124 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7125 lookup_cnt);
7126 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7127 upcall_ok_cnt);
7128 pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7129 upcall_fail_cnt);
7130 }
7131
7132 /* Packets enter the datapath from a port (or from recirculation) here.
7133 *
7134 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
7135 * When false the metadata in 'packets' need to be initialized. */
7136 static void
7137 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
7138 struct dp_packet_batch *packets,
7139 bool md_is_valid, odp_port_t port_no)
7140 {
7141 #if !defined(__CHECKER__) && !defined(_WIN32)
7142 const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
7143 #else
7144 /* Sparse or MSVC doesn't like variable length array. */
7145 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7146 #endif
7147 OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7148 struct netdev_flow_key keys[PKT_ARRAY_SIZE];
7149 struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
7150 struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
7151 size_t n_batches;
7152 struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7153 uint8_t index_map[PKT_ARRAY_SIZE];
7154 size_t n_flows, i;
7155
7156 odp_port_t in_port;
7157
7158 n_batches = 0;
7159 dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
7160 flow_map, &n_flows, index_map, md_is_valid, port_no);
7161
7162 if (!dp_packet_batch_is_empty(packets)) {
7163 /* Get ingress port from first packet's metadata. */
7164 in_port = packets->packets[0]->md.in_port.odp_port;
7165 fast_path_processing(pmd, packets, missed_keys,
7166 flow_map, index_map, in_port);
7167 }
7168
7169 /* Batch rest of packets which are in flow map. */
7170 for (i = 0; i < n_flows; i++) {
7171 struct dp_packet_flow_map *map = &flow_map[i];
7172
7173 if (OVS_UNLIKELY(!map->flow)) {
7174 continue;
7175 }
7176 dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7177 batches, &n_batches);
7178 }
7179
7180 /* All the flow batches need to be reset before any call to
7181 * packet_batch_per_flow_execute() as it could potentially trigger
7182 * recirculation. When a packet matching flow ‘j’ happens to be
7183 * recirculated, the nested call to dp_netdev_input__() could potentially
7184 * classify the packet as matching another flow - say 'k'. It could happen
7185 * that in the previous call to dp_netdev_input__() that same flow 'k' had
7186 * already its own batches[k] still waiting to be served. So if its
7187 * ‘batch’ member is not reset, the recirculated packet would be wrongly
7188 * appended to batches[k] of the 1st call to dp_netdev_input__(). */
7189 for (i = 0; i < n_batches; i++) {
7190 batches[i].flow->batch = NULL;
7191 }
7192
7193 for (i = 0; i < n_batches; i++) {
7194 packet_batch_per_flow_execute(&batches[i], pmd);
7195 }
7196 }
7197
7198 static void
7199 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
7200 struct dp_packet_batch *packets,
7201 odp_port_t port_no)
7202 {
7203 dp_netdev_input__(pmd, packets, false, port_no);
7204 }
7205
7206 static void
7207 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
7208 struct dp_packet_batch *packets)
7209 {
7210 dp_netdev_input__(pmd, packets, true, 0);
7211 }
7212
7213 struct dp_netdev_execute_aux {
7214 struct dp_netdev_pmd_thread *pmd;
7215 const struct flow *flow;
7216 };
7217
7218 static void
7219 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7220 void *aux)
7221 {
7222 struct dp_netdev *dp = get_dp_netdev(dpif);
7223 dp->dp_purge_aux = aux;
7224 dp->dp_purge_cb = cb;
7225 }
7226
7227 static void
7228 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7229 void *aux)
7230 {
7231 struct dp_netdev *dp = get_dp_netdev(dpif);
7232 dp->upcall_aux = aux;
7233 dp->upcall_cb = cb;
7234 }
7235
7236 static void
7237 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
7238 bool purge)
7239 {
7240 struct tx_port *tx;
7241 struct dp_netdev_port *port;
7242 long long interval;
7243
7244 HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
7245 if (!tx->port->dynamic_txqs) {
7246 continue;
7247 }
7248 interval = pmd->ctx.now - tx->last_used;
7249 if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
7250 port = tx->port;
7251 ovs_mutex_lock(&port->txq_used_mutex);
7252 port->txq_used[tx->qid]--;
7253 ovs_mutex_unlock(&port->txq_used_mutex);
7254 tx->qid = -1;
7255 }
7256 }
7257 }
7258
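/* Returns the tx queue id this PMD should use for 'tx'.  A cached id is kept
 * while it has been used within XPS_TIMEOUT; otherwise the least used queue
 * of the port is selected under 'txq_used_mutex' and the per-queue usage
 * counters are updated accordingly. */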
7259 static int
7260 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7261 struct tx_port *tx)
7262 {
7263 struct dp_netdev_port *port;
7264 long long interval;
7265 int i, min_cnt, min_qid;
7266
7267 interval = pmd->ctx.now - tx->last_used;
7268 tx->last_used = pmd->ctx.now;
7269
7270 if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7271 return tx->qid;
7272 }
7273
7274 port = tx->port;
7275
7276 ovs_mutex_lock(&port->txq_used_mutex);
7277 if (tx->qid >= 0) {
7278 port->txq_used[tx->qid]--;
7279 tx->qid = -1;
7280 }
7281
7282 min_cnt = -1;
7283 min_qid = 0;
7284 for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7285 if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7286 min_cnt = port->txq_used[i];
7287 min_qid = i;
7288 }
7289 }
7290
7291 port->txq_used[min_qid]++;
7292 tx->qid = min_qid;
7293
7294 ovs_mutex_unlock(&port->txq_used_mutex);
7295
7296 dpif_netdev_xps_revalidate_pmd(pmd, false);
7297
7298 VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7299 pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7300 return min_qid;
7301 }
7302
7303 static struct tx_port *
7304 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7305 odp_port_t port_no)
7306 {
7307 return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7308 }
7309
7310 static struct tx_port *
7311 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7312 odp_port_t port_no)
7313 {
7314 return tx_port_lookup(&pmd->send_port_cache, port_no);
7315 }
7316
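/* Pushes the tunnel header described by the OVS_ACTION_ATTR_TUNNEL_PUSH
 * payload in 'attr' onto every packet in 'batch'.  On failure the batch is
 * freed and a nonzero error is returned; on success the modified packets
 * remain in 'batch' and 0 is returned. */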
7317 static int
7318 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7319 const struct nlattr *attr,
7320 struct dp_packet_batch *batch)
7321 {
7322 struct tx_port *tun_port;
7323 const struct ovs_action_push_tnl *data;
7324 int err;
7325
7326 data = nl_attr_get(attr);
7327
7328 tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7329 if (!tun_port) {
7330 err = -EINVAL;
7331 goto error;
7332 }
7333 err = netdev_push_header(tun_port->port->netdev, batch, data);
7334 if (!err) {
7335 return 0;
7336 }
7337 error:
7338 dp_packet_delete_batch(batch, true);
7339 return err;
7340 }
7341
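/* Performs a DPIF_UC_ACTION upcall for 'packet' through the registered
 * upcall callback and executes the returned actions on it.  If the upcall
 * fails and 'should_steal' is true, the packet is dropped and counted. */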
7342 static void
7343 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7344 struct dp_packet *packet, bool should_steal,
7345 struct flow *flow, ovs_u128 *ufid,
7346 struct ofpbuf *actions,
7347 const struct nlattr *userdata)
7348 {
7349 struct dp_packet_batch b;
7350 int error;
7351
7352 ofpbuf_clear(actions);
7353
7354 error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7355 DPIF_UC_ACTION, userdata, actions,
7356 NULL);
7357 if (!error || error == ENOSPC) {
7358 dp_packet_batch_init_packet(&b, packet);
7359 dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7360 actions->data, actions->size);
7361 } else if (should_steal) {
7362 dp_packet_delete(packet);
7363 COVERAGE_INC(datapath_drop_userspace_action_error);
7364 }
7365 }
7366
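/* Queues the packets in 'packets_' on output port 'port_no'.  Packets are
 * appended to the port's pending output batch and sent later by
 * dp_netdev_pmd_flush_output_on_port(); the batch is flushed early if it
 * would exceed NETDEV_MAX_BURST.  Returns false if the port is unknown, in
 * which case the packets are dropped and counted. */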
7367 static bool
7368 dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7369 struct dp_packet_batch *packets_,
7370 bool should_steal, odp_port_t port_no)
7371 {
7372 struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7373 struct dp_packet_batch out;
7374
7375 if (!OVS_LIKELY(p)) {
7376 COVERAGE_ADD(datapath_drop_invalid_port,
7377 dp_packet_batch_size(packets_));
7378 dp_packet_delete_batch(packets_, should_steal);
7379 return false;
7380 }
7381 if (!should_steal) {
7382 dp_packet_batch_clone(&out, packets_);
7383 dp_packet_batch_reset_cutlen(packets_);
7384 packets_ = &out;
7385 }
7386 dp_packet_batch_apply_cutlen(packets_);
7387 #ifdef DPDK_NETDEV
7388 if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
7389 && packets_->packets[0]->source
7390 != p->output_pkts.packets[0]->source)) {
7391 /* XXX: netdev-dpdk assumes that all packets in a single
7392 * output batch have the same source. Flush here to
7393 * avoid memory access issues. */
7394 dp_netdev_pmd_flush_output_on_port(pmd, p);
7395 }
7396 #endif
7397 if (dp_packet_batch_size(&p->output_pkts)
7398 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7399 /* Flush here to avoid overflow. */
7400 dp_netdev_pmd_flush_output_on_port(pmd, p);
7401 }
7402 if (dp_packet_batch_is_empty(&p->output_pkts)) {
7403 pmd->n_output_batches++;
7404 }
7405
7406 struct dp_packet *packet;
7407 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7408 p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7409 pmd->ctx.last_rxq;
7410 dp_packet_batch_add(&p->output_pkts, packet);
7411 }
7412 return true;
7413 }
7414
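/* Implements OVS_ACTION_ATTR_LB_OUTPUT: for each packet, selects a bond
 * member bucket from its RSS hash, outputs the packet to that member via
 * dp_execute_output_action() and updates the member's packet and byte
 * counters. */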
7415 static void
7416 dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7417 struct dp_packet_batch *packets_,
7418 bool should_steal, uint32_t bond)
7419 {
7420 struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7421 struct dp_packet_batch out;
7422 struct dp_packet *packet;
7423
7424 if (!p_bond) {
7425 COVERAGE_ADD(datapath_drop_invalid_bond,
7426 dp_packet_batch_size(packets_));
7427 dp_packet_delete_batch(packets_, should_steal);
7428 return;
7429 }
7430 if (!should_steal) {
7431 dp_packet_batch_clone(&out, packets_);
7432 dp_packet_batch_reset_cutlen(packets_);
7433 packets_ = &out;
7434 }
7435 dp_packet_batch_apply_cutlen(packets_);
7436
7437 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7438 /*
7439 * Look up the bond hash table using the packet hash to select the slave.
7440 */
7441 uint32_t hash = dp_packet_get_rss_hash(packet);
7442 struct slave_entry *s_entry = &p_bond->slave_buckets[hash & BOND_MASK];
7443 odp_port_t bond_member = s_entry->slave_id;
7444 uint32_t size = dp_packet_size(packet);
7445 struct dp_packet_batch output_pkt;
7446
7447 dp_packet_batch_init_packet(&output_pkt, packet);
7448 if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
7449 bond_member))) {
7450 /* Update slave stats. */
7451 non_atomic_ullong_add(&s_entry->n_packets, 1);
7452 non_atomic_ullong_add(&s_entry->n_bytes, size);
7453 }
7454 }
7455 }
7456
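/* Callback passed to odp_execute_actions() for the action types that the
 * generic odp-execute code cannot handle itself: output, lb-output, tunnel
 * push/pop, userspace, recirc, conntrack and meter.  A batch that is not
 * consumed by the matching case is freed at the end according to
 * 'should_steal'. */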
7457 static void
7458 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
7459 const struct nlattr *a, bool should_steal)
7460 OVS_NO_THREAD_SAFETY_ANALYSIS
7461 {
7462 struct dp_netdev_execute_aux *aux = aux_;
7463 uint32_t *depth = recirc_depth_get();
7464 struct dp_netdev_pmd_thread *pmd = aux->pmd;
7465 struct dp_netdev *dp = pmd->dp;
7466 int type = nl_attr_type(a);
7467 struct tx_port *p;
7468 uint32_t packet_count, packets_dropped;
7469
7470 switch ((enum ovs_action_attr)type) {
7471 case OVS_ACTION_ATTR_OUTPUT:
7472 dp_execute_output_action(pmd, packets_, should_steal,
7473 nl_attr_get_odp_port(a));
7474 return;
7475
7476 case OVS_ACTION_ATTR_LB_OUTPUT:
7477 dp_execute_lb_output_action(pmd, packets_, should_steal,
7478 nl_attr_get_u32(a));
7479 return;
7480
7481 case OVS_ACTION_ATTR_TUNNEL_PUSH:
7482 if (should_steal) {
7483 /* We're requested to push a tunnel header, but we also need to take
7484 * ownership of these packets. Thus, we can skip performing the
7485 * action, because the caller will not use the result anyway.
7486 * Just break to free the batch. */
7487 break;
7488 }
7489 dp_packet_batch_apply_cutlen(packets_);
7490 packet_count = dp_packet_batch_size(packets_);
7491 if (push_tnl_action(pmd, a, packets_)) {
7492 COVERAGE_ADD(datapath_drop_tunnel_push_error,
7493 packet_count);
7494 }
7495 return;
7496
7497 case OVS_ACTION_ATTR_TUNNEL_POP:
7498 if (*depth < MAX_RECIRC_DEPTH) {
7499 struct dp_packet_batch *orig_packets_ = packets_;
7500 odp_port_t portno = nl_attr_get_odp_port(a);
7501
7502 p = pmd_tnl_port_cache_lookup(pmd, portno);
7503 if (p) {
7504 struct dp_packet_batch tnl_pkt;
7505
7506 if (!should_steal) {
7507 dp_packet_batch_clone(&tnl_pkt, packets_);
7508 packets_ = &tnl_pkt;
7509 dp_packet_batch_reset_cutlen(orig_packets_);
7510 }
7511
7512 dp_packet_batch_apply_cutlen(packets_);
7513
7514 packet_count = dp_packet_batch_size(packets_);
7515 netdev_pop_header(p->port->netdev, packets_);
7516 packets_dropped =
7517 packet_count - dp_packet_batch_size(packets_);
7518 if (packets_dropped) {
7519 COVERAGE_ADD(datapath_drop_tunnel_pop_error,
7520 packets_dropped);
7521 }
7522 if (dp_packet_batch_is_empty(packets_)) {
7523 return;
7524 }
7525
7526 struct dp_packet *packet;
7527 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7528 packet->md.in_port.odp_port = portno;
7529 }
7530
7531 (*depth)++;
7532 dp_netdev_recirculate(pmd, packets_);
7533 (*depth)--;
7534 return;
7535 }
7536 COVERAGE_ADD(datapath_drop_invalid_tnl_port,
7537 dp_packet_batch_size(packets_));
7538 } else {
7539 COVERAGE_ADD(datapath_drop_recirc_error,
7540 dp_packet_batch_size(packets_));
7541 }
7542 break;
7543
7544 case OVS_ACTION_ATTR_USERSPACE:
7545 if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7546 struct dp_packet_batch *orig_packets_ = packets_;
7547 const struct nlattr *userdata;
7548 struct dp_packet_batch usr_pkt;
7549 struct ofpbuf actions;
7550 struct flow flow;
7551 ovs_u128 ufid;
7552 bool clone = false;
7553
7554 userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
7555 ofpbuf_init(&actions, 0);
7556
7557 if (packets_->trunc) {
7558 if (!should_steal) {
7559 dp_packet_batch_clone(&usr_pkt, packets_);
7560 packets_ = &usr_pkt;
7561 clone = true;
7562 dp_packet_batch_reset_cutlen(orig_packets_);
7563 }
7564
7565 dp_packet_batch_apply_cutlen(packets_);
7566 }
7567
7568 struct dp_packet *packet;
7569 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7570 flow_extract(packet, &flow);
7571 odp_flow_key_hash(&flow, sizeof flow, &ufid);
7572 dp_execute_userspace_action(pmd, packet, should_steal, &flow,
7573 &ufid, &actions, userdata);
7574 }
7575
7576 if (clone) {
7577 dp_packet_delete_batch(packets_, true);
7578 }
7579
7580 ofpbuf_uninit(&actions);
7581 fat_rwlock_unlock(&dp->upcall_rwlock);
7582
7583 return;
7584 }
7585 COVERAGE_ADD(datapath_drop_lock_error,
7586 dp_packet_batch_size(packets_));
7587 break;
7588
7589 case OVS_ACTION_ATTR_RECIRC:
7590 if (*depth < MAX_RECIRC_DEPTH) {
7591 struct dp_packet_batch recirc_pkts;
7592
7593 if (!should_steal) {
7594 dp_packet_batch_clone(&recirc_pkts, packets_);
7595 packets_ = &recirc_pkts;
7596 }
7597
7598 struct dp_packet *packet;
7599 DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7600 packet->md.recirc_id = nl_attr_get_u32(a);
7601 }
7602
7603 (*depth)++;
7604 dp_netdev_recirculate(pmd, packets_);
7605 (*depth)--;
7606
7607 return;
7608 }
7609
7610 COVERAGE_ADD(datapath_drop_recirc_error,
7611 dp_packet_batch_size(packets_));
7612 VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
7613 break;
7614
7615 case OVS_ACTION_ATTR_CT: {
7616 const struct nlattr *b;
7617 bool force = false;
7618 bool commit = false;
7619 unsigned int left;
7620 uint16_t zone = 0;
7621 uint32_t tp_id = 0;
7622 const char *helper = NULL;
7623 const uint32_t *setmark = NULL;
7624 const struct ovs_key_ct_labels *setlabel = NULL;
7625 struct nat_action_info_t nat_action_info;
7626 struct nat_action_info_t *nat_action_info_ref = NULL;
7627 bool nat_config = false;
7628
7629 NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
7630 nl_attr_get_size(a)) {
7631 enum ovs_ct_attr sub_type = nl_attr_type(b);
7632
7633 switch(sub_type) {
7634 case OVS_CT_ATTR_FORCE_COMMIT:
7635 force = true;
7636 /* fall through. */
7637 case OVS_CT_ATTR_COMMIT:
7638 commit = true;
7639 break;
7640 case OVS_CT_ATTR_ZONE:
7641 zone = nl_attr_get_u16(b);
7642 break;
7643 case OVS_CT_ATTR_HELPER:
7644 helper = nl_attr_get_string(b);
7645 break;
7646 case OVS_CT_ATTR_MARK:
7647 setmark = nl_attr_get(b);
7648 break;
7649 case OVS_CT_ATTR_LABELS:
7650 setlabel = nl_attr_get(b);
7651 break;
7652 case OVS_CT_ATTR_EVENTMASK:
7653 /* Silently ignored, as userspace datapath does not generate
7654 * netlink events. */
7655 break;
7656 case OVS_CT_ATTR_TIMEOUT:
7657 if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
7658 VLOG_WARN("Invalid Timeout Policy ID: %s.",
7659 nl_attr_get_string(b));
7660 tp_id = DEFAULT_TP_ID;
7661 }
7662 break;
7663 case OVS_CT_ATTR_NAT: {
7664 const struct nlattr *b_nest;
7665 unsigned int left_nest;
7666 bool ip_min_specified = false;
7667 bool proto_num_min_specified = false;
7668 bool ip_max_specified = false;
7669 bool proto_num_max_specified = false;
7670 memset(&nat_action_info, 0, sizeof nat_action_info);
7671 nat_action_info_ref = &nat_action_info;
7672
7673 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
7674 enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
7675
7676 switch (sub_type_nest) {
7677 case OVS_NAT_ATTR_SRC:
7678 case OVS_NAT_ATTR_DST:
7679 nat_config = true;
7680 nat_action_info.nat_action |=
7681 ((sub_type_nest == OVS_NAT_ATTR_SRC)
7682 ? NAT_ACTION_SRC : NAT_ACTION_DST);
7683 break;
7684 case OVS_NAT_ATTR_IP_MIN:
7685 memcpy(&nat_action_info.min_addr,
7686 nl_attr_get(b_nest),
7687 nl_attr_get_size(b_nest));
7688 ip_min_specified = true;
7689 break;
7690 case OVS_NAT_ATTR_IP_MAX:
7691 memcpy(&nat_action_info.max_addr,
7692 nl_attr_get(b_nest),
7693 nl_attr_get_size(b_nest));
7694 ip_max_specified = true;
7695 break;
7696 case OVS_NAT_ATTR_PROTO_MIN:
7697 nat_action_info.min_port =
7698 nl_attr_get_u16(b_nest);
7699 proto_num_min_specified = true;
7700 break;
7701 case OVS_NAT_ATTR_PROTO_MAX:
7702 nat_action_info.max_port =
7703 nl_attr_get_u16(b_nest);
7704 proto_num_max_specified = true;
7705 break;
7706 case OVS_NAT_ATTR_PERSISTENT:
7707 case OVS_NAT_ATTR_PROTO_HASH:
7708 case OVS_NAT_ATTR_PROTO_RANDOM:
7709 break;
7710 case OVS_NAT_ATTR_UNSPEC:
7711 case __OVS_NAT_ATTR_MAX:
7712 OVS_NOT_REACHED();
7713 }
7714 }
7715
7716 if (ip_min_specified && !ip_max_specified) {
7717 nat_action_info.max_addr = nat_action_info.min_addr;
7718 }
7719 if (proto_num_min_specified && !proto_num_max_specified) {
7720 nat_action_info.max_port = nat_action_info.min_port;
7721 }
7722 if (proto_num_min_specified || proto_num_max_specified) {
7723 if (nat_action_info.nat_action & NAT_ACTION_SRC) {
7724 nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
7725 } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
7726 nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
7727 }
7728 }
7729 break;
7730 }
7731 case OVS_CT_ATTR_UNSPEC:
7732 case __OVS_CT_ATTR_MAX:
7733 OVS_NOT_REACHED();
7734 }
7735 }
7736
7737 /* We won't be able to function properly in this case, hence
7738 * complain loudly. */
7739 if (nat_config && !commit) {
7740 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
7741 VLOG_WARN_RL(&rl, "NAT specified without commit.");
7742 }
7743
7744 conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
7745 commit, zone, setmark, setlabel, aux->flow->tp_src,
7746 aux->flow->tp_dst, helper, nat_action_info_ref,
7747 pmd->ctx.now / 1000, tp_id);
7748 break;
7749 }
7750
7751 case OVS_ACTION_ATTR_METER:
7752 dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
7753 pmd->ctx.now);
7754 break;
7755
7756 case OVS_ACTION_ATTR_PUSH_VLAN:
7757 case OVS_ACTION_ATTR_POP_VLAN:
7758 case OVS_ACTION_ATTR_PUSH_MPLS:
7759 case OVS_ACTION_ATTR_POP_MPLS:
7760 case OVS_ACTION_ATTR_SET:
7761 case OVS_ACTION_ATTR_SET_MASKED:
7762 case OVS_ACTION_ATTR_SAMPLE:
7763 case OVS_ACTION_ATTR_HASH:
7764 case OVS_ACTION_ATTR_UNSPEC:
7765 case OVS_ACTION_ATTR_TRUNC:
7766 case OVS_ACTION_ATTR_PUSH_ETH:
7767 case OVS_ACTION_ATTR_POP_ETH:
7768 case OVS_ACTION_ATTR_CLONE:
7769 case OVS_ACTION_ATTR_PUSH_NSH:
7770 case OVS_ACTION_ATTR_POP_NSH:
7771 case OVS_ACTION_ATTR_CT_CLEAR:
7772 case OVS_ACTION_ATTR_CHECK_PKT_LEN:
7773 case OVS_ACTION_ATTR_DROP:
7774 case __OVS_ACTION_ATTR_MAX:
7775 OVS_NOT_REACHED();
7776 }
7777
7778 dp_packet_delete_batch(packets_, should_steal);
7779 }
7780
7781 static void
7782 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
7783 struct dp_packet_batch *packets,
7784 bool should_steal, const struct flow *flow,
7785 const struct nlattr *actions, size_t actions_len)
7786 {
7787 struct dp_netdev_execute_aux aux = { pmd, flow };
7788
7789 odp_execute_actions(&aux, packets, should_steal, actions,
7790 actions_len, dp_execute_cb);
7791 }
7792
7793 struct dp_netdev_ct_dump {
7794 struct ct_dpif_dump_state up;
7795 struct conntrack_dump dump;
7796 struct conntrack *ct;
7797 struct dp_netdev *dp;
7798 };
7799
7800 static int
7801 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
7802 const uint16_t *pzone, int *ptot_bkts)
7803 {
7804 struct dp_netdev *dp = get_dp_netdev(dpif);
7805 struct dp_netdev_ct_dump *dump;
7806
7807 dump = xzalloc(sizeof *dump);
7808 dump->dp = dp;
7809 dump->ct = dp->conntrack;
7810
7811 conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
7812
7813 *dump_ = &dump->up;
7814
7815 return 0;
7816 }
7817
7818 static int
7819 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
7820 struct ct_dpif_dump_state *dump_,
7821 struct ct_dpif_entry *entry)
7822 {
7823 struct dp_netdev_ct_dump *dump;
7824
7825 INIT_CONTAINER(dump, dump_, up);
7826
7827 return conntrack_dump_next(&dump->dump, entry);
7828 }
7829
7830 static int
7831 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
7832 struct ct_dpif_dump_state *dump_)
7833 {
7834 struct dp_netdev_ct_dump *dump;
7835 int err;
7836
7837 INIT_CONTAINER(dump, dump_, up);
7838
7839 err = conntrack_dump_done(&dump->dump);
7840
7841 free(dump);
7842
7843 return err;
7844 }
7845
7846 static int
7847 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
7848 const struct ct_dpif_tuple *tuple)
7849 {
7850 struct dp_netdev *dp = get_dp_netdev(dpif);
7851
7852 if (tuple) {
7853 return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
7854 }
7855 return conntrack_flush(dp->conntrack, zone);
7856 }
7857
7858 static int
7859 dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
7860 {
7861 struct dp_netdev *dp = get_dp_netdev(dpif);
7862
7863 return conntrack_set_maxconns(dp->conntrack, maxconns);
7864 }
7865
7866 static int
7867 dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
7868 {
7869 struct dp_netdev *dp = get_dp_netdev(dpif);
7870
7871 return conntrack_get_maxconns(dp->conntrack, maxconns);
7872 }
7873
7874 static int
7875 dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
7876 {
7877 struct dp_netdev *dp = get_dp_netdev(dpif);
7878
7879 return conntrack_get_nconns(dp->conntrack, nconns);
7880 }
7881
7882 static int
7883 dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
7884 {
7885 struct dp_netdev *dp = get_dp_netdev(dpif);
7886
7887 return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
7888 }
7889
7890 static int
7891 dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
7892 {
7893 struct dp_netdev *dp = get_dp_netdev(dpif);
7894 *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
7895 return 0;
7896 }
7897
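/* Applies connection tracking zone limits: 'default_limits', if nonnull,
 * updates the default zone, and each entry in 'zone_limits' updates its own
 * zone.  Stops at the first error. */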
7898 static int
7899 dpif_netdev_ct_set_limits(struct dpif *dpif OVS_UNUSED,
7900 const uint32_t *default_limits,
7901 const struct ovs_list *zone_limits)
7902 {
7903 int err = 0;
7904 struct dp_netdev *dp = get_dp_netdev(dpif);
7905 if (default_limits) {
7906 err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits);
7907 if (err != 0) {
7908 return err;
7909 }
7910 }
7911
7912 struct ct_dpif_zone_limit *zone_limit;
7913 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7914 err = zone_limit_update(dp->conntrack, zone_limit->zone,
7915 zone_limit->limit);
7916 if (err != 0) {
7917 break;
7918 }
7919 }
7920 return err;
7921 }
7922
7923 static int
7924 dpif_netdev_ct_get_limits(struct dpif *dpif OVS_UNUSED,
7925 uint32_t *default_limit,
7926 const struct ovs_list *zone_limits_request,
7927 struct ovs_list *zone_limits_reply)
7928 {
7929 struct dp_netdev *dp = get_dp_netdev(dpif);
7930 struct conntrack_zone_limit czl;
7931
7932 czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
7933 if (czl.zone == DEFAULT_ZONE) {
7934 *default_limit = czl.limit;
7935 } else {
7936 return EINVAL;
7937 }
7938
7939 if (!ovs_list_is_empty(zone_limits_request)) {
7940 struct ct_dpif_zone_limit *zone_limit;
7941 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
7942 czl = zone_limit_get(dp->conntrack, zone_limit->zone);
7943 if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
7944 ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
7945 czl.limit, czl.count);
7946 } else {
7947 return EINVAL;
7948 }
7949 }
7950 } else {
7951 for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
7952 czl = zone_limit_get(dp->conntrack, z);
7953 if (czl.zone == z) {
7954 ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
7955 czl.count);
7956 }
7957 }
7958 }
7959
7960 return 0;
7961 }
7962
7963 static int
7964 dpif_netdev_ct_del_limits(struct dpif *dpif OVS_UNUSED,
7965 const struct ovs_list *zone_limits)
7966 {
7967 int err = 0;
7968 struct dp_netdev *dp = get_dp_netdev(dpif);
7969 struct ct_dpif_zone_limit *zone_limit;
7970 LIST_FOR_EACH (zone_limit, node, zone_limits) {
7971 err = zone_limit_delete(dp->conntrack, zone_limit->zone);
7972 if (err != 0) {
7973 break;
7974 }
7975 }
7976
7977 return err;
7978 }
7979
7980 static int
7981 dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
7982 const struct ct_dpif_timeout_policy *dpif_tp)
7983 {
7984 struct timeout_policy tp;
7985 struct dp_netdev *dp;
7986
7987 dp = get_dp_netdev(dpif);
7988 memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
7989 return timeout_policy_update(dp->conntrack, &tp);
7990 }
7991
7992 static int
7993 dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
7994 struct ct_dpif_timeout_policy *dpif_tp)
7995 {
7996 struct timeout_policy *tp;
7997 struct dp_netdev *dp;
7998 int err = 0;
7999
8000 dp = get_dp_netdev(dpif);
8001 tp = timeout_policy_get(dp->conntrack, tp_id);
8002 if (!tp) {
8003 return ENOENT;
8004 }
8005 memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
8006 return err;
8007 }
8008
8009 static int
8010 dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
8011 uint32_t tp_id)
8012 {
8013 struct dp_netdev *dp;
8014 int err = 0;
8015
8016 dp = get_dp_netdev(dpif);
8017 err = timeout_policy_delete(dp->conntrack, tp_id);
8018 return err;
8019 }
8020
8021 static int
8022 dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
8023 uint32_t tp_id,
8024 uint16_t dl_type OVS_UNUSED,
8025 uint8_t nw_proto OVS_UNUSED,
8026 char **tp_name, bool *is_generic)
8027 {
8028 struct ds ds = DS_EMPTY_INITIALIZER;
8029
8030 ds_put_format(&ds, "%"PRIu32, tp_id);
8031 *tp_name = ds_steal_cstr(&ds);
8032 *is_generic = true;
8033 return 0;
8034 }
8035
8036 static int
8037 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8038 {
8039 struct dp_netdev *dp = get_dp_netdev(dpif);
8040 return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
8041 }
8042
8043 static int
8044 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8045 {
8046 struct dp_netdev *dp = get_dp_netdev(dpif);
8047 return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
8048 }
8049
8050 static int
8051 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8052 {
8053 struct dp_netdev *dp = get_dp_netdev(dpif);
8054 return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
8055 }
8056
8057 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8058 * diverge. */
8059 static int
8060 dpif_netdev_ipf_get_status(struct dpif *dpif,
8061 struct dpif_ipf_status *dpif_ipf_status)
8062 {
8063 struct dp_netdev *dp = get_dp_netdev(dpif);
8064 ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
8065 (struct ipf_status *) dpif_ipf_status);
8066 return 0;
8067 }
8068
8069 static int
8070 dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8071 struct ipf_dump_ctx **ipf_dump_ctx)
8072 {
8073 return ipf_dump_start(ipf_dump_ctx);
8074 }
8075
8076 static int
8077 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8078 {
8079 struct dp_netdev *dp = get_dp_netdev(dpif);
8080 return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
8081 dump);
8082 }
8083
8084 static int
8085 dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8086 {
8087 return ipf_dump_done(ipf_dump_ctx);
8089 }
8090
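/* Adds or replaces tx bond 'bond_id' in the datapath, copying the
 * bucket-to-member mapping from 'slave_map', and installs the new mapping in
 * every PMD thread. */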
8091 static int
8092 dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8093 odp_port_t *slave_map)
8094 {
8095 struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8096 struct dp_netdev *dp = get_dp_netdev(dpif);
8097 struct dp_netdev_pmd_thread *pmd;
8098
8099 /* Prepare new bond mapping. */
8100 new_tx->bond_id = bond_id;
8101 for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8102 new_tx->slave_buckets[bucket].slave_id = slave_map[bucket];
8103 }
8104
8105 ovs_mutex_lock(&dp->bond_mutex);
8106 /* Check whether the bond already exists. */
8107 struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8108 if (old_tx) {
8109 cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8110 hash_bond_id(bond_id));
8111 ovsrcu_postpone(free, old_tx);
8112 } else {
8113 cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8114 }
8115 ovs_mutex_unlock(&dp->bond_mutex);
8116
8117 /* Update all PMDs with new bond mapping. */
8118 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8119 dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8120 }
8121 return 0;
8122 }
8123
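/* Removes tx bond 'bond_id' from the datapath and from all PMD threads.
 * Returns ENOENT if the bond does not exist. */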
8124 static int
8125 dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8126 {
8127 struct dp_netdev *dp = get_dp_netdev(dpif);
8128 struct dp_netdev_pmd_thread *pmd;
8129 struct tx_bond *tx;
8130
8131 ovs_mutex_lock(&dp->bond_mutex);
8132 /* Check whether the bond exists. */
8133 tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8134 if (tx) {
8135 cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8136 ovsrcu_postpone(free, tx);
8137 } else {
8138 /* Bond is not present. */
8139 ovs_mutex_unlock(&dp->bond_mutex);
8140 return ENOENT;
8141 }
8142 ovs_mutex_unlock(&dp->bond_mutex);
8143
8144 /* Remove the bond map in all pmds. */
8145 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8146 dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8147 }
8148 return 0;
8149 }
8150
8151 static int
8152 dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8153 uint64_t *n_bytes)
8154 {
8155 struct dp_netdev *dp = get_dp_netdev(dpif);
8156 struct dp_netdev_pmd_thread *pmd;
8157
8158 if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8159 return ENOENT;
8160 }
8161
8162 /* Search the bond in all PMDs. */
8163 CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8164 struct tx_bond *pmd_bond_entry
8165 = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8166
8167 if (!pmd_bond_entry) {
8168 continue;
8169 }
8170
8171 /* Read bond stats. */
8172 for (int i = 0; i < BOND_BUCKETS; i++) {
8173 uint64_t pmd_n_bytes;
8174
8175 atomic_read_relaxed(&pmd_bond_entry->slave_buckets[i].n_bytes,
8176 &pmd_n_bytes);
8177 n_bytes[i] += pmd_n_bytes;
8178 }
8179 }
8180 return 0;
8181 }
8182
8183 const struct dpif_class dpif_netdev_class = {
8184 "netdev",
8185 true, /* cleanup_required */
8186 dpif_netdev_init,
8187 dpif_netdev_enumerate,
8188 dpif_netdev_port_open_type,
8189 dpif_netdev_open,
8190 dpif_netdev_close,
8191 dpif_netdev_destroy,
8192 dpif_netdev_run,
8193 dpif_netdev_wait,
8194 dpif_netdev_get_stats,
8195 NULL, /* set_features */
8196 dpif_netdev_port_add,
8197 dpif_netdev_port_del,
8198 dpif_netdev_port_set_config,
8199 dpif_netdev_port_query_by_number,
8200 dpif_netdev_port_query_by_name,
8201 NULL, /* port_get_pid */
8202 dpif_netdev_port_dump_start,
8203 dpif_netdev_port_dump_next,
8204 dpif_netdev_port_dump_done,
8205 dpif_netdev_port_poll,
8206 dpif_netdev_port_poll_wait,
8207 dpif_netdev_flow_flush,
8208 dpif_netdev_flow_dump_create,
8209 dpif_netdev_flow_dump_destroy,
8210 dpif_netdev_flow_dump_thread_create,
8211 dpif_netdev_flow_dump_thread_destroy,
8212 dpif_netdev_flow_dump_next,
8213 dpif_netdev_operate,
8214 NULL, /* recv_set */
8215 NULL, /* handlers_set */
8216 dpif_netdev_set_config,
8217 dpif_netdev_queue_to_priority,
8218 NULL, /* recv */
8219 NULL, /* recv_wait */
8220 NULL, /* recv_purge */
8221 dpif_netdev_register_dp_purge_cb,
8222 dpif_netdev_register_upcall_cb,
8223 dpif_netdev_enable_upcall,
8224 dpif_netdev_disable_upcall,
8225 dpif_netdev_get_datapath_version,
8226 dpif_netdev_ct_dump_start,
8227 dpif_netdev_ct_dump_next,
8228 dpif_netdev_ct_dump_done,
8229 dpif_netdev_ct_flush,
8230 dpif_netdev_ct_set_maxconns,
8231 dpif_netdev_ct_get_maxconns,
8232 dpif_netdev_ct_get_nconns,
8233 dpif_netdev_ct_set_tcp_seq_chk,
8234 dpif_netdev_ct_get_tcp_seq_chk,
8235 dpif_netdev_ct_set_limits,
8236 dpif_netdev_ct_get_limits,
8237 dpif_netdev_ct_del_limits,
8238 dpif_netdev_ct_set_timeout_policy,
8239 dpif_netdev_ct_get_timeout_policy,
8240 dpif_netdev_ct_del_timeout_policy,
8241 NULL, /* ct_timeout_policy_dump_start */
8242 NULL, /* ct_timeout_policy_dump_next */
8243 NULL, /* ct_timeout_policy_dump_done */
8244 dpif_netdev_ct_get_timeout_policy_name,
8245 dpif_netdev_ipf_set_enabled,
8246 dpif_netdev_ipf_set_min_frag,
8247 dpif_netdev_ipf_set_max_nfrags,
8248 dpif_netdev_ipf_get_status,
8249 dpif_netdev_ipf_dump_start,
8250 dpif_netdev_ipf_dump_next,
8251 dpif_netdev_ipf_dump_done,
8252 dpif_netdev_meter_get_features,
8253 dpif_netdev_meter_set,
8254 dpif_netdev_meter_get,
8255 dpif_netdev_meter_del,
8256 dpif_netdev_bond_add,
8257 dpif_netdev_bond_del,
8258 dpif_netdev_bond_stats_get,
8259 };
8260
8261 static void
8262 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8263 const char *argv[], void *aux OVS_UNUSED)
8264 {
8265 struct dp_netdev_port *port;
8266 struct dp_netdev *dp;
8267 odp_port_t port_no;
8268
8269 ovs_mutex_lock(&dp_netdev_mutex);
8270 dp = shash_find_data(&dp_netdevs, argv[1]);
8271 if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8272 ovs_mutex_unlock(&dp_netdev_mutex);
8273 unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8274 return;
8275 }
8276 ovs_refcount_ref(&dp->ref_cnt);
8277 ovs_mutex_unlock(&dp_netdev_mutex);
8278
8279 ovs_mutex_lock(&dp->port_mutex);
8280 if (get_port_by_name(dp, argv[2], &port)) {
8281 unixctl_command_reply_error(conn, "unknown port");
8282 goto exit;
8283 }
8284
8285 port_no = u32_to_odp(atoi(argv[3]));
8286 if (!port_no || port_no == ODPP_NONE) {
8287 unixctl_command_reply_error(conn, "bad port number");
8288 goto exit;
8289 }
8290 if (dp_netdev_lookup_port(dp, port_no)) {
8291 unixctl_command_reply_error(conn, "port number already in use");
8292 goto exit;
8293 }
8294
8295 /* Remove port. */
8296 hmap_remove(&dp->ports, &port->node);
8297 reconfigure_datapath(dp);
8298
8299 /* Reinsert with new port number. */
8300 port->port_no = port_no;
8301 hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
8302 reconfigure_datapath(dp);
8303
8304 seq_change(dp->port_seq);
8305 unixctl_command_reply(conn, NULL);
8306
8307 exit:
8308 ovs_mutex_unlock(&dp->port_mutex);
8309 dp_netdev_unref(dp);
8310 }
8311
8312 static void
8313 dpif_dummy_register__(const char *type)
8314 {
8315 struct dpif_class *class;
8316
8317 class = xmalloc(sizeof *class);
8318 *class = dpif_netdev_class;
8319 class->type = xstrdup(type);
8320 dp_register_provider(class);
8321 }
8322
8323 static void
8324 dpif_dummy_override(const char *type)
8325 {
8326 int error;
8327
8328 /*
8329 * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
8330 * a userland-only build. It's useful for the testsuite.
8331 */
8332 error = dp_unregister_provider(type);
8333 if (error == 0 || error == EAFNOSUPPORT) {
8334 dpif_dummy_register__(type);
8335 }
8336 }
8337
8338 void
8339 dpif_dummy_register(enum dummy_level level)
8340 {
8341 if (level == DUMMY_OVERRIDE_ALL) {
8342 struct sset types;
8343 const char *type;
8344
8345 sset_init(&types);
8346 dp_enumerate_types(&types);
8347 SSET_FOR_EACH (type, &types) {
8348 dpif_dummy_override(type);
8349 }
8350 sset_destroy(&types);
8351 } else if (level == DUMMY_OVERRIDE_SYSTEM) {
8352 dpif_dummy_override("system");
8353 }
8354
8355 dpif_dummy_register__("dummy");
8356
8357 unixctl_command_register("dpif-dummy/change-port-number",
8358 "dp port new-number",
8359 3, 3, dpif_dummy_change_port_number, NULL);
8360 }
8361 \f
8362 /* Datapath Classifier. */
8363
8364 static void
8365 dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
8366 {
8367 cmap_destroy(&subtable->rules);
8368 ovsrcu_postpone(free, subtable->mf_masks);
8369 ovsrcu_postpone(free, subtable);
8370 }
8371
8372 /* Initializes 'cls' as a classifier that initially contains no classification
8373 * rules. */
8374 static void
8375 dpcls_init(struct dpcls *cls)
8376 {
8377 cmap_init(&cls->subtables_map);
8378 pvector_init(&cls->subtables);
8379 }
8380
8381 static void
8382 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
8383 {
8384 VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
8385 pvector_remove(&cls->subtables, subtable);
8386 cmap_remove(&cls->subtables_map, &subtable->cmap_node,
8387 subtable->mask.hash);
8388 ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
8389 }
8390
8391 /* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the
8392 * caller's responsibility.
8393 * May only be called after all the readers have been terminated. */
8394 static void
8395 dpcls_destroy(struct dpcls *cls)
8396 {
8397 if (cls) {
8398 struct dpcls_subtable *subtable;
8399
8400 CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
8401 ovs_assert(cmap_count(&subtable->rules) == 0);
8402 dpcls_destroy_subtable(cls, subtable);
8403 }
8404 cmap_destroy(&cls->subtables_map);
8405 pvector_destroy(&cls->subtables);
8406 }
8407 }
8408
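/* Creates a new subtable of 'cls' for 'mask'.  The per-unit bit counts and
 * the block masks are precomputed here so that lookups do not have to derive
 * them, and the most specialized lookup function available for this mask
 * shape is selected, falling back to the generic one. */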
8409 static struct dpcls_subtable *
8410 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8411 {
8412 struct dpcls_subtable *subtable;
8413
8414 /* Need to add one. */
8415 subtable = xmalloc(sizeof *subtable
8416 - sizeof subtable->mask.mf + mask->len);
8417 cmap_init(&subtable->rules);
8418 subtable->hit_cnt = 0;
8419 netdev_flow_key_clone(&subtable->mask, mask);
8420
8421 /* The count of bits in the mask defines the space required for masks.
8422 * Then netdev_flow_key_gen_masks() is called to create the appropriate
8423 * masks, avoiding the cost of doing those calculations at runtime. */
8424 uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
8425 uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
8426 subtable->mf_bits_set_unit0 = unit0;
8427 subtable->mf_bits_set_unit1 = unit1;
8428 subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
8429 netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
8430
8431 /* Probe for a specialized generic lookup function. */
8432 subtable->lookup_func = dpcls_subtable_generic_probe(unit0, unit1);
8433
8434 /* If not set, assign generic lookup. Generic works for any miniflow. */
8435 if (!subtable->lookup_func) {
8436 subtable->lookup_func = dpcls_subtable_lookup_generic;
8437 }
8438
8439 cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
8440 /* Add the new subtable at the end of the pvector (with no hits yet) */
8441 pvector_insert(&cls->subtables, subtable, 0);
8442 VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
8443 cmap_count(&cls->subtables_map), subtable, cls->in_port);
8444 pvector_publish(&cls->subtables);
8445
8446 return subtable;
8447 }
8448
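/* Returns the subtable of 'cls' whose mask equals 'mask', creating it if it
 * does not exist yet. */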
8449 static inline struct dpcls_subtable *
8450 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
8451 {
8452 struct dpcls_subtable *subtable;
8453
8454 CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
8455 &cls->subtables_map) {
8456 if (netdev_flow_key_equal(&subtable->mask, mask)) {
8457 return subtable;
8458 }
8459 }
8460 return dpcls_create_subtable(cls, mask);
8461 }
8462
8463
8464 /* Periodically sort the dpcls subtable vectors according to hit counts */
8465 static void
8466 dpcls_sort_subtable_vector(struct dpcls *cls)
8467 {
8468 struct pvector *pvec = &cls->subtables;
8469 struct dpcls_subtable *subtable;
8470
8471 PVECTOR_FOR_EACH (subtable, pvec) {
8472 pvector_change_priority(pvec, subtable, subtable->hit_cnt);
8473 subtable->hit_cnt = 0;
8474 }
8475 pvector_publish(pvec);
8476 }
8477
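/* Periodic per-PMD housekeeping: accounts idle vs. busy cycles for the PMD
 * auto load balancer, stores the per-rxq processing cycles used for rxq
 * rebalancing, and, when the optimization interval expires, re-sorts the
 * dpcls subtable vectors by hit count under 'flow_mutex'. */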
8478 static inline void
8479 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
8480 struct polled_queue *poll_list, int poll_cnt)
8481 {
8482 struct dpcls *cls;
8483 uint64_t tot_idle = 0, tot_proc = 0;
8484 unsigned int pmd_load = 0;
8485
8486 if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
8487 uint64_t curr_tsc;
8488 struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
8489 if (pmd_alb->is_enabled && !pmd->isolated
8490 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
8491 pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
8492 && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
8493 pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
8494 {
8495 tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
8496 pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
8497 tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
8498 pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
8499
8500 if (tot_proc) {
8501 pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
8502 }
8503
8504 if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) {
8505 atomic_count_inc(&pmd->pmd_overloaded);
8506 } else {
8507 atomic_count_set(&pmd->pmd_overloaded, 0);
8508 }
8509 }
8510
8511 pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
8512 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
8513 pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
8514 pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
8515
8516 /* Get the cycles that were used to process each queue and store them. */
8517 for (unsigned i = 0; i < poll_cnt; i++) {
8518 uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
8519 RXQ_CYCLES_PROC_CURR);
8520 dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
8521 dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
8522 0);
8523 }
8524 curr_tsc = cycles_counter_update(&pmd->perf_stats);
8525 if (pmd->intrvl_tsc_prev) {
8526 /* There is a prev timestamp, store a new intrvl cycle count. */
8527 atomic_store_relaxed(&pmd->intrvl_cycles,
8528 curr_tsc - pmd->intrvl_tsc_prev);
8529 }
8530 pmd->intrvl_tsc_prev = curr_tsc;
8531 /* Start new measuring interval */
8532 pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
8533 }
8534
8535 if (pmd->ctx.now > pmd->next_optimization) {
8536 /* Try to obtain the flow lock to block out revalidator threads.
8537 * If not possible, just try next time. */
8538 if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
8539 /* Optimize each classifier */
8540 CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
8541 dpcls_sort_subtable_vector(cls);
8542 }
8543 ovs_mutex_unlock(&pmd->flow_mutex);
8544 /* Start new measuring interval */
8545 pmd->next_optimization = pmd->ctx.now
8546 + DPCLS_OPTIMIZATION_INTERVAL;
8547 }
8548 }
8549 }
8550
8551 /* Insert 'rule' into 'cls'. */
8552 static void
8553 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
8554 const struct netdev_flow_key *mask)
8555 {
8556 struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
8557
8558 /* Refer to subtable's mask, also for later removal. */
8559 rule->mask = &subtable->mask;
8560 cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
8561 }
8562
8563 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
8564 static void
8565 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
8566 {
8567 struct dpcls_subtable *subtable;
8568
8569 ovs_assert(rule->mask);
8570
8571 /* Get subtable from reference in rule->mask. */
8572 INIT_CONTAINER(subtable, rule->mask, mask);
8573 if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
8574 == 0) {
8575 /* Delete empty subtable. */
8576 dpcls_destroy_subtable(cls, subtable);
8577 pvector_publish(&cls->subtables);
8578 }
8579 }
8580
8581 /* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
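/* For example, with iter = 0b10110 and count = 3, the generated masks are
 * 0x1, 0x3 and 0xF: each mask has 1-bits only below the corresponding set
 * bit of 'iter', which lets the lookup code locate a block's value in the
 * miniflow by masking a flowmap with it and counting the remaining 1-bits. */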
8582 static inline void
8583 netdev_flow_key_gen_mask_unit(uint64_t iter,
8584 const uint64_t count,
8585 uint64_t *mf_masks)
8586 {
8587 int i;
8588 for (i = 0; i < count; i++) {
8589 uint64_t lowest_bit = (iter & -iter);
8590 iter &= ~lowest_bit;
8591 mf_masks[i] = (lowest_bit - 1);
8592 }
8593 /* Checks that count has covered all bits in the iter bitmap. */
8594 ovs_assert(iter == 0);
8595 }
8596
8597 /* Generates a mask for each block in the miniflow, based on the bits set.
8598 * This allows packets to be masked with the precomputed array instead of
8599 * recalculating the masks at runtime.
8600 * @param tbl The netdev_flow_key to generate the mf_masks for
8601 * @param mf_masks Pointer to a u64 array of at least
8602 * mf_bits_u0 + mf_bits_u1 elements
8603 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
8604 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow */
8605 void
8606 netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
8607 uint64_t *mf_masks,
8608 const uint32_t mf_bits_u0,
8609 const uint32_t mf_bits_u1)
8610 {
8611 uint64_t iter_u0 = tbl->mf.map.bits[0];
8612 uint64_t iter_u1 = tbl->mf.map.bits[1];
8613
8614 netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
8615 netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
8616 }
8617
8618 /* Returns true if 'target' satisfies 'rule', that is, if for each 1-bit
8619 * in the rule's mask the values in 'rule' and 'target' are the same. */
8620 bool
8621 dpcls_rule_matches_key(const struct dpcls_rule *rule,
8622 const struct netdev_flow_key *target)
8623 {
8624 const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
8625 const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
8626 uint64_t value;
8627
8628 NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
8629 if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
8630 return false;
8631 }
8632 }
8633 return true;
8634 }
8635
8636 /* For each miniflow in 'keys' performs a classifier lookup writing the result
8637 * into the corresponding slot in 'rules'. If a particular entry in 'keys' is
8638 * NULL it is skipped.
8639 *
8640 * This function is optimized for use in the userspace datapath and therefore
8641 * does not implement a lot of features available in the standard
8642 * classifier_lookup() function. Specifically, it does not implement
8643 * priorities, instead returning any rule which matches the flow.
8644 *
8645 * Returns true if all miniflows found a corresponding rule. */
8646 static bool
8647 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
8648 struct dpcls_rule **rules, const size_t cnt,
8649 int *num_lookups_p)
8650 {
8651 /* The received 'cnt' miniflows are the search-keys that will be processed
8652 * to find a matching entry in the available subtables.
8653 * The number of bits in the keys map must be at least NETDEV_MAX_BURST. */
8654 #define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
8655 BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
8656
8657 struct dpcls_subtable *subtable;
8658 uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
8659
8660 if (cnt != MAP_BITS) {
8661 keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
8662 }
8663 memset(rules, 0, cnt * sizeof *rules);
8664
8665 int lookups_match = 0, subtable_pos = 1;
8666 uint32_t found_map;
8667
8668 /* The Datapath classifier - aka dpcls - is composed of subtables.
8669 * Subtables are dynamically created as needed when new rules are inserted.
8670 * Each subtable collects rules with matches on a specific subset of packet
8671 * fields as defined by the subtable's mask. We proceed to process every
8672 * search-key against each subtable, but when a match is found for a
8673 * search-key, the search for that key can stop because the rules are
8674 * non-overlapping. */
8675 PVECTOR_FOR_EACH (subtable, &cls->subtables) {
8676 /* Call the subtable specific lookup function. */
8677 found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
8678
8679 /* Count the number of subtables searched for this packet match. This
8680 * estimates the "spread" of subtables looked at per matched packet. */
8681 uint32_t pkts_matched = count_1bits(found_map);
8682 lookups_match += pkts_matched * subtable_pos;
8683
8684 /* Clear the keys that found a match, and return early if all packets have been matched. */
8685 keys_map &= ~found_map;
8686 if (!keys_map) {
8687 if (num_lookups_p) {
8688 *num_lookups_p = lookups_match;
8689 }
8690 return true;
8691 }
8692 subtable_pos++;
8693 }
8694
8695 if (num_lookups_p) {
8696 *num_lookups_p = lookups_match;
8697 }
8698 return false;
8699 }