lib/dpif-netdev.c

   1 /*
   2  * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "dpif-netdev.h"
  19
  20 #include <ctype.h>
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <inttypes.h>
  24 #include <net/if.h>
  25 #include <netinet/in.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <sys/ioctl.h>
  30 #include <sys/socket.h>
  31 #include <sys/stat.h>
  32 #include <unistd.h>
  33
  34 #ifdef DPDK_NETDEV
  35 #include <rte_cycles.h>
  36 #endif
  37
  38 #include "bitmap.h"
  39 #include "cmap.h"
  40 #include "conntrack.h"
  41 #include "coverage.h"
  42 #include "ct-dpif.h"
  43 #include "csum.h"
  44 #include "dp-packet.h"
  45 #include "dpif.h"
  46 #include "dpif-provider.h"
  47 #include "dummy.h"
  48 #include "fat-rwlock.h"
  49 #include "flow.h"
  50 #include "hmapx.h"
  51 #include "id-pool.h"
  52 #include "latch.h"
  53 #include "netdev.h"
  54 #include "netdev-vport.h"
  55 #include "netlink.h"
  56 #include "odp-execute.h"
  57 #include "odp-util.h"
  58 #include "openvswitch/dynamic-string.h"
  59 #include "openvswitch/list.h"
  60 #include "openvswitch/match.h"
  61 #include "openvswitch/ofp-print.h"
  62 #include "openvswitch/ofp-util.h"
  63 #include "openvswitch/ofpbuf.h"
  64 #include "openvswitch/shash.h"
  65 #include "openvswitch/vlog.h"
  66 #include "ovs-numa.h"
  67 #include "ovs-rcu.h"
  68 #include "packets.h"
  69 #include "poll-loop.h"
  70 #include "pvector.h"
  71 #include "random.h"
  72 #include "seq.h"
  73 #include "smap.h"
  74 #include "sset.h"
  75 #include "timeval.h"
  76 #include "tnl-neigh-cache.h"
  77 #include "tnl-ports.h"
  78 #include "unixctl.h"
  79 #include "util.h"
  80
  81 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
  82
/* Batch size limit for flow dumps (judging by the name; the code that uses
 * this constant is outside this part of the file -- TODO confirm). */
#define FLOW_DUMP_MAX_BATCH 50
/* Use per thread recirc_depth to prevent recirculation loop. */
#define MAX_RECIRC_DEPTH 5
/* Per-thread 'recirc_depth' counter of type uint32_t, initialized to 0. */
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
  87
/* Configuration parameters. */
enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
enum { N_METER_LOCKS = 64 };    /* Number of meter locks.  A meter's lock is
                                 * selected by 'meter_id % N_METER_LOCKS'. */
  93
/* Protects against changes to 'dp_netdevs'.  This is the outermost lock in
 * the acquisition order documented on 'struct dp_netdev'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dp_netdev's. */
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
    = SHASH_INITIALIZER(&dp_netdevs);

/* Rate limiter for log messages.  Judging by the name it throttles
 * upcall-related logging; its users are outside this part of the file. */
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
 102
/* Connection tracking state (CS_*) bits that this datapath supports, and the
 * 32-bit complement of that set, i.e. every bit that is not supported. */
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
                                     | CS_SRC_NAT | CS_DST_NAT)
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
 107
/* Feature description of this datapath as a 'struct odp_support':
 * recirculation and every listed conntrack field are enabled, and the VLAN
 * header and MPLS depth limits are set to SIZE_MAX (presumably meaning "no
 * limit imposed by this datapath" -- confirm against struct odp_support). */
static struct odp_support dp_netdev_support = {
    .max_vlan_headers = SIZE_MAX,
    .max_mpls_depth = SIZE_MAX,
    .recirc = true,
    .ct_state = true,
    .ct_zone = true,
    .ct_mark = true,
    .ct_label = true,
    .ct_state_nat = true,
    .ct_orig_tuple = true,
    .ct_orig_tuple6 = true,
};
 120
/* Stores a miniflow with inline values.
 *
 * 'buf' directly follows 'mf' so that the miniflow's values are stored
 * inline in the key; the two members must stay adjacent and in this
 * order. */

struct netdev_flow_key {
    uint32_t hash;       /* Hash function differs for different users. */
    uint32_t len;        /* Length of the following miniflow (incl. map). */
    struct miniflow mf;
    uint64_t buf[FLOW_MAX_PACKET_U64S];
};
 129
 130 /* Exact match cache for frequently used flows
 131  *
 132  * The cache uses a 32-bit hash of the packet (which can be the RSS hash) to
 133  * search its entries for a miniflow that matches exactly the miniflow of the
 134  * packet. It stores the 'dpcls_rule' (rule) that matches the miniflow.
 135  *
 136  * A cache entry holds a reference to its 'dp_netdev_flow'.
 137  *
 138  * A miniflow with a given hash can be in one of EM_FLOW_HASH_SEGS different
 139  * entries. The 32-bit hash is split into EM_FLOW_HASH_SEGS values (each of
 140  * them is EM_FLOW_HASH_SHIFT bits wide and the remainder is thrown away). Each
 141  * value is the index of a cache entry where the miniflow could be.
 142  *
 143  *
 144  * Thread-safety
 145  * =============
 146  *
 147  * Each pmd_thread has its own private exact match cache.
 148  * If dp_netdev_input is not called from a pmd thread, a mutex is used.
 149  */
 150
/* Exact match cache geometry: the cache holds 2^EM_FLOW_HASH_SHIFT entries,
 * EM_FLOW_HASH_MASK extracts one entry index from a hash, and each hash is
 * probed at EM_FLOW_HASH_SEGS candidate positions. */
#define EM_FLOW_HASH_SHIFT 13
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2

/* Default EMC insert probability is 1 / DEFAULT_EM_FLOW_INSERT_INV_PROB.
 * DEFAULT_EM_FLOW_INSERT_MIN is that probability scaled to the full
 * uint32_t range (UINT32_MAX / inverse probability). */
#define DEFAULT_EM_FLOW_INSERT_INV_PROB 100
#define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
                                    DEFAULT_EM_FLOW_INSERT_INV_PROB)
 160
/* One slot of the exact match cache: the cached flow (NULL when the slot is
 * empty, see emc_cache_init()) and the key it was stored under. */
struct emc_entry {
    struct dp_netdev_flow *flow;
    struct netdev_flow_key key;   /* key.hash used for emc hash value. */
};

/* The exact match cache itself: a fixed array of EM_FLOW_HASH_ENTRIES
 * slots plus the position of the incremental dead-entry sweep. */
struct emc_cache {
    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
    int sweep_idx;                /* For emc_cache_slow_sweep(). */
};
 170
/* Iterate in the exact match cache through every entry that might contain a
 * miniflow with hash 'HASH'.
 *
 * 'HASH' is evaluated once.  On each of the EM_FLOW_HASH_SEGS iterations
 * 'CURRENT_ENTRY' is set to the entry indexed by the low EM_FLOW_HASH_SHIFT
 * bits of the (progressively right-shifted) hash.
 *
 * The assignment to 'CURRENT_ENTRY' is part of the loop condition and is
 * therefore also executed on the final, failing check: when the loop runs to
 * completion 'CURRENT_ENTRY' still points at a cache entry and is not
 * NULL. */
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \
    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \
         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
         i__ < EM_FLOW_HASH_SEGS;                                            \
         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)
 178 \f
 179 /* Simple non-wildcarding single-priority classifier. */
 180
/* Time in ms between successive optimizations (re-sorting) of the dpcls
 * subtable vector. */
#define DPCLS_OPTIMIZATION_INTERVAL 1000

/* Number of intervals for which cycles are stored
 * and used during rxq to pmd assignment.  This is the size of
 * 'cycles_intrvl' in struct dp_netdev_rxq. */
#define PMD_RXQ_INTERVAL_MAX 6
 187
/* A classifier.  Each pmd thread keeps one classifier per ingress port that
 * it polls (see 'classifiers' in struct dp_netdev_pmd_thread). */
struct dpcls {
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
    odp_port_t in_port;         /* Ingress port served by this classifier. */
    struct cmap subtables_map;  /* The classifier's subtables. */
    struct pvector subtables;   /* Subtable vector; re-sorted by
                                 * dpcls_sort_subtable_vector(). */
};
 194
/* A rule to be inserted to the classifier.
 *
 * The structure is variable-length: extra storage for the key is allocated
 * directly after 'flow', which is why 'flow' has to stay the last member. */
struct dpcls_rule {
    struct cmap_node cmap_node;   /* Within struct dpcls_subtable 'rules'. */
    struct netdev_flow_key *mask; /* Subtable's mask. */
    struct netdev_flow_key flow;  /* Matching key. */
    /* 'flow' must be the last field, additional space is allocated here. */
};
 202
/* Classifier interface; the definitions are later in the file. */
static void dpcls_init(struct dpcls *);
static void dpcls_destroy(struct dpcls *);
static void dpcls_sort_subtable_vector(struct dpcls *);
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                         const struct netdev_flow_key *mask);
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
/* Looks up 'cnt' keys from 'keys' and stores results in 'rules'.  The exact
 * meaning of the return value and of '*num_lookups_p' is defined with the
 * function body, which is outside this part of the file. */
static bool dpcls_lookup(struct dpcls *cls,
                         const struct netdev_flow_key keys[],
                         struct dpcls_rule **rules, size_t cnt,
                         int *num_lookups_p);
 213 \f
/* Set of supported meter flags (bitwise OR of OFPMF13_* flag values). */
#define DP_SUPPORTED_METER_FLAGS_MASK \
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)

/* Set of supported meter band types, as a bitmap with one bit per
 * OFPMBT13_* band type value.  Only the drop band is supported. */
#define DP_SUPPORTED_METER_BAND_TYPES           \
    ( 1 << OFPMBT13_DROP )
 221
/* One band of a meter: its configuration ('up'), the current fill level of
 * its bucket, and per-band counters. */
struct dp_meter_band {
    struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
    uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
    uint64_t packet_count;
    uint64_t byte_count;
};
 228
/* A meter: header fields followed by a variable number of bands.
 *
 * NOTE(review): the field descriptions below are inferred from names and
 * types only; the code that maintains them is outside this part of the
 * file. */
struct dp_meter {
    uint16_t flags;          /* Presumably OFPMF13_* flags. */
    uint16_t n_bands;        /* Number of elements in 'bands'. */
    uint32_t max_delta_t;
    uint64_t used;           /* Presumably a last-used timestamp. */
    uint64_t packet_count;
    uint64_t byte_count;
    struct dp_meter_band bands[];   /* Flexible array; must be last. */
};
 238
/* Datapath based on the network device interface from netdev.h.
 *
 *
 * Thread-safety
 * =============
 *
 * Some members, marked 'const', are immutable.  Accessing other members
 * requires synchronization, as noted in more detail below.
 *
 * Acquisition order is, from outermost to innermost:
 *
 *    dp_netdev_mutex (global)
 *    port_mutex
 *    non_pmd_mutex
 */
struct dp_netdev {
    const struct dpif_class *const class;
    const char *const name;
    struct dpif *dpif;
    struct ovs_refcount ref_cnt;
    atomic_flag destroyed;

    /* Ports.
     *
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
     * through 'ports' requires taking 'port_mutex'. */
    struct ovs_mutex port_mutex;
    struct hmap ports;
    struct seq *port_seq;       /* Incremented whenever a port changes. */

    /* Meters.
     *
     * There are fewer locks than meters: the lock for a meter is
     * 'meter_locks[meter_id % N_METER_LOCKS]' (see meter_lock() and
     * meter_unlock()). */
    struct ovs_mutex meter_locks[N_METER_LOCKS];
    struct dp_meter *meters[MAX_METERS]; /* Meter bands. */

    /* Probability of EMC insertions is a factor of 'emc_insert_min'. */
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;

    /* Protects access to ofproto-dpif-upcall interface during revalidator
     * thread synchronization. */
    struct fat_rwlock upcall_rwlock;
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
    void *upcall_aux;

    /* Callback function for notifying the purging of dp flows (during
     * resetting pmd deletion). */
    dp_purge_callback *dp_purge_cb;
    void *dp_purge_aux;

    /* Stores all 'struct dp_netdev_pmd_thread's. */
    struct cmap poll_threads;
    /* id pool for per thread static_tx_qid. */
    struct id_pool *tx_qid_pool;
    struct ovs_mutex tx_qid_pool_mutex;

    /* Protects the access of the 'struct dp_netdev_pmd_thread'
     * instance for non-pmd thread. */
    struct ovs_mutex non_pmd_mutex;

    /* Each pmd thread will store its pointer to
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
    ovsthread_key_t per_pmd_key;

    struct seq *reconfigure_seq;
    uint64_t last_reconfigure_seq;

    /* Cpu mask for pin of pmd threads. */
    char *pmd_cmask;

    uint64_t last_tnl_conf_seq;

    struct conntrack conntrack;
};
 311
/* Acquires the mutex that guards meter 'meter_id' in 'dp'.
 *
 * There are only N_METER_LOCKS mutexes, so meters whose ids are equal modulo
 * N_METER_LOCKS share a lock.  The lock expression below must stay identical
 * to the one in the OVS_ACQUIRES annotation. */
static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
    OVS_ACQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
    ovs_mutex_lock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
}
 317
/* Releases the mutex that guards meter 'meter_id' in 'dp'; the counterpart
 * of meter_lock().  The lock expression below must stay identical to the one
 * in the OVS_RELEASES annotation. */
static void meter_unlock(const struct dp_netdev *dp, uint32_t meter_id)
    OVS_RELEASES(dp->meter_locks[meter_id % N_METER_LOCKS])
{
    ovs_mutex_unlock(&dp->meter_locks[meter_id % N_METER_LOCKS]);
}
 323
 324
/* Looks up a port of 'dp' by datapath port number.  The caller must hold
 * 'dp->port_mutex'.  (What is returned when there is no such port is defined
 * with the function body, outside this part of the file.) */
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                    odp_port_t)
    OVS_REQUIRES(dp->port_mutex);
 328
/* Per-pmd-thread packet counters.  The values index 'n' in struct
 * dp_netdev_pmd_stats and 'stats_zero' in struct dp_netdev_pmd_thread;
 * DP_N_STATS is the number of counters and must stay last. */
enum dp_stat_type {
    DP_STAT_EXACT_HIT,          /* Packets that had an exact match (emc). */
    DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
    DP_STAT_MISS,               /* Packets that did not match. */
    DP_STAT_LOST,               /* Packets not passed up to the client. */
    DP_STAT_LOOKUP_HIT,         /* Number of subtable lookups for flow table
                                   hits */
    DP_N_STATS
};
 338
/* Per-pmd-thread cycle counters.  The values index 'n' in struct
 * dp_netdev_pmd_cycles and 'cycles_zero' in struct dp_netdev_pmd_thread;
 * PMD_N_CYCLES is the number of counters and must stay last. */
enum pmd_cycles_counter_type {
    PMD_CYCLES_IDLE,            /* Cycles spent idle or unsuccessful polling */
    PMD_CYCLES_PROCESSING,      /* Cycles spent successfully polling and
                                 * processing polled packets */
    PMD_N_CYCLES
};
 345
/* Per-rx-queue cycle counters.  The values index 'cycles' in struct
 * dp_netdev_rxq; RXQ_N_CYCLES is the number of counters and must stay
 * last. */
enum rxq_cycles_counter_type {
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
                                   processing packets during the current
                                   interval. */
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
                                   during rxq to pmd assignment. */
    RXQ_N_CYCLES
};
 354
/* Timeout, in milliseconds, used by XPS (the dynamic tx queue selection that
 * 'dynamic_txqs' in struct dp_netdev_port enables).
 * NOTE(review): presumably the idle time after which a tx queue id is given
 * up -- confirm against dpif_netdev_xps_revalidate_pmd(). */
#define XPS_TIMEOUT_MS 500LL
 356
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
struct dp_netdev_rxq {
    struct dp_netdev_port *port;       /* Port that owns this queue. */
    struct netdev_rxq *rx;             /* Underlying netdev rx queue. */
    unsigned core_id;                  /* Core to which this queue should be
                                          pinned. OVS_CORE_UNSPEC if the
                                          queue doesn't need to be pinned to a
                                          particular core. */
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */

    /* Counters of cycles spent successfully polling and processing pkts.
     * Indexed by RXQ_CYCLES_*. */
    atomic_ullong cycles[RXQ_N_CYCLES];
    /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
       sum them to yield the cycles used for an rxq. */
    atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
    unsigned intrvl_idx;               /* Write index for 'cycles_intrvl'. */
};
 374
/* A port in a netdev-based datapath.
 *
 * Ports are reached through 'ports' in struct dp_netdev, so access requires
 * that datapath's 'port_mutex'. */
struct dp_netdev_port {
    odp_port_t port_no;
    bool dynamic_txqs;          /* If true XPS will be used. */
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
    struct netdev *netdev;
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
    struct netdev_saved_flags *sf;
    struct dp_netdev_rxq *rxqs;
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
    struct ovs_mutex txq_used_mutex;
    char *type;                 /* Port type as requested by user. */
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
};
 390
/* Contained by struct dp_netdev_flow's 'stats' member.  Every counter is an
 * atomic type. */
struct dp_netdev_flow_stats {
    atomic_llong used;             /* Last used time, in monotonic msecs. */
    atomic_ullong packet_count;    /* Number of packets matched. */
    atomic_ullong byte_count;      /* Number of bytes matched. */
    atomic_uint16_t tcp_flags;     /* Bitwise-OR of seen tcp_flags values. */
};
 398
 399 /* A flow in 'dp_netdev_pmd_thread's 'flow_table'.
 400  *
 401  *
 402  * Thread-safety
 403  * =============
 404  *
 405  * Except near the beginning or ending of its lifespan, rule 'rule' belongs to
 406  * its pmd thread's classifier.  The text below calls this classifier 'cls'.
 407  *
 408  * Motivation
 409  * ----------
 410  *
 411  * The thread safety rules described here for "struct dp_netdev_flow" are
 412  * motivated by two goals:
 413  *
 414  *    - Prevent threads that read members of "struct dp_netdev_flow" from
 415  *      reading bad data due to changes by some thread concurrently modifying
 416  *      those members.
 417  *
 418  *    - Prevent two threads making changes to members of a given "struct
 419  *      dp_netdev_flow" from interfering with each other.
 420  *
 421  *
 422  * Rules
 423  * -----
 424  *
 425  * A flow 'flow' may be accessed without a risk of being freed during an RCU
 426  * grace period.  Code that needs to hold onto a flow for a while
 427  * should try incrementing 'flow->ref_cnt' with dp_netdev_flow_ref().
 428  *
 429  * 'flow->ref_cnt' protects 'flow' from being freed.  It doesn't protect the
 430  * flow from being deleted from 'cls' and it doesn't protect members of 'flow'
 431  * from modification.
 432  *
 433  * Some members, marked 'const', are immutable.  Accessing other members
 434  * requires synchronization, as noted in more detail below.
 435  */
struct dp_netdev_flow {
    const struct flow flow;      /* Unmasked flow that created this entry. */
    /* Hash table index by unmasked flow. */
    const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
                                 /* 'flow_table'. */
    const ovs_u128 ufid;         /* Unique flow identifier. */
    const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */
                                 /* flow. */

    /* Number of references.
     * The classifier owns one reference.
     * Any thread trying to keep a rule from being freed should hold its own
     * reference. */
    struct ovs_refcount ref_cnt;

    /* NOTE(review): presumably set once the flow has been removed from its
     * pmd thread's tables -- confirm against the flow removal code. */
    bool dead;

    /* Statistics. */
    struct dp_netdev_flow_stats stats;

    /* Actions. */
    OVSRCU_TYPE(struct dp_netdev_actions *) actions;

    /* While processing a group of input packets, the datapath uses the next
     * member to store a pointer to the output batch for the flow.  It is
     * reset after the batch has been sent out (See dp_netdev_queue_batches(),
     * packet_batch_per_flow_init() and packet_batch_per_flow_execute()). */
    struct packet_batch_per_flow *batch;

    /* Packet classification. */
    struct dpcls_rule cr;        /* In owning pmd thread's classifier. */
    /* 'cr' must be the last member: struct dpcls_rule ends in a
     * variable-length key. */
};
 469
/* Reference counting on flows (see 'ref_cnt' in struct dp_netdev_flow).
 * dp_netdev_flow_ref() returns bool, so taking a reference can fail. */
static void dp_netdev_flow_unref(struct dp_netdev_flow *);
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
/* Converts netlink attributes into a 'struct flow'; returns an int status.
 * (Parameter meanings are defined with the function body, outside this part
 * of the file.) */
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
                                         struct flow *, bool);
 474
/* A set of datapath actions within a "struct dp_netdev_flow".
 *
 *
 * Thread-safety
 * =============
 *
 * A struct dp_netdev_actions 'actions' is protected with RCU.
 *
 * The structure is variable-length: 'size' bytes of netlink attributes are
 * stored inline in the trailing flexible array. */
struct dp_netdev_actions {
    /* These members are immutable: they do not change during the struct's
     * lifetime.  */
    unsigned int size;          /* Size of 'actions', in bytes. */
    struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
};
 488
/* Construction, lookup and destruction of 'struct dp_netdev_actions'.  The
 * first two functions have external linkage; dp_netdev_actions_free() is
 * private to this file. */
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
                                                   size_t);
struct dp_netdev_actions *dp_netdev_flow_get_actions(
    const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);
 494
/* Contained by struct dp_netdev_pmd_thread's 'stats' member.  Only the
 * owning pmd thread writes these counters (see the note on 'stats_zero' in
 * struct dp_netdev_pmd_thread). */
struct dp_netdev_pmd_stats {
    /* Indexed by DP_STAT_*. */
    atomic_ullong n[DP_N_STATS];
};
 500
/* Contained by struct dp_netdev_pmd_thread's 'cycles' member.  Only the
 * owning pmd thread writes these counters (see the note on 'cycles_zero' in
 * struct dp_netdev_pmd_thread). */
struct dp_netdev_pmd_cycles {
    /* Indexed by PMD_CYCLES_*. */
    atomic_ullong n[PMD_N_CYCLES];
};
 506
/* An rx queue paired with a datapath port number.
 * NOTE(review): presumably a pmd thread's private snapshot of one entry of
 * its poll list -- confirm against pmd_thread_main(). */
struct polled_queue {
    struct dp_netdev_rxq *rxq;
    odp_port_t port_no;
};
 511
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
struct rxq_poll {
    struct dp_netdev_rxq *rxq;  /* The rx queue to poll. */
    struct hmap_node node;      /* Node in the pmd thread's 'poll_list'. */
};
 517
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
 * 'tnl_port_cache' or 'tx_ports'.
 *
 * NOTE(review): 'qid' and 'last_used' look like the per-port state of XPS
 * (see dpif_netdev_xps_get_tx_qid(), which takes a 'struct tx_port *' and a
 * time) -- confirm against that function. */
struct tx_port {
    struct dp_netdev_port *port;
    int qid;
    long long last_used;
    struct hmap_node node;
};
 526
/* PMD: Poll mode drivers.  PMD accesses devices via polling to eliminate
 * the performance overhead of interrupt processing.  Therefore netdev can
 * not implement rx-wait for these devices.  dpif-netdev needs to poll
 * these devices to check for recv buffer.  pmd-thread does polling for
 * devices assigned to itself.
 *
 * DPDK uses PMD for accessing NIC.
 *
 * Note, instance with cpu core id NON_PMD_CORE_ID will be reserved for
 * I/O of all non-pmd threads.  There will be no actual thread created
 * for the instance.
 *
 * Each struct has its own flow cache and classifier per managed ingress port.
 * For packets received on ingress port, a look up is done on corresponding PMD
 * thread's flow cache and in case of a miss, lookup is performed in the
 * corresponding classifier of port.  Packets are executed with the found
 * actions in either case.
 * */
struct dp_netdev_pmd_thread {
    struct dp_netdev *dp;
    struct ovs_refcount ref_cnt;    /* Every reference must be refcount'ed. */
    struct cmap_node node;          /* In 'dp->poll_threads'. */

    pthread_cond_t cond;            /* For synchronizing pmd thread reload. */
    struct ovs_mutex cond_mutex;    /* Mutex for condition variable. */

    /* Per thread exact-match cache.  Note, the instance for cpu core
     * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
     * need to be protected by 'non_pmd_mutex'.  Every other instance
     * will only be accessed by its own pmd thread. */
    struct emc_cache flow_cache;

    /* Flow-Table and classifiers
     *
     * Writers of 'flow_table' must take the 'flow_mutex'.  Corresponding
     * changes to 'classifiers' must be made while still holding the
     * 'flow_mutex'.
     */
    struct ovs_mutex flow_mutex;
    struct cmap flow_table OVS_GUARDED; /* Flow table. */

    /* One classifier per in_port polled by the pmd */
    struct cmap classifiers;
    /* Periodically sort subtable vectors according to hit frequencies */
    long long int next_optimization;

    /* Statistics. */
    struct dp_netdev_pmd_stats stats;

    /* Cycles counters */
    struct dp_netdev_pmd_cycles cycles;

    /* Used to count cycles. See 'cycles_counter_end()' */
    unsigned long long last_cycles;

    struct latch exit_latch;        /* For terminating the pmd thread. */
    struct seq *reload_seq;
    uint64_t last_reload_seq;
    atomic_bool reload;             /* Do we need to reload ports? */
    pthread_t thread;
    unsigned core_id;               /* CPU core id of this pmd thread. */
    int numa_id;                    /* numa node id of this pmd thread. */
    bool isolated;

    /* Queue id used by this pmd thread to send packets on all netdevs if
     * XPS disabled for this netdev. All static_tx_qid's are unique and less
     * than 'cmap_count(dp->poll_threads)'. */
    uint32_t static_tx_qid;

    struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
    /* List of rx queues to poll. */
    struct hmap poll_list OVS_GUARDED;
    /* Map of 'tx_port's used for transmission.  Written by the main thread,
     * read by the pmd thread. */
    struct hmap tx_ports OVS_GUARDED;

    /* These are thread-local copies of 'tx_ports'.  One contains only tunnel
     * ports (that support push_tunnel/pop_tunnel), the other contains ports
     * with at least one txq (that support send).  A port can be in both.
     *
     * There are two separate maps to make sure that we don't try to execute
     * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
     *
     * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
     * threads, and thusly need to be protected by 'non_pmd_mutex'.  Every
     * other instance will only be accessed by its own pmd thread. */
    struct hmap tnl_port_cache;
    struct hmap send_port_cache;

    /* Only a pmd thread can write on its own 'cycles' and 'stats'.
     * The main thread keeps 'stats_zero' and 'cycles_zero' as base
     * values and subtracts them from 'stats' and 'cycles' before
     * reporting to the user */
    unsigned long long stats_zero[DP_N_STATS];
    uint64_t cycles_zero[PMD_N_CYCLES];

    /* Set to true if the pmd thread needs to be reloaded. */
    bool need_reload;
};
 626
/* Interface to netdev-based datapath.
 *
 * The generic 'struct dpif' is embedded as the member 'dpif', which is how
 * dpif_netdev_cast() gets from a 'struct dpif *' back to this structure. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;       /* The datapath behind this handle. */
    uint64_t last_port_seq;
};
 633
/* Forward declarations for functions defined later in this file.  The
 * OVS_REQUIRES annotations name the mutex the caller must hold. */

/* Port lookup, port add/remove and datapath open/free. */
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                              struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
                            struct dp_netdev_port **portp)
    OVS_REQUIRES(dp->port_mutex);
static void dp_netdev_free(struct dp_netdev *)
    OVS_REQUIRES(dp_netdev_mutex);
static int do_add_port(struct dp_netdev *dp, const char *devname,
                       const char *type, odp_port_t port_no)
    OVS_REQUIRES(dp->port_mutex);
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
    OVS_REQUIRES(dp->port_mutex);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
/* Packet processing entry points. */
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                      struct dp_packet_batch *,
                                      bool may_steal, const struct flow *flow,
                                      const struct nlattr *actions,
                                      size_t actions_len,
                                      long long now);
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
                            struct dp_packet_batch *, odp_port_t port_no);
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                  struct dp_packet_batch *);

/* Upcall control and pmd thread setup/teardown. */
static void dp_netdev_disable_upcall(struct dp_netdev *);
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
                                    struct dp_netdev *dp, unsigned core_id,
                                    int numa_id);
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);

/* pmd thread main loop, lookup/iteration, and management of the rx queues
 * and tx ports assigned to a pmd thread. */
static void *pmd_thread_main(void *);
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                      unsigned core_id);
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_del_pmd(struct dp_netdev *dp,
                              struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                         struct dp_netdev_port *port)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                           struct tx_port *tx)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
                                     struct dp_netdev_rxq *rxq)
    OVS_REQUIRES(pmd->port_mutex);
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                       struct rxq_poll *poll)
    OVS_REQUIRES(pmd->port_mutex);
static void reconfigure_datapath(struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
    OVS_REQUIRES(pmd->port_mutex);
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
/* XPS (dynamic tx queue id) helpers. */
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               long long now, bool purge);
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                      struct tx_port *tx, long long now);

/* Exact match cache entry helpers, used by the emc_cache_*() functions. */
static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);
 707
 708 static void
 709 emc_cache_init(struct emc_cache *flow_cache)
 710 {
 711     int i;
 712
 713     flow_cache->sweep_idx = 0;
 714     for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 715         flow_cache->entries[i].flow = NULL;
 716         flow_cache->entries[i].key.hash = 0;
 717         flow_cache->entries[i].key.len = sizeof(struct miniflow);
 718         flowmap_init(&flow_cache->entries[i].key.mf.map);
 719     }
 720 }
 721
 722 static void
 723 emc_cache_uninit(struct emc_cache *flow_cache)
 724 {
 725     int i;
 726
 727     for (i = 0; i < ARRAY_SIZE(flow_cache->entries); i++) {
 728         emc_clear_entry(&flow_cache->entries[i]);
 729     }
 730 }
 731
 732 /* Check and clear dead flow references slowly (one entry at each
 733  * invocation).  */
 734 static void
 735 emc_cache_slow_sweep(struct emc_cache *flow_cache)
 736 {
 737     struct emc_entry *entry = &flow_cache->entries[flow_cache->sweep_idx];
 738
 739     if (!emc_entry_alive(entry)) {
 740         emc_clear_entry(entry);
 741     }
 742     flow_cache->sweep_idx = (flow_cache->sweep_idx + 1) & EM_FLOW_HASH_MASK;
 743 }
 744
 745 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
 746 bool
 747 dpif_is_netdev(const struct dpif *dpif)
 748 {
 749     return dpif->dpif_class->open == dpif_netdev_open;
 750 }
 751
 752 static struct dpif_netdev *
 753 dpif_netdev_cast(const struct dpif *dpif)
 754 {
 755     ovs_assert(dpif_is_netdev(dpif));
 756     return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
 757 }
 758
 759 static struct dp_netdev *
 760 get_dp_netdev(const struct dpif *dpif)
 761 {
 762     return dpif_netdev_cast(dpif)->dp;
 763 }
 764 \f
/* Kinds of per-pmd-thread information requests. */
enum pmd_info_type {
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
    PMD_INFO_SHOW_RXQ     /* Show poll-lists of pmd threads. */
};
 770
 771 static void
 772 pmd_info_show_stats(struct ds *reply,
 773                     struct dp_netdev_pmd_thread *pmd,
 774                     unsigned long long stats[DP_N_STATS],
 775                     uint64_t cycles[PMD_N_CYCLES])
 776 {
 777     unsigned long long total_packets = 0;
 778     uint64_t total_cycles = 0;
 779     int i;
 780
 781     /* These loops subtracts reference values ('*_zero') from the counters.
 782      * Since loads and stores are relaxed, it might be possible for a '*_zero'
 783      * value to be more recent than the current value we're reading from the
 784      * counter.  This is not a big problem, since these numbers are not
 785      * supposed to be too accurate, but we should at least make sure that
 786      * the result is not negative. */
 787     for (i = 0; i < DP_N_STATS; i++) {
 788         if (stats[i] > pmd->stats_zero[i]) {
 789             stats[i] -= pmd->stats_zero[i];
 790         } else {
 791             stats[i] = 0;
 792         }
 793
 794         if (i != DP_STAT_LOST) {
 795             /* Lost packets are already included in DP_STAT_MISS */
 796             total_packets += stats[i];
 797         }
 798     }
 799
 800     for (i = 0; i < PMD_N_CYCLES; i++) {
 801         if (cycles[i] > pmd->cycles_zero[i]) {
 802            cycles[i] -= pmd->cycles_zero[i];
 803         } else {
 804             cycles[i] = 0;
 805         }
 806
 807         total_cycles += cycles[i];
 808     }
 809
 810     ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
 811                         ? "main thread" : "pmd thread");
 812
 813     if (pmd->numa_id != OVS_NUMA_UNSPEC) {
 814         ds_put_format(reply, " numa_id %d", pmd->numa_id);
 815     }
 816     if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
 817         ds_put_format(reply, " core_id %u", pmd->core_id);
 818     }
 819     ds_put_cstr(reply, ":\n");
 820
 821     ds_put_format(reply,
 822                   "\temc hits:%llu\n\tmegaflow hits:%llu\n"
 823                   "\tavg. subtable lookups per hit:%.2f\n"
 824                   "\tmiss:%llu\n\tlost:%llu\n",
 825                   stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
 826                   stats[DP_STAT_MASKED_HIT] > 0
 827                   ? (1.0*stats[DP_STAT_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT]
 828                   : 0,
 829                   stats[DP_STAT_MISS], stats[DP_STAT_LOST]);
 830
 831     if (total_cycles == 0) {
 832         return;
 833     }
 834
 835     ds_put_format(reply,
 836                   "\tidle cycles:%"PRIu64" (%.02f%%)\n"
 837                   "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
 838                   cycles[PMD_CYCLES_IDLE],
 839                   cycles[PMD_CYCLES_IDLE] / (double)total_cycles * 100,
 840                   cycles[PMD_CYCLES_PROCESSING],
 841                   cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
 842
 843     if (total_packets == 0) {
 844         return;
 845     }
 846
 847     ds_put_format(reply,
 848                   "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
 849                   total_cycles / (double)total_packets,
 850                   total_cycles, total_packets);
 851
 852     ds_put_format(reply,
 853                   "\tavg processing cycles per packet: "
 854                   "%.02f (%"PRIu64"/%llu)\n",
 855                   cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
 856                   cycles[PMD_CYCLES_PROCESSING], total_packets);
 857 }
 858
 859 static void
 860 pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
 861                     struct dp_netdev_pmd_thread *pmd,
 862                     unsigned long long stats[DP_N_STATS],
 863                     uint64_t cycles[PMD_N_CYCLES])
 864 {
 865     int i;
 866
 867     /* We cannot write 'stats' and 'cycles' (because they're written by other
 868      * threads) and we shouldn't change 'stats' (because they're used to count
 869      * datapath stats, which must not be cleared here).  Instead, we save the
 870      * current values and subtract them from the values to be displayed in the
 871      * future */
 872     for (i = 0; i < DP_N_STATS; i++) {
 873         pmd->stats_zero[i] = stats[i];
 874     }
 875     for (i = 0; i < PMD_N_CYCLES; i++) {
 876         pmd->cycles_zero[i] = cycles[i];
 877     }
 878 }
 879
 880 static int
 881 compare_poll_list(const void *a_, const void *b_)
 882 {
 883     const struct rxq_poll *a = a_;
 884     const struct rxq_poll *b = b_;
 885
 886     const char *namea = netdev_rxq_get_name(a->rxq->rx);
 887     const char *nameb = netdev_rxq_get_name(b->rxq->rx);
 888
 889     int cmp = strcmp(namea, nameb);
 890     if (!cmp) {
 891         return netdev_rxq_get_queue_id(a->rxq->rx)
 892                - netdev_rxq_get_queue_id(b->rxq->rx);
 893     } else {
 894         return cmp;
 895     }
 896 }
 897
 898 static void
 899 sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
 900                  size_t *n)
 901 {
 902     struct rxq_poll *ret, *poll;
 903     size_t i;
 904
 905     *n = hmap_count(&pmd->poll_list);
 906     if (!*n) {
 907         ret = NULL;
 908     } else {
 909         ret = xcalloc(*n, sizeof *ret);
 910         i = 0;
 911         HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
 912             ret[i] = *poll;
 913             i++;
 914         }
 915         ovs_assert(i == *n);
 916         qsort(ret, *n, sizeof *ret, compare_poll_list);
 917     }
 918
 919     *list = ret;
 920 }
 921
 922 static void
 923 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 924 {
 925     if (pmd->core_id != NON_PMD_CORE_ID) {
 926         const char *prev_name = NULL;
 927         struct rxq_poll *list;
 928         size_t i, n;
 929
 930         ds_put_format(reply,
 931                       "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
 932                       pmd->numa_id, pmd->core_id, (pmd->isolated)
 933                                                   ? "true" : "false");
 934
 935         ovs_mutex_lock(&pmd->port_mutex);
 936         sorted_poll_list(pmd, &list, &n);
 937         for (i = 0; i < n; i++) {
 938             const char *name = netdev_rxq_get_name(list[i].rxq->rx);
 939
 940             if (!prev_name || strcmp(name, prev_name)) {
 941                 if (prev_name) {
 942                     ds_put_cstr(reply, "\n");
 943                 }
 944                 ds_put_format(reply, "\tport: %s\tqueue-id:", name);
 945             }
 946             ds_put_format(reply, " %d",
 947                           netdev_rxq_get_queue_id(list[i].rxq->rx));
 948             prev_name = name;
 949         }
 950         ovs_mutex_unlock(&pmd->port_mutex);
 951         ds_put_cstr(reply, "\n");
 952         free(list);
 953     }
 954 }
 955
 956 static int
 957 compare_poll_thread_list(const void *a_, const void *b_)
 958 {
 959     const struct dp_netdev_pmd_thread *a, *b;
 960
 961     a = *(struct dp_netdev_pmd_thread **)a_;
 962     b = *(struct dp_netdev_pmd_thread **)b_;
 963
 964     if (a->core_id < b->core_id) {
 965         return -1;
 966     }
 967     if (a->core_id > b->core_id) {
 968         return 1;
 969     }
 970     return 0;
 971 }
 972
 973 /* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
 974  * this list, as long as we do not go to quiescent state. */
 975 static void
 976 sorted_poll_thread_list(struct dp_netdev *dp,
 977                         struct dp_netdev_pmd_thread ***list,
 978                         size_t *n)
 979 {
 980     struct dp_netdev_pmd_thread *pmd;
 981     struct dp_netdev_pmd_thread **pmd_list;
 982     size_t k = 0, n_pmds;
 983
 984     n_pmds = cmap_count(&dp->poll_threads);
 985     pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
 986
 987     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
 988         if (k >= n_pmds) {
 989             break;
 990         }
 991         pmd_list[k++] = pmd;
 992     }
 993
 994     qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
 995
 996     *list = pmd_list;
 997     *n = k;
 998 }
 999
1000 static void
1001 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1002                      void *aux)
1003 {
1004     struct ds reply = DS_EMPTY_INITIALIZER;
1005     struct dp_netdev_pmd_thread **pmd_list;
1006     struct dp_netdev *dp = NULL;
1007     size_t n;
1008     enum pmd_info_type type = *(enum pmd_info_type *) aux;
1009
1010     ovs_mutex_lock(&dp_netdev_mutex);
1011
1012     if (argc == 2) {
1013         dp = shash_find_data(&dp_netdevs, argv[1]);
1014     } else if (shash_count(&dp_netdevs) == 1) {
1015         /* There's only one datapath */
1016         dp = shash_first(&dp_netdevs)->data;
1017     }
1018
1019     if (!dp) {
1020         ovs_mutex_unlock(&dp_netdev_mutex);
1021         unixctl_command_reply_error(conn,
1022                                     "please specify an existing datapath");
1023         return;
1024     }
1025
1026     sorted_poll_thread_list(dp, &pmd_list, &n);
1027     for (size_t i = 0; i < n; i++) {
1028         struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1029         if (!pmd) {
1030             break;
1031         }
1032
1033         if (type == PMD_INFO_SHOW_RXQ) {
1034             pmd_info_show_rxq(&reply, pmd);
1035         } else {
1036             unsigned long long stats[DP_N_STATS];
1037             uint64_t cycles[PMD_N_CYCLES];
1038
1039             /* Read current stats and cycle counters */
1040             for (size_t j = 0; j < ARRAY_SIZE(stats); j++) {
1041                 atomic_read_relaxed(&pmd->stats.n[j], &stats[j]);
1042             }
1043             for (size_t j = 0; j < ARRAY_SIZE(cycles); j++) {
1044                 atomic_read_relaxed(&pmd->cycles.n[j], &cycles[j]);
1045             }
1046
1047             if (type == PMD_INFO_CLEAR_STATS) {
1048                 pmd_info_clear_stats(&reply, pmd, stats, cycles);
1049             } else if (type == PMD_INFO_SHOW_STATS) {
1050                 pmd_info_show_stats(&reply, pmd, stats, cycles);
1051             }
1052         }
1053     }
1054     free(pmd_list);
1055
1056     ovs_mutex_unlock(&dp_netdev_mutex);
1057
1058     unixctl_command_reply(conn, ds_cstr(&reply));
1059     ds_destroy(&reply);
1060 }
1061 \f
1062 static int
1063 dpif_netdev_init(void)
1064 {
1065     static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1066                               clear_aux = PMD_INFO_CLEAR_STATS,
1067                               poll_aux = PMD_INFO_SHOW_RXQ;
1068
1069     unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
1070                              0, 1, dpif_netdev_pmd_info,
1071                              (void *)&show_aux);
1072     unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
1073                              0, 1, dpif_netdev_pmd_info,
1074                              (void *)&clear_aux);
1075     unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
1076                              0, 1, dpif_netdev_pmd_info,
1077                              (void *)&poll_aux);
1078     return 0;
1079 }
1080
1081 static int
1082 dpif_netdev_enumerate(struct sset *all_dps,
1083                       const struct dpif_class *dpif_class)
1084 {
1085     struct shash_node *node;
1086
1087     ovs_mutex_lock(&dp_netdev_mutex);
1088     SHASH_FOR_EACH(node, &dp_netdevs) {
1089         struct dp_netdev *dp = node->data;
1090         if (dpif_class != dp->class) {
1091             /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1092              * If the class doesn't match, skip this dpif. */
1093              continue;
1094         }
1095         sset_add(all_dps, node->name);
1096     }
1097     ovs_mutex_unlock(&dp_netdev_mutex);
1098
1099     return 0;
1100 }
1101
1102 static bool
1103 dpif_netdev_class_is_dummy(const struct dpif_class *class)
1104 {
1105     return class != &dpif_netdev_class;
1106 }
1107
/* Maps a port 'type' to the netdev type to open for it.  Every type is
 * returned unchanged except "internal", which becomes "dummy-internal" for a
 * dummy 'class' and "tap" otherwise. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal")) {
        return type;
    }
    if (dpif_netdev_class_is_dummy(class)) {
        return "dummy-internal";
    }
    return "tap";
}
1115
1116 static struct dpif *
1117 create_dpif_netdev(struct dp_netdev *dp)
1118 {
1119     uint16_t netflow_id = hash_string(dp->name, 0);
1120     struct dpif_netdev *dpif;
1121
1122     ovs_refcount_ref(&dp->ref_cnt);
1123
1124     dpif = xmalloc(sizeof *dpif);
1125     dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1126     dpif->dp = dp;
1127     dpif->last_port_seq = seq_read(dp->port_seq);
1128
1129     return &dpif->dpif;
1130 }
1131
1132 /* Choose an unused, non-zero port number and return it on success.
1133  * Return ODPP_NONE on failure. */
1134 static odp_port_t
1135 choose_port(struct dp_netdev *dp, const char *name)
1136     OVS_REQUIRES(dp->port_mutex)
1137 {
1138     uint32_t port_no;
1139
1140     if (dp->class != &dpif_netdev_class) {
1141         const char *p;
1142         int start_no = 0;
1143
1144         /* If the port name begins with "br", start the number search at
1145          * 100 to make writing tests easier. */
1146         if (!strncmp(name, "br", 2)) {
1147             start_no = 100;
1148         }
1149
1150         /* If the port name contains a number, try to assign that port number.
1151          * This can make writing unit tests easier because port numbers are
1152          * predictable. */
1153         for (p = name; *p != '\0'; p++) {
1154             if (isdigit((unsigned char) *p)) {
1155                 port_no = start_no + strtol(p, NULL, 10);
1156                 if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1157                     && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1158                     return u32_to_odp(port_no);
1159                 }
1160                 break;
1161             }
1162         }
1163     }
1164
1165     for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1166         if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1167             return u32_to_odp(port_no);
1168         }
1169     }
1170
1171     return ODPP_NONE;
1172 }
1173
/* Creates a new datapath named 'name' of class 'class'.
 *
 * The datapath is registered in 'dp_netdevs', all of its locks, sequences
 * and containers are initialized, upcalls are left disabled, the non-pmd
 * thread is set up, and a local port (ODPP_LOCAL) named after the datapath
 * is added.
 *
 * On success, stores the new datapath in '*dpp' and returns 0.  If adding
 * the local port fails, frees the datapath again and returns the error,
 * leaving '*dpp' untouched. */
static int
create_dp_netdev(const char *name, const struct dpif_class *class,
                 struct dp_netdev **dpp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev *dp;
    int error;

    dp = xzalloc(sizeof *dp);
    shash_add(&dp_netdevs, name, dp);

    /* 'class' and 'name' are const members; they are only written here,
     * at creation time. */
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
    ovs_refcount_init(&dp->ref_cnt);
    atomic_flag_clear(&dp->destroyed);

    ovs_mutex_init(&dp->port_mutex);
    hmap_init(&dp->ports);
    dp->port_seq = seq_create();
    fat_rwlock_init(&dp->upcall_rwlock);

    dp->reconfigure_seq = seq_create();
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);

    for (int i = 0; i < N_METER_LOCKS; ++i) {
        ovs_mutex_init_adaptive(&dp->meter_locks[i]);
    }

    /* Disable upcalls by default. */
    dp_netdev_disable_upcall(dp);
    dp->upcall_aux = NULL;
    dp->upcall_cb = NULL;

    conntrack_init(&dp->conntrack);

    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);

    cmap_init(&dp->poll_threads);

    ovs_mutex_init(&dp->tx_qid_pool_mutex);
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);

    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
    ovsthread_key_create(&dp->per_pmd_key, NULL);

    ovs_mutex_lock(&dp->port_mutex);
    /* non-PMD will be created before all other threads and will
     * allocate static_tx_qid = 0. */
    dp_netdev_set_nonpmd(dp);

    /* Add the datapath's local port, named after the datapath itself. */
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
                                                             "internal"),
                        ODPP_LOCAL);
    ovs_mutex_unlock(&dp->port_mutex);
    if (error) {
        /* dp_netdev_free() also removes 'dp' from 'dp_netdevs'. */
        dp_netdev_free(dp);
        return error;
    }

    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
    *dpp = dp;
    return 0;
}
1238
/* Flags 'dp' as needing reconfiguration by changing 'dp->reconfigure_seq'.
 * dp_netdev_is_reconf_required() detects the change by comparing the
 * sequence against 'dp->last_reconfigure_seq'. */
static void
dp_netdev_request_reconfigure(struct dp_netdev *dp)
{
    seq_change(dp->reconfigure_seq);
}
1244
1245 static bool
1246 dp_netdev_is_reconf_required(struct dp_netdev *dp)
1247 {
1248     return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1249 }
1250
1251 static int
1252 dpif_netdev_open(const struct dpif_class *class, const char *name,
1253                  bool create, struct dpif **dpifp)
1254 {
1255     struct dp_netdev *dp;
1256     int error;
1257
1258     ovs_mutex_lock(&dp_netdev_mutex);
1259     dp = shash_find_data(&dp_netdevs, name);
1260     if (!dp) {
1261         error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1262     } else {
1263         error = (dp->class != class ? EINVAL
1264                  : create ? EEXIST
1265                  : 0);
1266     }
1267     if (!error) {
1268         *dpifp = create_dpif_netdev(dp);
1269         dp->dpif = *dpifp;
1270     }
1271     ovs_mutex_unlock(&dp_netdev_mutex);
1272
1273     return error;
1274 }
1275
/* Releases and destroys 'dp->upcall_rwlock'.  Upcalls must be disabled when
 * this is called; the function asserts it.  Thread-safety analysis is turned
 * off for this function through its annotation. */
static void
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    /* Check that upcalls are disabled, i.e. that the rwlock is taken.  The
     * assertion passes only when the try-lock returns nonzero, that is, when
     * the read lock could not be acquired. */
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));

    /* Before freeing a lock we should release it. */
    fat_rwlock_unlock(&dp->upcall_rwlock);
    fat_rwlock_destroy(&dp->upcall_rwlock);
}
1287
1288 static void
1289 dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
1290     OVS_REQUIRES(dp->meter_locks[meter_id % N_METER_LOCKS])
1291 {
1292     if (dp->meters[meter_id]) {
1293         free(dp->meters[meter_id]);
1294         dp->meters[meter_id] = NULL;
1295     }
1296 }
1297
/* Destroys 'dp' and everything it owns: removes it from 'dp_netdevs',
 * deletes all of its ports, destroys its pmd threads, and then releases its
 * locks, sequences, conntrack state, meters and memory.
 *
 * Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
 * through the 'dp_netdevs' shash while freeing 'dp'. */
static void
dp_netdev_free(struct dp_netdev *dp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev_port *port, *next;

    /* Make 'dp' unreachable by name before tearing it down. */
    shash_find_and_delete(&dp_netdevs, dp->name);

    /* The _SAFE iterator is needed because do_del_port() removes 'port'
     * from 'dp->ports' and destroys it. */
    ovs_mutex_lock(&dp->port_mutex);
    HMAP_FOR_EACH_SAFE (port, next, node, &dp->ports) {
        do_del_port(dp, port);
    }
    ovs_mutex_unlock(&dp->port_mutex);

    dp_netdev_destroy_all_pmds(dp, true);
    cmap_destroy(&dp->poll_threads);

    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
    id_pool_destroy(dp->tx_qid_pool);

    ovs_mutex_destroy(&dp->non_pmd_mutex);
    ovsthread_key_delete(dp->per_pmd_key);

    conntrack_destroy(&dp->conntrack);


    seq_destroy(dp->reconfigure_seq);

    seq_destroy(dp->port_seq);
    hmap_destroy(&dp->ports);
    ovs_mutex_destroy(&dp->port_mutex);

    /* Upcalls must be disabled at this point */
    dp_netdev_destroy_upcall_lock(dp);

    int i;

    /* Delete every meter under its lock, then destroy the locks. */
    for (i = 0; i < MAX_METERS; ++i) {
        meter_lock(dp, i);
        dp_delete_meter(dp, i);
        meter_unlock(dp, i);
    }
    for (i = 0; i < N_METER_LOCKS; ++i) {
        ovs_mutex_destroy(&dp->meter_locks[i]);
    }

    free(dp->pmd_cmask);
    /* 'dp->name' is const-qualified but owned by 'dp'. */
    free(CONST_CAST(char *, dp->name));
    free(dp);
}
1350
1351 static void
1352 dp_netdev_unref(struct dp_netdev *dp)
1353 {
1354     if (dp) {
1355         /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1356          * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1357         ovs_mutex_lock(&dp_netdev_mutex);
1358         if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1359             dp_netdev_free(dp);
1360         }
1361         ovs_mutex_unlock(&dp_netdev_mutex);
1362     }
1363 }
1364
/* Closes 'dpif': releases the handle's reference on its datapath and frees
 * the handle itself. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    dp_netdev_unref(get_dp_netdev(dpif));
    free(dpif);
}
1373
1374 static int
1375 dpif_netdev_destroy(struct dpif *dpif)
1376 {
1377     struct dp_netdev *dp = get_dp_netdev(dpif);
1378
1379     if (!atomic_flag_test_and_set(&dp->destroyed)) {
1380         if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1381             /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1382             OVS_NOT_REACHED();
1383         }
1384     }
1385
1386     return 0;
1387 }
1388
1389 /* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1390  * load/store semantics.  While the increment is not atomic, the load and
1391  * store operations are, making it impossible to read inconsistent values.
1392  *
1393  * This is used to update thread local stats counters. */
1394 static void
1395 non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1396 {
1397     unsigned long long tmp;
1398
1399     atomic_read_relaxed(var, &tmp);
1400     tmp += n;
1401     atomic_store_relaxed(var, tmp);
1402 }
1403
1404 static int
1405 dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1406 {
1407     struct dp_netdev *dp = get_dp_netdev(dpif);
1408     struct dp_netdev_pmd_thread *pmd;
1409
1410     stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1411     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1412         unsigned long long n;
1413         stats->n_flows += cmap_count(&pmd->flow_table);
1414
1415         atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
1416         stats->n_hit += n;
1417         atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
1418         stats->n_hit += n;
1419         atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
1420         stats->n_missed += n;
1421         atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
1422         stats->n_lost += n;
1423     }
1424     stats->n_masks = UINT32_MAX;
1425     stats->n_mask_hit = UINT64_MAX;
1426
1427     return 0;
1428 }
1429
/* Makes 'pmd' reload its configuration.
 *
 * For the non-pmd thread (core_id == NON_PMD_CORE_ID) the cached ports are
 * reloaded directly here, under 'non_pmd_mutex' and the thread's
 * 'port_mutex'.
 *
 * For any other thread, changes 'reload_seq', sets the 'reload' flag and
 * then blocks on 'pmd->cond'. */
static void
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
{
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
        ovs_mutex_lock(&pmd->port_mutex);
        pmd_load_cached_ports(pmd);
        ovs_mutex_unlock(&pmd->port_mutex);
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
        return;
    }

    ovs_mutex_lock(&pmd->cond_mutex);
    seq_change(pmd->reload_seq);
    atomic_store_relaxed(&pmd->reload, true);
    /* NOTE(review): presumably the pmd thread signals 'pmd->cond' once it has
     * handled the reload -- confirm against the pmd main loop.  The wait is
     * not wrapped in a predicate loop, so a spurious wakeup would let this
     * function return early; verify that this is acceptable. */
    ovs_mutex_cond_wait(&pmd->cond, &pmd->cond_mutex);
    ovs_mutex_unlock(&pmd->cond_mutex);
}
1448
1449 static uint32_t
1450 hash_port_no(odp_port_t port_no)
1451 {
1452     return hash_int(odp_to_u32(port_no), 0);
1453 }
1454
1455 static int
1456 port_create(const char *devname, const char *type,
1457             odp_port_t port_no, struct dp_netdev_port **portp)
1458 {
1459     struct netdev_saved_flags *sf;
1460     struct dp_netdev_port *port;
1461     enum netdev_flags flags;
1462     struct netdev *netdev;
1463     int error;
1464
1465     *portp = NULL;
1466
1467     /* Open and validate network device. */
1468     error = netdev_open(devname, type, &netdev);
1469     if (error) {
1470         return error;
1471     }
1472     /* XXX reject non-Ethernet devices */
1473
1474     netdev_get_flags(netdev, &flags);
1475     if (flags & NETDEV_LOOPBACK) {
1476         VLOG_ERR("%s: cannot add a loopback device", devname);
1477         error = EINVAL;
1478         goto out;
1479     }
1480
1481     error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
1482     if (error) {
1483         VLOG_ERR("%s: cannot set promisc flag", devname);
1484         goto out;
1485     }
1486
1487     port = xzalloc(sizeof *port);
1488     port->port_no = port_no;
1489     port->netdev = netdev;
1490     port->type = xstrdup(type);
1491     port->sf = sf;
1492     port->need_reconfigure = true;
1493     ovs_mutex_init(&port->txq_used_mutex);
1494
1495     *portp = port;
1496
1497     return 0;
1498
1499 out:
1500     netdev_close(netdev);
1501     return error;
1502 }
1503
1504 static int
1505 do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1506             odp_port_t port_no)
1507     OVS_REQUIRES(dp->port_mutex)
1508 {
1509     struct dp_netdev_port *port;
1510     int error;
1511
1512     /* Reject devices already in 'dp'. */
1513     if (!get_port_by_name(dp, devname, &port)) {
1514         return EEXIST;
1515     }
1516
1517     error = port_create(devname, type, port_no, &port);
1518     if (error) {
1519         return error;
1520     }
1521
1522     hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1523     seq_change(dp->port_seq);
1524
1525     reconfigure_datapath(dp);
1526
1527     return 0;
1528 }
1529
1530 static int
1531 dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1532                      odp_port_t *port_nop)
1533 {
1534     struct dp_netdev *dp = get_dp_netdev(dpif);
1535     char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1536     const char *dpif_port;
1537     odp_port_t port_no;
1538     int error;
1539
1540     ovs_mutex_lock(&dp->port_mutex);
1541     dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1542     if (*port_nop != ODPP_NONE) {
1543         port_no = *port_nop;
1544         error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1545     } else {
1546         port_no = choose_port(dp, dpif_port);
1547         error = port_no == ODPP_NONE ? EFBIG : 0;
1548     }
1549     if (!error) {
1550         *port_nop = port_no;
1551         error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1552     }
1553     ovs_mutex_unlock(&dp->port_mutex);
1554
1555     return error;
1556 }
1557
1558 static int
1559 dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1560 {
1561     struct dp_netdev *dp = get_dp_netdev(dpif);
1562     int error;
1563
1564     ovs_mutex_lock(&dp->port_mutex);
1565     if (port_no == ODPP_LOCAL) {
1566         error = EINVAL;
1567     } else {
1568         struct dp_netdev_port *port;
1569
1570         error = get_port_by_number(dp, port_no, &port);
1571         if (!error) {
1572             do_del_port(dp, port);
1573         }
1574     }
1575     ovs_mutex_unlock(&dp->port_mutex);
1576
1577     return error;
1578 }
1579
1580 static bool
1581 is_valid_port_number(odp_port_t port_no)
1582 {
1583     return port_no != ODPP_NONE;
1584 }
1585
1586 static struct dp_netdev_port *
1587 dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1588     OVS_REQUIRES(dp->port_mutex)
1589 {
1590     struct dp_netdev_port *port;
1591
1592     HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1593         if (port->port_no == port_no) {
1594             return port;
1595         }
1596     }
1597     return NULL;
1598 }
1599
1600 static int
1601 get_port_by_number(struct dp_netdev *dp,
1602                    odp_port_t port_no, struct dp_netdev_port **portp)
1603     OVS_REQUIRES(dp->port_mutex)
1604 {
1605     if (!is_valid_port_number(port_no)) {
1606         *portp = NULL;
1607         return EINVAL;
1608     } else {
1609         *portp = dp_netdev_lookup_port(dp, port_no);
1610         return *portp ? 0 : ENODEV;
1611     }
1612 }
1613
1614 static void
1615 port_destroy(struct dp_netdev_port *port)
1616 {
1617     if (!port) {
1618         return;
1619     }
1620
1621     netdev_close(port->netdev);
1622     netdev_restore_flags(port->sf);
1623
1624     for (unsigned i = 0; i < port->n_rxq; i++) {
1625         netdev_rxq_close(port->rxqs[i].rx);
1626     }
1627     ovs_mutex_destroy(&port->txq_used_mutex);
1628     free(port->rxq_affinity_list);
1629     free(port->txq_used);
1630     free(port->rxqs);
1631     free(port->type);
1632     free(port);
1633 }
1634
1635 static int
1636 get_port_by_name(struct dp_netdev *dp,
1637                  const char *devname, struct dp_netdev_port **portp)
1638     OVS_REQUIRES(dp->port_mutex)
1639 {
1640     struct dp_netdev_port *port;
1641
1642     HMAP_FOR_EACH (port, node, &dp->ports) {
1643         if (!strcmp(netdev_get_name(port->netdev), devname)) {
1644             *portp = port;
1645             return 0;
1646         }
1647     }
1648
1649     /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
1650      * existing port. */
1651     return ENODEV;
1652 }
1653
1654 /* Returns 'true' if there is a port with pmd netdev. */
1655 static bool
1656 has_pmd_port(struct dp_netdev *dp)
1657     OVS_REQUIRES(dp->port_mutex)
1658 {
1659     struct dp_netdev_port *port;
1660
1661     HMAP_FOR_EACH (port, node, &dp->ports) {
1662         if (netdev_is_pmd(port->netdev)) {
1663             return true;
1664         }
1665     }
1666
1667     return false;
1668 }
1669
/* Removes 'port' from 'dp' and destroys it.  The port is first unlinked from
 * 'dp->ports' and 'dp->port_seq' is changed; the datapath is then
 * reconfigured before the port itself is destroyed. */
static void
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
    OVS_REQUIRES(dp->port_mutex)
{
    hmap_remove(&dp->ports, &port->node);
    seq_change(dp->port_seq);

    /* NOTE(review): presumably reconfiguring first makes the rest of the
     * datapath stop using 'port' before it is freed below -- confirm. */
    reconfigure_datapath(dp);

    port_destroy(port);
}
1681
1682 static void
1683 answer_port_query(const struct dp_netdev_port *port,
1684                   struct dpif_port *dpif_port)
1685 {
1686     dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1687     dpif_port->type = xstrdup(port->type);
1688     dpif_port->port_no = port->port_no;
1689 }
1690
1691 static int
1692 dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1693                                  struct dpif_port *dpif_port)
1694 {
1695     struct dp_netdev *dp = get_dp_netdev(dpif);
1696     struct dp_netdev_port *port;
1697     int error;
1698
1699     ovs_mutex_lock(&dp->port_mutex);
1700     error = get_port_by_number(dp, port_no, &port);
1701     if (!error && dpif_port) {
1702         answer_port_query(port, dpif_port);
1703     }
1704     ovs_mutex_unlock(&dp->port_mutex);
1705
1706     return error;
1707 }
1708
1709 static int
1710 dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1711                                struct dpif_port *dpif_port)
1712 {
1713     struct dp_netdev *dp = get_dp_netdev(dpif);
1714     struct dp_netdev_port *port;
1715     int error;
1716
1717     ovs_mutex_lock(&dp->port_mutex);
1718     error = get_port_by_name(dp, devname, &port);
1719     if (!error && dpif_port) {
1720         answer_port_query(port, dpif_port);
1721     }
1722     ovs_mutex_unlock(&dp->port_mutex);
1723
1724     return error;
1725 }
1726
/* Frees 'flow' together with the actions currently attached to it.  It is
 * used as the callback that dp_netdev_flow_unref() postpones through
 * ovsrcu_postpone() when the last reference is dropped. */
static void
dp_netdev_flow_free(struct dp_netdev_flow *flow)
{
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
    free(flow);
}
1733
1734 static void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1735 {
1736     if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1737         ovsrcu_postpone(dp_netdev_flow_free, flow);
1738     }
1739 }
1740
/* Returns the hash used to key a flow by its unique flow id 'ufid' in a pmd
 * thread's flow table.  The first 32-bit word of the ufid is used directly,
 * without further mixing. */
static uint32_t
dp_netdev_flow_hash(const ovs_u128 *ufid)
{
    return ufid->u32[0];
}
1746
1747 static inline struct dpcls *
1748 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1749                            odp_port_t in_port)
1750 {
1751     struct dpcls *cls;
1752     uint32_t hash = hash_port_no(in_port);
1753     CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1754         if (cls->in_port == in_port) {
1755             /* Port classifier exists already */
1756             return cls;
1757         }
1758     }
1759     return NULL;
1760 }
1761
1762 static inline struct dpcls *
1763 dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1764                          odp_port_t in_port)
1765     OVS_REQUIRES(pmd->flow_mutex)
1766 {
1767     struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1768     uint32_t hash = hash_port_no(in_port);
1769
1770     if (!cls) {
1771         /* Create new classifier for in_port */
1772         cls = xmalloc(sizeof(*cls));
1773         dpcls_init(cls);
1774         cls->in_port = in_port;
1775         cmap_insert(&pmd->classifiers, &cls->node, hash);
1776         VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1777     }
1778     return cls;
1779 }
1780
1781 static void
1782 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
1783                           struct dp_netdev_flow *flow)
1784     OVS_REQUIRES(pmd->flow_mutex)
1785 {
1786     struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
1787     struct dpcls *cls;
1788     odp_port_t in_port = flow->flow.in_port.odp_port;
1789
1790     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1791     ovs_assert(cls != NULL);
1792     dpcls_remove(cls, &flow->cr);
1793     cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
1794     flow->dead = true;
1795
1796     dp_netdev_flow_unref(flow);
1797 }
1798
1799 static void
1800 dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
1801 {
1802     struct dp_netdev_flow *netdev_flow;
1803
1804     ovs_mutex_lock(&pmd->flow_mutex);
1805     CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
1806         dp_netdev_pmd_remove_flow(pmd, netdev_flow);
1807     }
1808     ovs_mutex_unlock(&pmd->flow_mutex);
1809 }
1810
1811 static int
1812 dpif_netdev_flow_flush(struct dpif *dpif)
1813 {
1814     struct dp_netdev *dp = get_dp_netdev(dpif);
1815     struct dp_netdev_pmd_thread *pmd;
1816
1817     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1818         dp_netdev_pmd_flow_flush(pmd);
1819     }
1820
1821     return 0;
1822 }
1823
/* Iteration state for a port dump.  Allocated zeroed by
 * dpif_netdev_port_dump_start() and released by
 * dpif_netdev_port_dump_done(). */
struct dp_netdev_port_state {
    struct hmap_position position; /* Cursor handed to hmap_at_position(). */
    char *name;                    /* Heap copy of the name of the port most
                                    * recently returned; freed before being
                                    * replaced and when the dump finishes. */
};
1828
1829 static int
1830 dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
1831 {
1832     *statep = xzalloc(sizeof(struct dp_netdev_port_state));
1833     return 0;
1834 }
1835
1836 static int
1837 dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
1838                            struct dpif_port *dpif_port)
1839 {
1840     struct dp_netdev_port_state *state = state_;
1841     struct dp_netdev *dp = get_dp_netdev(dpif);
1842     struct hmap_node *node;
1843     int retval;
1844
1845     ovs_mutex_lock(&dp->port_mutex);
1846     node = hmap_at_position(&dp->ports, &state->position);
1847     if (node) {
1848         struct dp_netdev_port *port;
1849
1850         port = CONTAINER_OF(node, struct dp_netdev_port, node);
1851
1852         free(state->name);
1853         state->name = xstrdup(netdev_get_name(port->netdev));
1854         dpif_port->name = state->name;
1855         dpif_port->type = port->type;
1856         dpif_port->port_no = port->port_no;
1857
1858         retval = 0;
1859     } else {
1860         retval = EOF;
1861     }
1862     ovs_mutex_unlock(&dp->port_mutex);
1863
1864     return retval;
1865 }
1866
1867 static int
1868 dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
1869 {
1870     struct dp_netdev_port_state *state = state_;
1871     free(state->name);
1872     free(state);
1873     return 0;
1874 }
1875
1876 static int
1877 dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
1878 {
1879     struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1880     uint64_t new_port_seq;
1881     int error;
1882
1883     new_port_seq = seq_read(dpif->dp->port_seq);
1884     if (dpif->last_port_seq != new_port_seq) {
1885         dpif->last_port_seq = new_port_seq;
1886         error = ENOBUFS;
1887     } else {
1888         error = EAGAIN;
1889     }
1890
1891     return error;
1892 }
1893
1894 static void
1895 dpif_netdev_port_poll_wait(const struct dpif *dpif_)
1896 {
1897     struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
1898
1899     seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
1900 }
1901
1902 static struct dp_netdev_flow *
1903 dp_netdev_flow_cast(const struct dpcls_rule *cr)
1904 {
1905     return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
1906 }
1907
1908 static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
1909 {
1910     return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
1911 }
1912
1913 /* netdev_flow_key utilities.
1914  *
1915  * netdev_flow_key is basically a miniflow.  We use these functions
1916  * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
1917  * functions (miniflow_clone_inline, miniflow_equal, ...), because:
1918  *
1919  * - Since we are dealing exclusively with miniflows created by
1920  *   miniflow_extract(), if the map is different the miniflow is different.
1921  *   Therefore we can be faster by comparing the map and the miniflow in a
1922  *   single memcmp().
1923  * - These functions can be inlined by the compiler. */
1924
1925 /* Given the number of bits set in miniflow's maps, returns the size of the
1926  * 'netdev_flow_key.mf' */
1927 static inline size_t
1928 netdev_flow_key_size(size_t flow_u64s)
1929 {
1930     return sizeof(struct miniflow) + MINIFLOW_VALUES_SIZE(flow_u64s);
1931 }
1932
1933 static inline bool
1934 netdev_flow_key_equal(const struct netdev_flow_key *a,
1935                       const struct netdev_flow_key *b)
1936 {
1937     /* 'b->len' may be not set yet. */
1938     return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
1939 }
1940
1941 /* Used to compare 'netdev_flow_key' in the exact match cache to a miniflow.
1942  * The maps are compared bitwise, so both 'key->mf' and 'mf' must have been
1943  * generated by miniflow_extract. */
1944 static inline bool
1945 netdev_flow_key_equal_mf(const struct netdev_flow_key *key,
1946                          const struct miniflow *mf)
1947 {
1948     return !memcmp(&key->mf, mf, key->len);
1949 }
1950
1951 static inline void
1952 netdev_flow_key_clone(struct netdev_flow_key *dst,
1953                       const struct netdev_flow_key *src)
1954 {
1955     memcpy(dst, src,
1956            offsetof(struct netdev_flow_key, mf) + src->len);
1957 }
1958
1959 /* Initialize a netdev_flow_key 'mask' from 'match'. */
1960 static inline void
1961 netdev_flow_mask_init(struct netdev_flow_key *mask,
1962                       const struct match *match)
1963 {
1964     uint64_t *dst = miniflow_values(&mask->mf);
1965     struct flowmap fmap;
1966     uint32_t hash = 0;
1967     size_t idx;
1968
1969     /* Only check masks that make sense for the flow. */
1970     flow_wc_map(&match->flow, &fmap);
1971     flowmap_init(&mask->mf.map);
1972
1973     FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
1974         uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
1975
1976         if (mask_u64) {
1977             flowmap_set(&mask->mf.map, idx, 1);
1978             *dst++ = mask_u64;
1979             hash = hash_add64(hash, mask_u64);
1980         }
1981     }
1982
1983     map_t map;
1984
1985     FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
1986         hash = hash_add64(hash, map);
1987     }
1988
1989     size_t n = dst - miniflow_get_values(&mask->mf);
1990
1991     mask->hash = hash_finish(hash, n * 8);
1992     mask->len = netdev_flow_key_size(n);
1993 }
1994
1995 /* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
1996 static inline void
1997 netdev_flow_key_init_masked(struct netdev_flow_key *dst,
1998                             const struct flow *flow,
1999                             const struct netdev_flow_key *mask)
2000 {
2001     uint64_t *dst_u64 = miniflow_values(&dst->mf);
2002     const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2003     uint32_t hash = 0;
2004     uint64_t value;
2005
2006     dst->len = mask->len;
2007     dst->mf = mask->mf;   /* Copy maps. */
2008
2009     FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2010         *dst_u64 = value & *mask_u64++;
2011         hash = hash_add64(hash, *dst_u64++);
2012     }
2013     dst->hash = hash_finish(hash,
2014                             (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2015 }
2016
/* Iterates through the u64 values of netdev_flow_key 'KEY' that are selected
 * by 'FLOWMAP', assigning each to 'VALUE' in turn.  This is a thin wrapper
 * that applies MINIFLOW_FOR_EACH_IN_FLOWMAP to the key's embedded miniflow
 * ('KEY'->mf). */
#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
    MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
2020
2021 /* Returns a hash value for the bits of 'key' where there are 1-bits in
2022  * 'mask'. */
2023 static inline uint32_t
2024 netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
2025                              const struct netdev_flow_key *mask)
2026 {
2027     const uint64_t *p = miniflow_get_values(&mask->mf);
2028     uint32_t hash = 0;
2029     uint64_t value;
2030
2031     NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, key, mask->mf.map) {
2032         hash = hash_add64(hash, value & *p++);
2033     }
2034
2035     return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
2036 }
2037
2038 static inline bool
2039 emc_entry_alive(struct emc_entry *ce)
2040 {
2041     return ce->flow && !ce->flow->dead;
2042 }
2043
2044 static void
2045 emc_clear_entry(struct emc_entry *ce)
2046 {
2047     if (ce->flow) {
2048         dp_netdev_flow_unref(ce->flow);
2049         ce->flow = NULL;
2050     }
2051 }
2052
2053 static inline void
2054 emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2055                  const struct netdev_flow_key *key)
2056 {
2057     if (ce->flow != flow) {
2058         if (ce->flow) {
2059             dp_netdev_flow_unref(ce->flow);
2060         }
2061
2062         if (dp_netdev_flow_ref(flow)) {
2063             ce->flow = flow;
2064         } else {
2065             ce->flow = NULL;
2066         }
2067     }
2068     if (key) {
2069         netdev_flow_key_clone(&ce->key, key);
2070     }
2071 }
2072
2073 static inline void
2074 emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2075            struct dp_netdev_flow *flow)
2076 {
2077     struct emc_entry *to_be_replaced = NULL;
2078     struct emc_entry *current_entry;
2079
2080     EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2081         if (netdev_flow_key_equal(&current_entry->key, key)) {
2082             /* We found the entry with the 'mf' miniflow */
2083             emc_change_entry(current_entry, flow, NULL);
2084             return;
2085         }
2086
2087         /* Replacement policy: put the flow in an empty (not alive) entry, or
2088          * in the first entry where it can be */
2089         if (!to_be_replaced
2090             || (emc_entry_alive(to_be_replaced)
2091                 && !emc_entry_alive(current_entry))
2092             || current_entry->key.hash < to_be_replaced->key.hash) {
2093             to_be_replaced = current_entry;
2094         }
2095     }
2096     /* We didn't find the miniflow in the cache.
2097      * The 'to_be_replaced' entry is where the new flow will be stored */
2098
2099     emc_change_entry(to_be_replaced, flow, key);
2100 }
2101
2102 static inline void
2103 emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2104                          const struct netdev_flow_key *key,
2105                          struct dp_netdev_flow *flow)
2106 {
2107     /* Insert an entry into the EMC based on probability value 'min'. By
2108      * default the value is UINT32_MAX / 100 which yields an insertion
2109      * probability of 1/100 ie. 1% */
2110
2111     uint32_t min;
2112     atomic_read_relaxed(&pmd->dp->emc_insert_min, &min);
2113
2114     if (min && random_uint32() <= min) {
2115         emc_insert(&pmd->flow_cache, key, flow);
2116     }
2117 }
2118
2119 static inline struct dp_netdev_flow *
2120 emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
2121 {
2122     struct emc_entry *current_entry;
2123
2124     EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2125         if (current_entry->key.hash == key->hash
2126             && emc_entry_alive(current_entry)
2127             && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
2128
2129             /* We found the entry with the 'key->mf' miniflow */
2130             return current_entry->flow;
2131         }
2132     }
2133
2134     return NULL;
2135 }
2136
2137 static struct dp_netdev_flow *
2138 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2139                           const struct netdev_flow_key *key,
2140                           int *lookup_num_p)
2141 {
2142     struct dpcls *cls;
2143     struct dpcls_rule *rule;
2144     odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf, in_port));
2145     struct dp_netdev_flow *netdev_flow = NULL;
2146
2147     cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2148     if (OVS_LIKELY(cls)) {
2149         dpcls_lookup(cls, key, &rule, 1, lookup_num_p);
2150         netdev_flow = dp_netdev_flow_cast(rule);
2151     }
2152     return netdev_flow;
2153 }
2154
2155 static struct dp_netdev_flow *
2156 dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2157                         const ovs_u128 *ufidp, const struct nlattr *key,
2158                         size_t key_len)
2159 {
2160     struct dp_netdev_flow *netdev_flow;
2161     struct flow flow;
2162     ovs_u128 ufid;
2163
2164     /* If a UFID is not provided, determine one based on the key. */
2165     if (!ufidp && key && key_len
2166         && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2167         dpif_flow_hash(pmd->dp->dpif, &flow, sizeof flow, &ufid);
2168         ufidp = &ufid;
2169     }
2170
2171     if (ufidp) {
2172         CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2173                                  &pmd->flow_table) {
2174             if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2175                 return netdev_flow;
2176             }
2177         }
2178     }
2179
2180     return NULL;
2181 }
2182
2183 static void
2184 get_dpif_flow_stats(const struct dp_netdev_flow *netdev_flow_,
2185                     struct dpif_flow_stats *stats)
2186 {
2187     struct dp_netdev_flow *netdev_flow;
2188     unsigned long long n;
2189     long long used;
2190     uint16_t flags;
2191
2192     netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2193
2194     atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2195     stats->n_packets = n;
2196     atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2197     stats->n_bytes = n;
2198     atomic_read_relaxed(&netdev_flow->stats.used, &used);
2199     stats->used = used;
2200     atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2201     stats->tcp_flags = flags;
2202 }
2203
2204 /* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2205  * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2206  * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2207  * protect them. */
2208 static void
2209 dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
2210                             struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2211                             struct dpif_flow *flow, bool terse)
2212 {
2213     if (terse) {
2214         memset(flow, 0, sizeof *flow);
2215     } else {
2216         struct flow_wildcards wc;
2217         struct dp_netdev_actions *actions;
2218         size_t offset;
2219         struct odp_flow_key_parms odp_parms = {
2220             .flow = &netdev_flow->flow,
2221             .mask = &wc.masks,
2222             .support = dp_netdev_support,
2223         };
2224
2225         miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2226         /* in_port is exact matched, but we have left it out from the mask for
2227          * optimnization reasons. Add in_port back to the mask. */
2228         wc.masks.in_port.odp_port = ODPP_NONE;
2229
2230         /* Key */
2231         offset = key_buf->size;
2232         flow->key = ofpbuf_tail(key_buf);
2233         odp_flow_key_from_flow(&odp_parms, key_buf);
2234         flow->key_len = key_buf->size - offset;
2235
2236         /* Mask */
2237         offset = mask_buf->size;
2238         flow->mask = ofpbuf_tail(mask_buf);
2239         odp_parms.key_buf = key_buf;
2240         odp_flow_key_from_mask(&odp_parms, mask_buf);
2241         flow->mask_len = mask_buf->size - offset;
2242
2243         /* Actions */
2244         actions = dp_netdev_flow_get_actions(netdev_flow);
2245         flow->actions = actions->actions;
2246         flow->actions_len = actions->size;
2247     }
2248
2249     flow->ufid = netdev_flow->ufid;
2250     flow->ufid_present = true;
2251     flow->pmd_id = netdev_flow->pmd_id;
2252     get_dpif_flow_stats(netdev_flow, &flow->stats);
2253 }
2254
2255 static int
2256 dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2257                               const struct nlattr *mask_key,
2258                               uint32_t mask_key_len, const struct flow *flow,
2259                               struct flow_wildcards *wc, bool probe)
2260 {
2261     enum odp_key_fitness fitness;
2262
2263     fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow);
2264     if (fitness) {
2265         if (!probe) {
2266             /* This should not happen: it indicates that
2267              * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2268              * disagree on the acceptable form of a mask.  Log the problem
2269              * as an error, with enough details to enable debugging. */
2270             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2271
2272             if (!VLOG_DROP_ERR(&rl)) {
2273                 struct ds s;
2274
2275                 ds_init(&s);
2276                 odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2277                                 true);
2278                 VLOG_ERR("internal error parsing flow mask %s (%s)",
2279                 ds_cstr(&s), odp_key_fitness_to_string(fitness));
2280                 ds_destroy(&s);
2281             }
2282         }
2283
2284         return EINVAL;
2285     }
2286
2287     return 0;
2288 }
2289
2290 static int
2291 dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2292                               struct flow *flow, bool probe)
2293 {
2294     if (odp_flow_key_to_flow(key, key_len, flow)) {
2295         if (!probe) {
2296             /* This should not happen: it indicates that
2297              * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
2298              * the acceptable form of a flow.  Log the problem as an error,
2299              * with enough details to enable debugging. */
2300             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2301
2302             if (!VLOG_DROP_ERR(&rl)) {
2303                 struct ds s;
2304
2305                 ds_init(&s);
2306                 odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
2307                 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2308                 ds_destroy(&s);
2309             }
2310         }
2311
2312         return EINVAL;
2313     }
2314
2315     if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2316         return EINVAL;
2317     }
2318
2319     return 0;
2320 }
2321
2322 static int
2323 dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2324 {
2325     struct dp_netdev *dp = get_dp_netdev(dpif);
2326     struct dp_netdev_flow *netdev_flow;
2327     struct dp_netdev_pmd_thread *pmd;
2328     struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2329     struct hmapx_node *node;
2330     int error = EINVAL;
2331
2332     if (get->pmd_id == PMD_ID_NULL) {
2333         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2334             if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2335                 dp_netdev_pmd_unref(pmd);
2336             }
2337         }
2338     } else {
2339         pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2340         if (!pmd) {
2341             goto out;
2342         }
2343         hmapx_add(&to_find, pmd);
2344     }
2345
2346     if (!hmapx_count(&to_find)) {
2347         goto out;
2348     }
2349
2350     HMAPX_FOR_EACH (node, &to_find) {
2351         pmd = (struct dp_netdev_pmd_thread *) node->data;
2352         netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2353                                               get->key_len);
2354         if (netdev_flow) {
2355             dp_netdev_flow_to_dpif_flow(netdev_flow, get->buffer, get->buffer,
2356                                         get->flow, false);
2357             error = 0;
2358             break;
2359         } else {
2360             error = ENOENT;
2361         }
2362     }
2363
2364     HMAPX_FOR_EACH (node, &to_find) {
2365         pmd = (struct dp_netdev_pmd_thread *) node->data;
2366         dp_netdev_pmd_unref(pmd);
2367     }
2368 out:
2369     hmapx_destroy(&to_find);
2370     return error;
2371 }
2372
2373 static struct dp_netdev_flow *
2374 dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
2375                    struct match *match, const ovs_u128 *ufid,
2376                    const struct nlattr *actions, size_t actions_len)
2377     OVS_REQUIRES(pmd->flow_mutex)
2378 {
2379     struct dp_netdev_flow *flow;
2380     struct netdev_flow_key mask;
2381     struct dpcls *cls;
2382
2383     /* Make sure in_port is exact matched before we read it. */
2384     ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
2385     odp_port_t in_port = match->flow.in_port.odp_port;
2386
2387     /* As we select the dpcls based on the port number, each netdev flow
2388      * belonging to the same dpcls will have the same odp_port value.
2389      * For performance reasons we wildcard odp_port here in the mask.  In the
2390      * typical case dp_hash is also wildcarded, and the resulting 8-byte
2391      * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
2392      * will not be part of the subtable mask.
2393      * This will speed up the hash computation during dpcls_lookup() because
2394      * there is one less call to hash_add64() in this case. */
2395     match->wc.masks.in_port.odp_port = 0;
2396     netdev_flow_mask_init(&mask, match);
2397     match->wc.masks.in_port.odp_port = ODPP_NONE;
2398
2399     /* Make sure wc does not have metadata. */
2400     ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
2401                && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
2402
2403     /* Do not allocate extra space. */
2404     flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
2405     memset(&flow->stats, 0, sizeof flow->stats);
2406     flow->dead = false;
2407     flow->batch = NULL;
2408     *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
2409     *CONST_CAST(struct flow *, &flow->flow) = match->flow;
2410     *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
2411     ovs_refcount_init(&flow->ref_cnt);
2412     ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
2413
2414     netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
2415
2416     /* Select dpcls for in_port. Relies on in_port to be exact match. */
2417     cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
2418     dpcls_insert(cls, &flow->cr, &mask);
2419
2420     cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
2421                 dp_netdev_flow_hash(&flow->ufid));
2422
2423     if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
2424         struct ds ds = DS_EMPTY_INITIALIZER;
2425         struct ofpbuf key_buf, mask_buf;
2426         struct odp_flow_key_parms odp_parms = {
2427             .flow = &match->flow,
2428             .mask = &match->wc.masks,
2429             .support = dp_netdev_support,
2430         };
2431
2432         ofpbuf_init(&key_buf, 0);
2433         ofpbuf_init(&mask_buf, 0);
2434
2435         odp_flow_key_from_flow(&odp_parms, &key_buf);
2436         odp_parms.key_buf = &key_buf;
2437         odp_flow_key_from_mask(&odp_parms, &mask_buf);
2438
2439         ds_put_cstr(&ds, "flow_add: ");
2440         odp_format_ufid(ufid, &ds);
2441         ds_put_cstr(&ds, " ");
2442         odp_flow_format(key_buf.data, key_buf.size,
2443                         mask_buf.data, mask_buf.size,
2444                         NULL, &ds, false);
2445         ds_put_cstr(&ds, ", actions:");
2446         format_odp_actions(&ds, actions, actions_len, NULL);
2447
2448         VLOG_DBG("%s", ds_cstr(&ds));
2449
2450         ofpbuf_uninit(&key_buf);
2451         ofpbuf_uninit(&mask_buf);
2452
2453         /* Add a printout of the actual match installed. */
2454         struct match m;
2455         ds_clear(&ds);
2456         ds_put_cstr(&ds, "flow match: ");
2457         miniflow_expand(&flow->cr.flow.mf, &m.flow);
2458         miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
2459         memset(&m.tun_md, 0, sizeof m.tun_md);
2460         match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
2461
2462         VLOG_DBG("%s", ds_cstr(&ds));
2463
2464         ds_destroy(&ds);
2465     }
2466
2467     return flow;
2468 }
2469
2470 static int
2471 flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
2472                 struct netdev_flow_key *key,
2473                 struct match *match,
2474                 ovs_u128 *ufid,
2475                 const struct dpif_flow_put *put,
2476                 struct dpif_flow_stats *stats)
2477 {
2478     struct dp_netdev_flow *netdev_flow;
2479     int error = 0;
2480
2481     if (stats) {
2482         memset(stats, 0, sizeof *stats);
2483     }
2484
2485     ovs_mutex_lock(&pmd->flow_mutex);
2486     netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
2487     if (!netdev_flow) {
2488         if (put->flags & DPIF_FP_CREATE) {
2489             if (cmap_count(&pmd->flow_table) < MAX_FLOWS) {
2490                 dp_netdev_flow_add(pmd, match, ufid, put->actions,
2491                                    put->actions_len);
2492                 error = 0;
2493             } else {
2494                 error = EFBIG;
2495             }
2496         } else {
2497             error = ENOENT;
2498         }
2499     } else {
2500         if (put->flags & DPIF_FP_MODIFY) {
2501             struct dp_netdev_actions *new_actions;
2502             struct dp_netdev_actions *old_actions;
2503
2504             new_actions = dp_netdev_actions_create(put->actions,
2505                                                    put->actions_len);
2506
2507             old_actions = dp_netdev_flow_get_actions(netdev_flow);
2508             ovsrcu_set(&netdev_flow->actions, new_actions);
2509
2510             if (stats) {
2511                 get_dpif_flow_stats(netdev_flow, stats);
2512             }
2513             if (put->flags & DPIF_FP_ZERO_STATS) {
2514                 /* XXX: The userspace datapath uses thread local statistics
2515                  * (for flows), which should be updated only by the owning
2516                  * thread.  Since we cannot write on stats memory here,
2517                  * we choose not to support this flag.  Please note:
2518                  * - This feature is currently used only by dpctl commands with
2519                  *   option --clear.
2520                  * - Should the need arise, this operation can be implemented
2521                  *   by keeping a base value (to be update here) for each
2522                  *   counter, and subtracting it before outputting the stats */
2523                 error = EOPNOTSUPP;
2524             }
2525
2526             ovsrcu_postpone(dp_netdev_actions_free, old_actions);
2527         } else if (put->flags & DPIF_FP_CREATE) {
2528             error = EEXIST;
2529         } else {
2530             /* Overlapping flow. */
2531             error = EINVAL;
2532         }
2533     }
2534     ovs_mutex_unlock(&pmd->flow_mutex);
2535     return error;
2536 }
2537
2538 static int
2539 dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
2540 {
2541     struct dp_netdev *dp = get_dp_netdev(dpif);
2542     struct netdev_flow_key key, mask;
2543     struct dp_netdev_pmd_thread *pmd;
2544     struct match match;
2545     ovs_u128 ufid;
2546     int error;
2547     bool probe = put->flags & DPIF_FP_PROBE;
2548
2549     if (put->stats) {
2550         memset(put->stats, 0, sizeof *put->stats);
2551     }
2552     error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
2553                                           probe);
2554     if (error) {
2555         return error;
2556     }
2557     error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
2558                                           put->mask, put->mask_len,
2559                                           &match.flow, &match.wc, probe);
2560     if (error) {
2561         return error;
2562     }
2563
2564     if (put->ufid) {
2565         ufid = *put->ufid;
2566     } else {
2567         dpif_flow_hash(dpif, &match.flow, sizeof match.flow, &ufid);
2568     }
2569
2570     /* Must produce a netdev_flow_key for lookup.
2571      * Use the same method as employed to create the key when adding
2572      * the flow to the dplcs to make sure they match. */
2573     netdev_flow_mask_init(&mask, &match);
2574     netdev_flow_key_init_masked(&key, &match.flow, &mask);
2575
2576     if (put->pmd_id == PMD_ID_NULL) {
2577         if (cmap_count(&dp->poll_threads) == 0) {
2578             return EINVAL;
2579         }
2580         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2581             struct dpif_flow_stats pmd_stats;
2582             int pmd_error;
2583
2584             pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
2585                                         &pmd_stats);
2586             if (pmd_error) {
2587                 error = pmd_error;
2588             } else if (put->stats) {
2589                 put->stats->n_packets += pmd_stats.n_packets;
2590                 put->stats->n_bytes += pmd_stats.n_bytes;
2591                 put->stats->used = MAX(put->stats->used, pmd_stats.used);
2592                 put->stats->tcp_flags |= pmd_stats.tcp_flags;
2593             }
2594         }
2595     } else {
2596         pmd = dp_netdev_get_pmd(dp, put->pmd_id);
2597         if (!pmd) {
2598             return EINVAL;
2599         }
2600         error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
2601         dp_netdev_pmd_unref(pmd);
2602     }
2603
2604     return error;
2605 }
2606
2607 static int
2608 flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
2609                 struct dpif_flow_stats *stats,
2610                 const struct dpif_flow_del *del)
2611 {
2612     struct dp_netdev_flow *netdev_flow;
2613     int error = 0;
2614
2615     ovs_mutex_lock(&pmd->flow_mutex);
2616     netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
2617                                           del->key_len);
2618     if (netdev_flow) {
2619         if (stats) {
2620             get_dpif_flow_stats(netdev_flow, stats);
2621         }
2622         dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2623     } else {
2624         error = ENOENT;
2625     }
2626     ovs_mutex_unlock(&pmd->flow_mutex);
2627
2628     return error;
2629 }
2630
2631 static int
2632 dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
2633 {
2634     struct dp_netdev *dp = get_dp_netdev(dpif);
2635     struct dp_netdev_pmd_thread *pmd;
2636     int error = 0;
2637
2638     if (del->stats) {
2639         memset(del->stats, 0, sizeof *del->stats);
2640     }
2641
2642     if (del->pmd_id == PMD_ID_NULL) {
2643         if (cmap_count(&dp->poll_threads) == 0) {
2644             return EINVAL;
2645         }
2646         CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2647             struct dpif_flow_stats pmd_stats;
2648             int pmd_error;
2649
2650             pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
2651             if (pmd_error) {
2652                 error = pmd_error;
2653             } else if (del->stats) {
2654                 del->stats->n_packets += pmd_stats.n_packets;
2655                 del->stats->n_bytes += pmd_stats.n_bytes;
2656                 del->stats->used = MAX(del->stats->used, pmd_stats.used);
2657                 del->stats->tcp_flags |= pmd_stats.tcp_flags;
2658             }
2659         }
2660     } else {
2661         pmd = dp_netdev_get_pmd(dp, del->pmd_id);
2662         if (!pmd) {
2663             return EINVAL;
2664         }
2665         error = flow_del_on_pmd(pmd, del->stats, del);
2666         dp_netdev_pmd_unref(pmd);
2667     }
2668
2669
2670     return error;
2671 }
2672
2673 struct dpif_netdev_flow_dump {
2674     struct dpif_flow_dump up;
2675     struct cmap_position poll_thread_pos;
2676     struct cmap_position flow_pos;
2677     struct dp_netdev_pmd_thread *cur_pmd;
2678     int status;
2679     struct ovs_mutex mutex;
2680 };
2681
2682 static struct dpif_netdev_flow_dump *
2683 dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
2684 {
2685     return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
2686 }
2687
2688 static struct dpif_flow_dump *
2689 dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
2690                              char *type OVS_UNUSED)
2691 {
2692     struct dpif_netdev_flow_dump *dump;
2693
2694     dump = xzalloc(sizeof *dump);
2695     dpif_flow_dump_init(&dump->up, dpif_);
2696     dump->up.terse = terse;
2697     ovs_mutex_init(&dump->mutex);
2698
2699     return &dump->up;
2700 }
2701
2702 static int
2703 dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
2704 {
2705     struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2706
2707     ovs_mutex_destroy(&dump->mutex);
2708     free(dump);
2709     return 0;
2710 }
2711
2712 struct dpif_netdev_flow_dump_thread {
2713     struct dpif_flow_dump_thread up;
2714     struct dpif_netdev_flow_dump *dump;
2715     struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
2716     struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
2717 };
2718
2719 static struct dpif_netdev_flow_dump_thread *
2720 dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
2721 {
2722     return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
2723 }
2724
2725 static struct dpif_flow_dump_thread *
2726 dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
2727 {
2728     struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
2729     struct dpif_netdev_flow_dump_thread *thread;
2730
2731     thread = xmalloc(sizeof *thread);
2732     dpif_flow_dump_thread_init(&thread->up, &dump->up);
2733     thread->dump = dump;
2734     return &thread->up;
2735 }
2736
/* Frees the per-thread dump state that contains 'thread_'. */
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    free(dpif_netdev_flow_dump_thread_cast(thread_));
}
2745
2746 static int
2747 dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
2748                            struct dpif_flow *flows, int max_flows)
2749 {
2750     struct dpif_netdev_flow_dump_thread *thread
2751         = dpif_netdev_flow_dump_thread_cast(thread_);
2752     struct dpif_netdev_flow_dump *dump = thread->dump;
2753     struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
2754     int n_flows = 0;
2755     int i;
2756
2757     ovs_mutex_lock(&dump->mutex);
2758     if (!dump->status) {
2759         struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dpif);
2760         struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
2761         struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
2762         int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
2763
2764         /* First call to dump_next(), extracts the first pmd thread.
2765          * If there is no pmd thread, returns immediately. */
2766         if (!pmd) {
2767             pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2768             if (!pmd) {
2769                 ovs_mutex_unlock(&dump->mutex);
2770                 return n_flows;
2771
2772             }
2773         }
2774
2775         do {
2776             for (n_flows = 0; n_flows < flow_limit; n_flows++) {
2777                 struct cmap_node *node;
2778
2779                 node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
2780                 if (!node) {
2781                     break;
2782                 }
2783                 netdev_flows[n_flows] = CONTAINER_OF(node,
2784                                                      struct dp_netdev_flow,
2785                                                      node);
2786             }
2787             /* When finishing dumping the current pmd thread, moves to
2788              * the next. */
2789             if (n_flows < flow_limit) {
2790                 memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
2791                 dp_netdev_pmd_unref(pmd);
2792                 pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
2793                 if (!pmd) {
2794                     dump->status = EOF;
2795                     break;
2796                 }
2797             }
2798             /* Keeps the reference to next caller. */
2799             dump->cur_pmd = pmd;
2800
2801             /* If the current dump is empty, do not exit the loop, since the
2802              * remaining pmds could have flows to be dumped.  Just dumps again
2803              * on the new 'pmd'. */
2804         } while (!n_flows);
2805     }
2806     ovs_mutex_unlock(&dump->mutex);
2807
2808     for (i = 0; i < n_flows; i++) {
2809         struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
2810         struct odputil_keybuf *keybuf = &thread->keybuf[i];
2811         struct dp_netdev_flow *netdev_flow = netdev_flows[i];
2812         struct dpif_flow *f = &flows[i];
2813         struct ofpbuf key, mask;
2814
2815         ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
2816         ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
2817         dp_netdev_flow_to_dpif_flow(netdev_flow, &key, &mask, f,
2818                                     dump->up.terse);
2819     }
2820
2821     return n_flows;
2822 }
2823
2824 static int
2825 dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
2826     OVS_NO_THREAD_SAFETY_ANALYSIS
2827 {
2828     struct dp_netdev *dp = get_dp_netdev(dpif);
2829     struct dp_netdev_pmd_thread *pmd;
2830     struct dp_packet_batch pp;
2831
2832     if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
2833         dp_packet_size(execute->packet) > UINT16_MAX) {
2834         return EINVAL;
2835     }
2836
2837     /* Tries finding the 'pmd'.  If NULL is returned, that means
2838      * the current thread is a non-pmd thread and should use
2839      * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
2840     pmd = ovsthread_getspecific(dp->per_pmd_key);
2841     if (!pmd) {
2842         pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
2843         if (!pmd) {
2844             return EBUSY;
2845         }
2846     }
2847
2848     if (execute->probe) {
2849         /* If this is part of a probe, Drop the packet, since executing
2850          * the action may actually cause spurious packets be sent into
2851          * the network. */
2852         return 0;
2853     }
2854
2855     /* If the current thread is non-pmd thread, acquires
2856      * the 'non_pmd_mutex'. */
2857     if (pmd->core_id == NON_PMD_CORE_ID) {
2858         ovs_mutex_lock(&dp->non_pmd_mutex);
2859     }
2860
2861     /* The action processing expects the RSS hash to be valid, because
2862      * it's always initialized at the beginning of datapath processing.
2863      * In this case, though, 'execute->packet' may not have gone through
2864      * the datapath at all, it may have been generated by the upper layer
2865      * (OpenFlow packet-out, BFD frame, ...). */
2866     if (!dp_packet_rss_valid(execute->packet)) {
2867         dp_packet_set_rss_hash(execute->packet,
2868                                flow_hash_5tuple(execute->flow, 0));
2869     }
2870
2871     dp_packet_batch_init_packet(&pp, execute->packet);
2872     dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
2873                               execute->actions, execute->actions_len,
2874                               time_msec());
2875
2876     if (pmd->core_id == NON_PMD_CORE_ID) {
2877         ovs_mutex_unlock(&dp->non_pmd_mutex);
2878         dp_netdev_pmd_unref(pmd);
2879     }
2880
2881     return 0;
2882 }
2883
2884 static void
2885 dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
2886 {
2887     size_t i;
2888
2889     for (i = 0; i < n_ops; i++) {
2890         struct dpif_op *op = ops[i];
2891
2892         switch (op->type) {
2893         case DPIF_OP_FLOW_PUT:
2894             op->error = dpif_netdev_flow_put(dpif, &op->u.flow_put);
2895             break;
2896
2897         case DPIF_OP_FLOW_DEL:
2898             op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
2899             break;
2900
2901         case DPIF_OP_EXECUTE:
2902             op->error = dpif_netdev_execute(dpif, &op->u.execute);
2903             break;
2904
2905         case DPIF_OP_FLOW_GET:
2906             op->error = dpif_netdev_flow_get(dpif, &op->u.flow_get);
2907             break;
2908         }
2909     }
2910 }
2911
2912 /* Applies datapath configuration from the database. Some of the changes are
2913  * actually applied in dpif_netdev_run(). */
2914 static int
2915 dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
2916 {
2917     struct dp_netdev *dp = get_dp_netdev(dpif);
2918     const char *cmask = smap_get(other_config, "pmd-cpu-mask");
2919     unsigned long long insert_prob =
2920         smap_get_ullong(other_config, "emc-insert-inv-prob",
2921                         DEFAULT_EM_FLOW_INSERT_INV_PROB);
2922     uint32_t insert_min, cur_min;
2923
2924     if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
2925         free(dp->pmd_cmask);
2926         dp->pmd_cmask = nullable_xstrdup(cmask);
2927         dp_netdev_request_reconfigure(dp);
2928     }
2929
2930     atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
2931     if (insert_prob <= UINT32_MAX) {
2932         insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
2933     } else {
2934         insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
2935         insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
2936     }
2937
2938     if (insert_min != cur_min) {
2939         atomic_store_relaxed(&dp->emc_insert_min, insert_min);
2940         if (insert_min == 0) {
2941             VLOG_INFO("EMC has been disabled");
2942         } else {
2943             VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
2944                       insert_prob, (100 / (float)insert_prob));
2945         }
2946     }
2947
2948     return 0;
2949 }
2950
2951 /* Parses affinity list and returns result in 'core_ids'. */
2952 static int
2953 parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
2954 {
2955     unsigned i;
2956     char *list, *copy, *key, *value;
2957     int error = 0;
2958
2959     for (i = 0; i < n_rxq; i++) {
2960         core_ids[i] = OVS_CORE_UNSPEC;
2961     }
2962
2963     if (!affinity_list) {
2964         return 0;
2965     }
2966
2967     list = copy = xstrdup(affinity_list);
2968
2969     while (ofputil_parse_key_value(&list, &key, &value)) {
2970         int rxq_id, core_id;
2971
2972         if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
2973             || !str_to_int(value, 0, &core_id) || core_id < 0) {
2974             error = EINVAL;
2975             break;
2976         }
2977
2978         if (rxq_id < n_rxq) {
2979             core_ids[rxq_id] = core_id;
2980         }
2981     }
2982
2983     free(copy);
2984     return error;
2985 }
2986
2987 /* Parses 'affinity_list' and applies configuration if it is valid. */
2988 static int
2989 dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
2990                                   const char *affinity_list)
2991 {
2992     unsigned *core_ids, i;
2993     int error = 0;
2994
2995     core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
2996     if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
2997         error = EINVAL;
2998         goto exit;
2999     }
3000
3001     for (i = 0; i < port->n_rxq; i++) {
3002         port->rxqs[i].core_id = core_ids[i];
3003     }
3004
3005 exit:
3006     free(core_ids);
3007     return error;
3008 }
3009
3010 /* Changes the affinity of port's rx queues.  The changes are actually applied
3011  * in dpif_netdev_run(). */
3012 static int
3013 dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
3014                             const struct smap *cfg)
3015 {
3016     struct dp_netdev *dp = get_dp_netdev(dpif);
3017     struct dp_netdev_port *port;
3018     int error = 0;
3019     const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
3020
3021     ovs_mutex_lock(&dp->port_mutex);
3022     error = get_port_by_number(dp, port_no, &port);
3023     if (error || !netdev_is_pmd(port->netdev)
3024         || nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
3025         goto unlock;
3026     }
3027
3028     error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
3029     if (error) {
3030         goto unlock;
3031     }
3032     free(port->rxq_affinity_list);
3033     port->rxq_affinity_list = nullable_xstrdup(affinity_list);
3034
3035     dp_netdev_request_reconfigure(dp);
3036 unlock:
3037     ovs_mutex_unlock(&dp->port_mutex);
3038     return error;
3039 }
3040
3041 static int
3042 dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
3043                               uint32_t queue_id, uint32_t *priority)
3044 {
3045     *priority = queue_id;
3046     return 0;
3047 }
3048
3049 \f
3050 /* Creates and returns a new 'struct dp_netdev_actions', whose actions are
3051  * a copy of the 'size' bytes of 'actions' input parameters. */
3052 struct dp_netdev_actions *
3053 dp_netdev_actions_create(const struct nlattr *actions, size_t size)
3054 {
3055     struct dp_netdev_actions *netdev_actions;
3056
3057     netdev_actions = xmalloc(sizeof *netdev_actions + size);
3058     memcpy(netdev_actions->actions, actions, size);
3059     netdev_actions->size = size;
3060
3061     return netdev_actions;
3062 }
3063
3064 struct dp_netdev_actions *
3065 dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
3066 {
3067     return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
3068 }
3069
/* Releases the memory of 'actions'.  The structure and its action bytes
 * live in one allocation (see dp_netdev_actions_create()), so a single
 * free() is enough. */
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    void *block = actions;

    free(block);
}
3075 \f
/* Returns the current value of the cycle counter: the value reported by
 * rte_get_tsc_cycles() when built with DPDK_NETDEV, 0 otherwise. */
static inline unsigned long long
cycles_counter(void)
{
    unsigned long long now = 0;

#ifdef DPDK_NETDEV
    now = rte_get_tsc_cycles();
#endif

    return now;
}
3085
/* Fake mutex, named only in the OVS_ACQUIRES()/OVS_RELEASES() annotations
 * of cycles_count_start() and cycles_count_end(), to make sure that the
 * calls to cycles_count_* are balanced. */
extern struct ovs_mutex cycles_counter_fake_mutex;
3088
3089 /* Start counting cycles.  Must be followed by 'cycles_count_end()' */
3090 static inline void
3091 cycles_count_start(struct dp_netdev_pmd_thread *pmd)
3092     OVS_ACQUIRES(&cycles_counter_fake_mutex)
3093     OVS_NO_THREAD_SAFETY_ANALYSIS
3094 {
3095     pmd->last_cycles = cycles_counter();
3096 }
3097
3098 /* Stop counting cycles and add them to the counter 'type' */
3099 static inline void
3100 cycles_count_end(struct dp_netdev_pmd_thread *pmd,
3101                  enum pmd_cycles_counter_type type)
3102     OVS_RELEASES(&cycles_counter_fake_mutex)
3103     OVS_NO_THREAD_SAFETY_ANALYSIS
3104 {
3105     unsigned long long interval = cycles_counter() - pmd->last_cycles;
3106
3107     non_atomic_ullong_add(&pmd->cycles.n[type], interval);
3108 }
3109
3110 /* Calculate the intermediate cycle result and add to the counter 'type' */
3111 static inline void
3112 cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
3113                           struct dp_netdev_rxq *rxq,
3114                           enum pmd_cycles_counter_type type)
3115     OVS_NO_THREAD_SAFETY_ANALYSIS
3116 {
3117     unsigned long long new_cycles = cycles_counter();
3118     unsigned long long interval = new_cycles - pmd->last_cycles;
3119     pmd->last_cycles = new_cycles;
3120
3121     non_atomic_ullong_add(&pmd->cycles.n[type], interval);
3122     if (rxq && (type == PMD_CYCLES_PROCESSING)) {
3123         /* Add to the amount of current processing cycles. */
3124         non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], interval);
3125     }
3126 }
3127
3128 static int
3129 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
3130                            struct netdev_rxq *rx,
3131                            odp_port_t port_no)
3132 {
3133     struct dp_packet_batch batch;
3134     int error;
3135     int batch_cnt = 0;
3136
3137     dp_packet_batch_init(&batch);
3138     error = netdev_rxq_recv(rx, &batch);
3139     if (!error) {
3140         *recirc_depth_get() = 0;
3141
3142         batch_cnt = batch.count;
3143         dp_netdev_input(pmd, &batch, port_no);
3144     } else if (error != EAGAIN && error != EOPNOTSUPP) {
3145         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3146
3147         VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
3148                     netdev_rxq_get_name(rx), ovs_strerror(error));
3149     }
3150
3151     return batch_cnt;
3152 }
3153
3154 static struct tx_port *
3155 tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
3156 {
3157     struct tx_port *tx;
3158
3159     HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
3160         if (tx->port->port_no == port_no) {
3161             return tx;
3162         }
3163     }
3164
3165     return NULL;
3166 }
3167
3168 static int
3169 port_reconfigure(struct dp_netdev_port *port)
3170 {
3171     struct netdev *netdev = port->netdev;
3172     int i, err;
3173
3174     port->need_reconfigure = false;
3175
3176     /* Closes the existing 'rxq's. */
3177     for (i = 0; i < port->n_rxq; i++) {
3178         netdev_rxq_close(port->rxqs[i].rx);
3179         port->rxqs[i].rx = NULL;
3180     }
3181     port->n_rxq = 0;
3182
3183     /* Allows 'netdev' to apply the pending configuration changes. */
3184     if (netdev_is_reconf_required(netdev)) {
3185         err = netdev_reconfigure(netdev);
3186         if (err && (err != EOPNOTSUPP)) {
3187             VLOG_ERR("Failed to set interface %s new configuration",
3188                      netdev_get_name(netdev));
3189             return err;
3190         }
3191     }
3192     /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
3193     port->rxqs = xrealloc(port->rxqs,
3194                           sizeof *port->rxqs * netdev_n_rxq(netdev));
3195     /* Realloc 'used' counters for tx queues. */
3196     free(port->txq_used);
3197     port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
3198
3199     for (i = 0; i < netdev_n_rxq(netdev); i++) {
3200         port->rxqs[i].port = port;
3201         err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
3202         if (err) {
3203             return err;
3204         }
3205         port->n_rxq++;
3206     }
3207
3208     /* Parse affinity list to apply configuration for new queues. */
3209     dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
3210
3211     return 0;
3212 }
3213
3214 struct rr_numa_list {
3215     struct hmap numas;  /* Contains 'struct rr_numa' */
3216 };
3217
3218 struct rr_numa {
3219     struct hmap_node node;
3220
3221     int numa_id;
3222
3223     /* Non isolated pmds on numa node 'numa_id' */
3224     struct dp_netdev_pmd_thread **pmds;
3225     int n_pmds;
3226
3227     int cur_index;
3228 };
3229
3230 static struct rr_numa *
3231 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
3232 {
3233     struct rr_numa *numa;
3234
3235     HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
3236         if (numa->numa_id == numa_id) {
3237             return numa;
3238         }
3239     }
3240
3241     return NULL;
3242 }
3243
3244 /* Returns the next node in numa list following 'numa' in round-robin fashion.
3245  * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
3246  * Returns NULL if 'rr' numa list is empty. */
3247 static struct rr_numa *
3248 rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
3249 {
3250     struct hmap_node *node = NULL;
3251
3252     if (numa) {
3253         node = hmap_next(&rr->numas, &numa->node);
3254     }
3255     if (!node) {
3256         node = hmap_first(&rr->numas);
3257     }
3258
3259     return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
3260 }
3261
3262 static void
3263 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
3264 {
3265     struct dp_netdev_pmd_thread *pmd;
3266     struct rr_numa *numa;
3267
3268     hmap_init(&rr->numas);
3269
3270     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3271         if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
3272             continue;
3273         }
3274
3275         numa = rr_numa_list_lookup(rr, pmd->numa_id);
3276         if (!numa) {
3277             numa = xzalloc(sizeof *numa);
3278             numa->numa_id = pmd->numa_id;
3279             hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
3280         }
3281         numa->n_pmds++;
3282         numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
3283         numa->pmds[numa->n_pmds - 1] = pmd;
3284     }
3285 }
3286
3287 static struct dp_netdev_pmd_thread *
3288 rr_numa_get_pmd(struct rr_numa *numa)
3289 {
3290     return numa->pmds[numa->cur_index++ % numa->n_pmds];
3291 }
3292
3293 static void
3294 rr_numa_list_destroy(struct rr_numa_list *rr)
3295 {
3296     struct rr_numa *numa;
3297
3298     HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
3299         free(numa->pmds);
3300         free(numa);
3301     }
3302     hmap_destroy(&rr->numas);
3303 }
3304
3305 /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
3306  * queues and marks the pmds as isolated.  Otherwise, assign non isolated
3307  * pmds to unpinned queues.
3308  *
3309  * The function doesn't touch the pmd threads, it just stores the assignment
3310  * in the 'pmd' member of each rxq. */
3311 static void
3312 rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
3313 {
3314     struct dp_netdev_port *port;
3315     struct rr_numa_list rr;
3316     struct rr_numa *non_local_numa = NULL;
3317
3318     rr_numa_list_populate(dp, &rr);
3319
3320     HMAP_FOR_EACH (port, node, &dp->ports) {
3321         struct rr_numa *numa;
3322         int numa_id;
3323
3324         if (!netdev_is_pmd(port->netdev)) {
3325             continue;
3326         }
3327
3328         numa_id = netdev_get_numa_id(port->netdev);
3329         numa = rr_numa_list_lookup(&rr, numa_id);
3330
3331         for (int qid = 0; qid < port->n_rxq; qid++) {
3332             struct dp_netdev_rxq *q = &port->rxqs[qid];
3333
3334             if (pinned && q->core_id != OVS_CORE_UNSPEC) {
3335                 struct dp_netdev_pmd_thread *pmd;
3336
3337                 pmd = dp_netdev_get_pmd(dp, q->core_id);
3338                 if (!pmd) {
3339                     VLOG_WARN("There is no PMD thread on core %d. Queue "
3340                               "%d on port \'%s\' will not be polled.",
3341                               q->core_id, qid, netdev_get_name(port->netdev));
3342                 } else {
3343                     q->pmd = pmd;
3344                     pmd->isolated = true;
3345                     dp_netdev_pmd_unref(pmd);
3346                 }
3347             } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
3348                 if (!numa) {
3349                     /* There are no pmds on the queue's local NUMA node.
3350                        Round-robin on the NUMA nodes that do have pmds. */
3351                     non_local_numa = rr_numa_list_next(&rr, non_local_numa);
3352                     if (!non_local_numa) {
3353                         VLOG_ERR("There is no available (non-isolated) pmd "
3354                                  "thread for port \'%s\' queue %d. This queue "
3355                                  "will not be polled. Is pmd-cpu-mask set to "
3356                                  "zero? Or are all PMDs isolated to other "
3357                                  "queues?", netdev_get_name(port->netdev),
3358                                  qid);
3359                         continue;
3360                     }
3361                     q->pmd = rr_numa_get_pmd(non_local_numa);
3362                     VLOG_WARN("There's no available (non-isolated) pmd thread "
3363                               "on numa node %d. Queue %d on port \'%s\' will "
3364                               "be assigned to the pmd on core %d "
3365                               "(numa node %d). Expect reduced performance.",
3366                               numa_id, qid, netdev_get_name(port->netdev),
3367                               q->pmd->core_id, q->pmd->numa_id);
3368                 } else {
3369                     /* Assign queue to the next (round-robin) PMD on it's local
3370                        NUMA node. */
3371                     q->pmd = rr_numa_get_pmd(numa);
3372                 }
3373             }
3374         }
3375     }
3376
3377     rr_numa_list_destroy(&rr);
3378 }
3379
3380 static void
3381 reload_affected_pmds(struct dp_netdev *dp)
3382 {
3383     struct dp_netdev_pmd_thread *pmd;
3384
3385     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3386         if (pmd->need_reload) {
3387             dp_netdev_reload_pmd__(pmd);
3388             pmd->need_reload = false;
3389         }
3390     }
3391 }
3392
3393 static void
3394 reconfigure_pmd_threads(struct dp_netdev *dp)
3395     OVS_REQUIRES(dp->port_mutex)
3396 {
3397     struct dp_netdev_pmd_thread *pmd;
3398     struct ovs_numa_dump *pmd_cores;
3399     struct ovs_numa_info_core *core;
3400     struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
3401     struct hmapx_node *node;
3402     bool changed = false;
3403     bool need_to_adjust_static_tx_qids = false;
3404
3405     /* The pmd threads should be started only if there's a pmd port in the
3406      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
3407      * NR_PMD_THREADS per numa node. */
3408     if (!has_pmd_port(dp)) {
3409         pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
3410     } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
3411         pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
3412     } else {
3413         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
3414     }
3415
3416     /* We need to adjust 'static_tx_qid's only if we're reducing number of
3417      * PMD threads. Otherwise, new threads will allocate all the freed ids. */
3418     if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
3419         /* Adjustment is required to keep 'static_tx_qid's sequential and
3420          * avoid possible issues, for example, imbalanced tx queue usage
3421          * and unnecessary locking caused by remapping on netdev level. */
3422         need_to_adjust_static_tx_qids = true;
3423     }
3424
3425     /* Check for unwanted pmd threads */
3426     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3427         if (pmd->core_id == NON_PMD_CORE_ID) {
3428             continue;
3429         }
3430         if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
3431                                                     pmd->core_id)) {
3432             hmapx_add(&to_delete, pmd);
3433         } else if (need_to_adjust_static_tx_qids) {
3434             pmd->need_reload = true;
3435         }
3436     }
3437
3438     HMAPX_FOR_EACH (node, &to_delete) {
3439         pmd = (struct dp_netdev_pmd_thread *) node->data;
3440         VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
3441                   pmd->numa_id, pmd->core_id);
3442         dp_netdev_del_pmd(dp, pmd);
3443     }
3444     changed = !hmapx_is_empty(&to_delete);
3445     hmapx_destroy(&to_delete);
3446
3447     if (need_to_adjust_static_tx_qids) {
3448         /* 'static_tx_qid's are not sequential now.
3449          * Reload remaining threads to fix this. */
3450         reload_affected_pmds(dp);
3451     }
3452
3453     /* Check for required new pmd threads */
3454     FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
3455         pmd = dp_netdev_get_pmd(dp, core->core_id);
3456         if (!pmd) {
3457             pmd = xzalloc(sizeof *pmd);
3458             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
3459             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
3460             VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
3461                       pmd->numa_id, pmd->core_id);
3462             changed = true;
3463         } else {
3464             dp_netdev_pmd_unref(pmd);
3465         }
3466     }
3467
3468     if (changed) {
3469         struct ovs_numa_info_numa *numa;
3470
3471         /* Log the number of pmd threads per numa node. */
3472         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
3473             VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
3474                       numa->n_cores, numa->numa_id);
3475         }
3476     }
3477
3478     ovs_numa_dump_destroy(pmd_cores);
3479 }
3480
3481 static void
3482 pmd_remove_stale_ports(struct dp_netdev *dp,
3483                        struct dp_netdev_pmd_thread *pmd)
3484     OVS_EXCLUDED(pmd->port_mutex)
3485     OVS_REQUIRES(dp->port_mutex)
3486 {
3487     struct rxq_poll *poll, *poll_next;
3488     struct tx_port *tx, *tx_next;
3489
3490     ovs_mutex_lock(&pmd->port_mutex);
3491     HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
3492         struct dp_netdev_port *port = poll->rxq->port;
3493
3494         if (port->need_reconfigure
3495             || !hmap_contains(&dp->ports, &port->node)) {
3496             dp_netdev_del_rxq_from_pmd(pmd, poll);
3497         }
3498     }
3499     HMAP_FOR_EACH_SAFE (tx, tx_next, node, &pmd->tx_ports) {
3500         struct dp_netdev_port *port = tx->port;
3501
3502         if (port->need_reconfigure
3503             || !hmap_contains(&dp->ports, &port->node)) {
3504             dp_netdev_del_port_tx_from_pmd(pmd, tx);
3505         }
3506     }
3507     ovs_mutex_unlock(&pmd->port_mutex);
3508 }
3509
3510 /* Must be called each time a port is added/removed or the cmask changes.
3511  * This creates and destroys pmd threads, reconfigures ports, opens their
3512  * rxqs and assigns all rxqs/txqs to pmd threads. */
3513 static void
3514 reconfigure_datapath(struct dp_netdev *dp)
3515     OVS_REQUIRES(dp->port_mutex)
3516 {
3517     struct dp_netdev_pmd_thread *pmd;
3518     struct dp_netdev_port *port;
3519     int wanted_txqs;
3520
3521     dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
3522
3523     /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
3524      * on the system and the user configuration. */
3525     reconfigure_pmd_threads(dp);
3526
3527     wanted_txqs = cmap_count(&dp->poll_threads);
3528
3529     /* The number of pmd threads might have changed, or a port can be new:
3530      * adjust the txqs. */
3531     HMAP_FOR_EACH (port, node, &dp->ports) {
3532         netdev_set_tx_multiq(port->netdev, wanted_txqs);
3533     }
3534
3535     /* Step 2: Remove from the pmd threads ports that have been removed or
3536      * need reconfiguration. */
3537
3538     /* Check for all the ports that need reconfiguration.  We cache this in
3539      * 'port->need_reconfigure', because netdev_is_reconf_required() can
3540      * change at any time. */
3541     HMAP_FOR_EACH (port, node, &dp->ports) {
3542         if (netdev_is_reconf_required(port->netdev)) {
3543             port->need_reconfigure = true;
3544         }
3545     }
3546
3547     /* Remove from the pmd threads all the ports that have been deleted or
3548      * need reconfiguration. */
3549     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3550         pmd_remove_stale_ports(dp, pmd);
3551     }
3552
3553     /* Reload affected pmd threads.  We must wait for the pmd threads before
3554      * reconfiguring the ports, because a port cannot be reconfigured while
3555      * it's being used. */
3556     reload_affected_pmds(dp);
3557
3558     /* Step 3: Reconfigure ports. */
3559
3560     /* We only reconfigure the ports that we determined above, because they're
3561      * not being used by any pmd thread at the moment.  If a port fails to
3562      * reconfigure we remove it from the datapath. */
3563     struct dp_netdev_port *next_port;
3564     HMAP_FOR_EACH_SAFE (port, next_port, node, &dp->ports) {
3565         int err;
3566
3567         if (!port->need_reconfigure) {
3568             continue;
3569         }
3570
3571         err = port_reconfigure(port);
3572         if (err) {
3573             hmap_remove(&dp->ports, &port->node);
3574             seq_change(dp->port_seq);
3575             port_destroy(port);
3576         } else {
3577             port->dynamic_txqs = netdev_n_txq(port->netdev) < wanted_txqs;
3578         }
3579     }
3580
3581     /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
3582      * for now, we just update the 'pmd' pointer in each rxq to point to the
3583      * wanted thread according to the scheduling policy. */
3584
3585     /* Reset all the pmd threads to non isolated. */
3586     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3587         pmd->isolated = false;
3588     }
3589
3590     /* Reset all the queues to unassigned */
3591     HMAP_FOR_EACH (port, node, &dp->ports) {
3592         for (int i = 0; i < port->n_rxq; i++) {
3593             port->rxqs[i].pmd = NULL;
3594         }
3595     }
3596
3597     /* Add pinned queues and mark pmd threads isolated. */
3598     rxq_scheduling(dp, true);
3599
3600     /* Add non-pinned queues. */
3601     rxq_scheduling(dp, false);
3602
3603     /* Step 5: Remove queues not compliant with new scheduling. */
3604     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3605         struct rxq_poll *poll, *poll_next;
3606
3607         ovs_mutex_lock(&pmd->port_mutex);
3608         HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
3609             if (poll->rxq->pmd != pmd) {
3610                 dp_netdev_del_rxq_from_pmd(pmd, poll);
3611             }
3612         }
3613         ovs_mutex_unlock(&pmd->port_mutex);
3614     }
3615
3616     /* Reload affected pmd threads.  We must wait for the pmd threads to remove
3617      * the old queues before readding them, otherwise a queue can be polled by
3618      * two threads at the same time. */
3619     reload_affected_pmds(dp);
3620
3621     /* Step 6: Add queues from scheduling, if they're not there already. */
3622     HMAP_FOR_EACH (port, node, &dp->ports) {
3623         if (!netdev_is_pmd(port->netdev)) {
3624             continue;
3625         }
3626
3627         for (int qid = 0; qid < port->n_rxq; qid++) {
3628             struct dp_netdev_rxq *q = &port->rxqs[qid];
3629
3630             if (q->pmd) {
3631                 ovs_mutex_lock(&q->pmd->port_mutex);
3632                 dp_netdev_add_rxq_to_pmd(q->pmd, q);
3633                 ovs_mutex_unlock(&q->pmd->port_mutex);
3634             }
3635         }
3636     }
3637
3638     /* Add every port to the tx cache of every pmd thread, if it's not
3639      * there already and if this pmd has at least one rxq to poll. */
3640     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3641         ovs_mutex_lock(&pmd->port_mutex);
3642         if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
3643             HMAP_FOR_EACH (port, node, &dp->ports) {
3644                 dp_netdev_add_port_tx_to_pmd(pmd, port);
3645             }
3646         }
3647         ovs_mutex_unlock(&pmd->port_mutex);
3648     }
3649
3650     /* Reload affected pmd threads. */
3651     reload_affected_pmds(dp);
3652 }
3653
3654 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
3655 static bool
3656 ports_require_restart(const struct dp_netdev *dp)
3657     OVS_REQUIRES(dp->port_mutex)
3658 {
3659     struct dp_netdev_port *port;
3660
3661     HMAP_FOR_EACH (port, node, &dp->ports) {
3662         if (netdev_is_reconf_required(port->netdev)) {
3663             return true;
3664         }
3665     }
3666
3667     return false;
3668 }
3669
3670 /* Return true if needs to revalidate datapath flows. */
3671 static bool
3672 dpif_netdev_run(struct dpif *dpif)
3673 {
3674     struct dp_netdev_port *port;
3675     struct dp_netdev *dp = get_dp_netdev(dpif);
3676     struct dp_netdev_pmd_thread *non_pmd;
3677     uint64_t new_tnl_seq;
3678     int process_packets = 0;
3679
3680     ovs_mutex_lock(&dp->port_mutex);
3681     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3682     if (non_pmd) {
3683         ovs_mutex_lock(&dp->non_pmd_mutex);
3684         cycles_count_start(non_pmd);
3685         HMAP_FOR_EACH (port, node, &dp->ports) {
3686             if (!netdev_is_pmd(port->netdev)) {
3687                 int i;
3688
3689                 for (i = 0; i < port->n_rxq; i++) {
3690                     process_packets =
3691                         dp_netdev_process_rxq_port(non_pmd,
3692                                                    port->rxqs[i].rx,
3693                                                    port->port_no);
3694                     cycles_count_intermediate(non_pmd, NULL, process_packets ?
3695                                                        PMD_CYCLES_PROCESSING
3696                                                      : PMD_CYCLES_IDLE);
3697                 }
3698             }
3699         }
3700         cycles_count_end(non_pmd, PMD_CYCLES_IDLE);
3701         dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
3702         ovs_mutex_unlock(&dp->non_pmd_mutex);
3703
3704         dp_netdev_pmd_unref(non_pmd);
3705     }
3706
3707     if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
3708         reconfigure_datapath(dp);
3709     }
3710     ovs_mutex_unlock(&dp->port_mutex);
3711
3712     tnl_neigh_cache_run();
3713     tnl_port_map_run();
3714     new_tnl_seq = seq_read(tnl_conf_seq);
3715
3716     if (dp->last_tnl_conf_seq != new_tnl_seq) {
3717         dp->last_tnl_conf_seq = new_tnl_seq;
3718         return true;
3719     }
3720     return false;
3721 }
3722
3723 static void
3724 dpif_netdev_wait(struct dpif *dpif)
3725 {
3726     struct dp_netdev_port *port;
3727     struct dp_netdev *dp = get_dp_netdev(dpif);
3728
3729     ovs_mutex_lock(&dp_netdev_mutex);
3730     ovs_mutex_lock(&dp->port_mutex);
3731     HMAP_FOR_EACH (port, node, &dp->ports) {
3732         netdev_wait_reconf_required(port->netdev);
3733         if (!netdev_is_pmd(port->netdev)) {
3734             int i;
3735
3736             for (i = 0; i < port->n_rxq; i++) {
3737                 netdev_rxq_wait(port->rxqs[i].rx);
3738             }
3739         }
3740     }
3741     ovs_mutex_unlock(&dp->port_mutex);
3742     ovs_mutex_unlock(&dp_netdev_mutex);
3743     seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
3744 }
3745
3746 static void
3747 pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
3748 {
3749     struct tx_port *tx_port_cached;
3750
3751     /* Free all used tx queue ids. */
3752     dpif_netdev_xps_revalidate_pmd(pmd, 0, true);
3753
3754     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
3755         free(tx_port_cached);
3756     }
3757     HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
3758         free(tx_port_cached);
3759     }
3760 }
3761
3762 /* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
3763  * 'pmd->port_cache' (thread local) */
3764 static void
3765 pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
3766     OVS_REQUIRES(pmd->port_mutex)
3767 {
3768     struct tx_port *tx_port, *tx_port_cached;
3769
3770     pmd_free_cached_ports(pmd);
3771     hmap_shrink(&pmd->send_port_cache);
3772     hmap_shrink(&pmd->tnl_port_cache);
3773
3774     HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
3775         if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
3776             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
3777             hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
3778                         hash_port_no(tx_port_cached->port->port_no));
3779         }
3780
3781         if (netdev_n_txq(tx_port->port->netdev)) {
3782             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
3783             hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
3784                         hash_port_no(tx_port_cached->port->port_no));
3785         }
3786     }
3787 }
3788
3789 static void
3790 pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
3791 {
3792     ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
3793     if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
3794         VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
3795                    ", numa_id %d.", pmd->core_id, pmd->numa_id);
3796     }
3797     ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
3798
3799     VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
3800              ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
3801 }
3802
3803 static void
3804 pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
3805 {
3806     ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
3807     id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
3808     ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
3809 }
3810
3811 static int
3812 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
3813                           struct polled_queue **ppoll_list)
3814 {
3815     struct polled_queue *poll_list = *ppoll_list;
3816     struct rxq_poll *poll;
3817     int i;
3818
3819     ovs_mutex_lock(&pmd->port_mutex);
3820     poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
3821                                     * sizeof *poll_list);
3822
3823     i = 0;
3824     HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
3825         poll_list[i].rxq = poll->rxq;
3826         poll_list[i].port_no = poll->rxq->port->port_no;
3827         i++;
3828     }
3829
3830     pmd_load_cached_ports(pmd);
3831
3832     ovs_mutex_unlock(&pmd->port_mutex);
3833
3834     *ppoll_list = poll_list;
3835     return i;
3836 }
3837
3838 static void *
3839 pmd_thread_main(void *f_)
3840 {
3841     struct dp_netdev_pmd_thread *pmd = f_;
3842     unsigned int lc = 0;
3843     struct polled_queue *poll_list;
3844     bool exiting;
3845     int poll_cnt;
3846     int i;
3847     int process_packets = 0;
3848
3849     poll_list = NULL;
3850
3851     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
3852     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
3853     ovs_numa_thread_setaffinity_core(pmd->core_id);
3854     dpdk_set_lcore_id(pmd->core_id);
3855     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
3856     emc_cache_init(&pmd->flow_cache);
3857 reload:
3858     pmd_alloc_static_tx_qid(pmd);
3859
3860     /* List port/core affinity */
3861     for (i = 0; i < poll_cnt; i++) {
3862        VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
3863                 pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
3864                 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
3865     }
3866
3867     if (!poll_cnt) {
3868         while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
3869             seq_wait(pmd->reload_seq, pmd->last_reload_seq);
3870             poll_block();
3871         }
3872         lc = UINT_MAX;
3873     }
3874
3875     cycles_count_start(pmd);
3876     for (;;) {
3877         for (i = 0; i < poll_cnt; i++) {
3878             process_packets =
3879                 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
3880                                            poll_list[i].port_no);
3881             cycles_count_intermediate(pmd, NULL,
3882                                       process_packets ? PMD_CYCLES_PROCESSING
3883                                                       : PMD_CYCLES_IDLE);
3884         }
3885
3886         if (lc++ > 1024) {
3887             bool reload;
3888
3889             lc = 0;
3890
3891             coverage_try_clear();
3892             dp_netdev_pmd_try_optimize(pmd);
3893             if (!ovsrcu_try_quiesce()) {
3894                 emc_cache_slow_sweep(&pmd->flow_cache);
3895             }
3896
3897             atomic_read_relaxed(&pmd->reload, &reload);
3898             if (reload) {
3899                 break;
3900             }
3901         }
3902     }
3903
3904     cycles_count_end(pmd, PMD_CYCLES_IDLE);
3905
3906     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
3907     exiting = latch_is_set(&pmd->exit_latch);
3908     /* Signal here to make sure the pmd finishes
3909      * reloading the updated configuration. */
3910     dp_netdev_pmd_reload_done(pmd);
3911
3912     pmd_free_static_tx_qid(pmd);
3913
3914     if (!exiting) {
3915         goto reload;
3916     }
3917
3918     emc_cache_uninit(&pmd->flow_cache);
3919     free(poll_list);
3920     pmd_free_cached_ports(pmd);
3921     return NULL;
3922 }
3923
3924 static void
3925 dp_netdev_disable_upcall(struct dp_netdev *dp)
3926     OVS_ACQUIRES(dp->upcall_rwlock)
3927 {
3928     fat_rwlock_wrlock(&dp->upcall_rwlock);
3929 }
3930
3931 \f
3932 /* Meters */
3933 static void
3934 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
3935                                struct ofputil_meter_features *features)
3936 {
3937     features->max_meters = MAX_METERS;
3938     features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
3939     features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
3940     features->max_bands = MAX_BANDS;
3941     features->max_color = 0;
3942 }
3943
3944 /* Returns false when packet needs to be dropped. */
3945 static void
3946 dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
3947                     uint32_t meter_id, long long int now)
3948 {
3949     struct dp_meter *meter;
3950     struct dp_meter_band *band;
3951     long long int long_delta_t; /* msec */
3952     uint32_t delta_t; /* msec */
3953     int i;
3954     int cnt = packets_->count;
3955     uint32_t bytes, volume;
3956     int exceeded_band[NETDEV_MAX_BURST];
3957     uint32_t exceeded_rate[NETDEV_MAX_BURST];
3958     int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */
3959
3960     if (meter_id >= MAX_METERS) {
3961         return;
3962     }
3963
3964     meter_lock(dp, meter_id);
3965     meter = dp->meters[meter_id];
3966     if (!meter) {
3967         goto out;
3968     }
3969
3970     /* Initialize as negative values. */
3971     memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
3972     /* Initialize as zeroes. */
3973     memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
3974
3975     /* All packets will hit the meter at the same time. */
3976     long_delta_t = (now - meter->used); /* msec */
3977
3978     /* Make sure delta_t will not be too large, so that bucket will not
3979      * wrap around below. */
3980     delta_t = (long_delta_t > (long long int)meter->max_delta_t)
3981         ? meter->max_delta_t : (uint32_t)long_delta_t;
3982
3983     /* Update meter stats. */
3984     meter->used = now;
3985     meter->packet_count += cnt;
3986     bytes = 0;
3987     for (i = 0; i < cnt; i++) {
3988         bytes += dp_packet_size(packets_->packets[i]);
3989     }
3990     meter->byte_count += bytes;
3991
3992     /* Meters can operate in terms of packets per second or kilobits per
3993      * second. */
3994     if (meter->flags & OFPMF13_PKTPS) {
3995         /* Rate in packets/second, bucket 1/1000 packets. */
3996         /* msec * packets/sec = 1/1000 packets. */
3997         volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
3998     } else {
3999         /* Rate in kbps, bucket in bits. */
4000         /* msec * kbps = bits */
4001         volume = bytes * 8;
4002     }
4003
4004     /* Update all bands and find the one hit with the highest rate for each
4005      * packet (if any). */
4006     for (int m = 0; m < meter->n_bands; ++m) {
4007         band = &meter->bands[m];
4008
4009         /* Update band's bucket. */
4010         band->bucket += delta_t * band->up.rate;
4011         if (band->bucket > band->up.burst_size) {
4012             band->bucket = band->up.burst_size;
4013         }
4014
4015         /* Drain the bucket for all the packets, if possible. */
4016         if (band->bucket >= volume) {
4017             band->bucket -= volume;
4018         } else {
4019             int band_exceeded_pkt;
4020
4021             /* Band limit hit, must process packet-by-packet. */
4022             if (meter->flags & OFPMF13_PKTPS) {
4023                 band_exceeded_pkt = band->bucket / 1000;
4024                 band->bucket %= 1000; /* Remainder stays in bucket. */
4025
4026                 /* Update the exceeding band for each exceeding packet.
4027                  * (Only one band will be fired by a packet, and that
4028                  * can be different for each packet.) */
4029                 for (i = band_exceeded_pkt; i < cnt; i++) {
4030                     if (band->up.rate > exceeded_rate[i]) {
4031                         exceeded_rate[i] = band->up.rate;
4032                         exceeded_band[i] = m;
4033                     }
4034                 }
4035             } else {
4036                 /* Packet sizes differ, must process one-by-one. */
4037                 band_exceeded_pkt = cnt;
4038                 for (i = 0; i < cnt; i++) {
4039                     uint32_t bits = dp_packet_size(packets_->packets[i]) * 8;
4040
4041                     if (band->bucket >= bits) {
4042                         band->bucket -= bits;
4043                     } else {
4044                         if (i < band_exceeded_pkt) {
4045                             band_exceeded_pkt = i;
4046                         }
4047                         /* Update the exceeding band for the exceeding packet.
4048                          * (Only one band will be fired by a packet, and that
4049                          * can be different for each packet.) */
4050                         if (band->up.rate > exceeded_rate[i]) {
4051                             exceeded_rate[i] = band->up.rate;
4052                             exceeded_band[i] = m;
4053                         }
4054                     }
4055                 }
4056             }
4057             /* Remember the first exceeding packet. */
4058             if (exceeded_pkt > band_exceeded_pkt) {
4059                 exceeded_pkt = band_exceeded_pkt;
4060             }
4061         }
4062     }
4063
4064     /* Fire the highest rate band exceeded by each packet.
4065      * Drop packets if needed, by swapping packet to the end that will be
4066      * ignored. */
4067     const size_t size = dp_packet_batch_size(packets_);
4068     struct dp_packet *packet;
4069     size_t j;
4070     DP_PACKET_BATCH_REFILL_FOR_EACH (j, size, packet, packets_) {
4071         if (exceeded_band[j] >= 0) {
4072             /* Meter drop packet. */
4073             band = &meter->bands[exceeded_band[j]];
4074             band->packet_count += 1;
4075             band->byte_count += dp_packet_size(packet);
4076
4077             dp_packet_delete(packet);
4078         } else {
4079             /* Meter accepts packet. */
4080             dp_packet_batch_refill(packets_, packet, j);
4081         }
4082     }
4083  out:
4084     meter_unlock(dp, meter_id);
4085 }
4086
4087 /* Meter set/get/del processing is still single-threaded. */
4088 static int
4089 dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
4090                       struct ofputil_meter_config *config)
4091 {
4092     struct dp_netdev *dp = get_dp_netdev(dpif);
4093     uint32_t mid = meter_id->uint32;
4094     struct dp_meter *meter;
4095     int i;
4096
4097     if (mid >= MAX_METERS) {
4098         return EFBIG; /* Meter_id out of range. */
4099     }
4100
4101     if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK ||
4102         !(config->flags & (OFPMF13_KBPS | OFPMF13_PKTPS))) {
4103         return EBADF; /* Unsupported flags set */
4104     }
4105     /* Validate bands */
4106     if (config->n_bands == 0 || config->n_bands > MAX_BANDS) {
4107         return EINVAL; /* Too many bands */
4108     }
4109     for (i = 0; i < config->n_bands; ++i) {
4110         switch (config->bands[i].type) {
4111         case OFPMBT13_DROP:
4112             break;
4113         default:
4114             return ENODEV; /* Unsupported band type */
4115         }
4116     }
4117
4118     /* Allocate meter */
4119     meter = xzalloc(sizeof *meter
4120                     + config->n_bands * sizeof(struct dp_meter_band));
4121     if (meter) {
4122         meter->flags = config->flags;
4123         meter->n_bands = config->n_bands;
4124         meter->max_delta_t = 0;
4125         meter->used = time_msec();
4126
4127         /* set up bands */
4128         for (i = 0; i < config->n_bands; ++i) {
4129             uint32_t band_max_delta_t;
4130
4131             /* Set burst size to a workable value if none specified. */
4132             if (config->bands[i].burst_size == 0) {
4133                 config->bands[i].burst_size = config->bands[i].rate;
4134             }
4135
4136             meter->bands[i].up = config->bands[i];
4137             /* Convert burst size to the bucket units: */
4138             /* pkts => 1/1000 packets, kilobits => bits. */
4139             meter->bands[i].up.burst_size *= 1000;
4140             /* Initialize bucket to empty. */
4141             meter->bands[i].bucket = 0;
4142
4143             /* Figure out max delta_t that is enough to fill any bucket. */
4144             band_max_delta_t
4145                 = meter->bands[i].up.burst_size / meter->bands[i].up.rate;
4146             if (band_max_delta_t > meter->max_delta_t) {
4147                 meter->max_delta_t = band_max_delta_t;
4148             }
4149         }
4150
4151         meter_lock(dp, mid);
4152         dp_delete_meter(dp, mid); /* Free existing meter, if any */
4153         dp->meters[mid] = meter;
4154         meter_unlock(dp, mid);
4155
4156         return 0;
4157     }
4158     return ENOMEM;
4159 }
4160
4161 static int
4162 dpif_netdev_meter_get(const struct dpif *dpif,
4163                       ofproto_meter_id meter_id_,
4164                       struct ofputil_meter_stats *stats, uint16_t n_bands)
4165 {
4166     const struct dp_netdev *dp = get_dp_netdev(dpif);
4167     const struct dp_meter *meter;
4168     uint32_t meter_id = meter_id_.uint32;
4169
4170     if (meter_id >= MAX_METERS) {
4171         return EFBIG;
4172     }
4173     meter = dp->meters[meter_id];
4174     if (!meter) {
4175         return ENOENT;
4176     }
4177     if (stats) {
4178         int i = 0;
4179
4180         meter_lock(dp, meter_id);
4181         stats->packet_in_count = meter->packet_count;
4182         stats->byte_in_count = meter->byte_count;
4183
4184         for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
4185             stats->bands[i].packet_count = meter->bands[i].packet_count;
4186             stats->bands[i].byte_count = meter->bands[i].byte_count;
4187         }
4188         meter_unlock(dp, meter_id);
4189
4190         stats->n_bands = i;
4191     }
4192     return 0;
4193 }
4194
4195 static int
4196 dpif_netdev_meter_del(struct dpif *dpif,
4197                       ofproto_meter_id meter_id_,
4198                       struct ofputil_meter_stats *stats, uint16_t n_bands)
4199 {
4200     struct dp_netdev *dp = get_dp_netdev(dpif);
4201     int error;
4202
4203     error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
4204     if (!error) {
4205         uint32_t meter_id = meter_id_.uint32;
4206
4207         meter_lock(dp, meter_id);
4208         dp_delete_meter(dp, meter_id);
4209         meter_unlock(dp, meter_id);
4210     }
4211     return error;
4212 }
4213
4214 \f
4215 static void
4216 dpif_netdev_disable_upcall(struct dpif *dpif)
4217     OVS_NO_THREAD_SAFETY_ANALYSIS
4218 {
4219     struct dp_netdev *dp = get_dp_netdev(dpif);
4220     dp_netdev_disable_upcall(dp);
4221 }
4222
4223 static void
4224 dp_netdev_enable_upcall(struct dp_netdev *dp)
4225     OVS_RELEASES(dp->upcall_rwlock)
4226 {
4227     fat_rwlock_unlock(&dp->upcall_rwlock);
4228 }
4229
4230 static void
4231 dpif_netdev_enable_upcall(struct dpif *dpif)
4232     OVS_NO_THREAD_SAFETY_ANALYSIS
4233 {
4234     struct dp_netdev *dp = get_dp_netdev(dpif);
4235     dp_netdev_enable_upcall(dp);
4236 }
4237
4238 static void
4239 dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
4240 {
4241     ovs_mutex_lock(&pmd->cond_mutex);
4242     atomic_store_relaxed(&pmd->reload, false);
4243     pmd->last_reload_seq = seq_read(pmd->reload_seq);
4244     xpthread_cond_signal(&pmd->cond);
4245     ovs_mutex_unlock(&pmd->cond_mutex);
4246 }
4247
4248 /* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
4249  * the pointer if succeeds, otherwise, NULL (it can return NULL even if
4250  * 'core_id' is NON_PMD_CORE_ID).
4251  *
4252  * Caller must unrefs the returned reference.  */
4253 static struct dp_netdev_pmd_thread *
4254 dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
4255 {
4256     struct dp_netdev_pmd_thread *pmd;
4257     const struct cmap_node *pnode;
4258
4259     pnode = cmap_find(&dp->poll_threads, hash_int(core_id, 0));
4260     if (!pnode) {
4261         return NULL;
4262     }
4263     pmd = CONTAINER_OF(pnode, struct dp_netdev_pmd_thread, node);
4264
4265     return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
4266 }
4267
4268 /* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
4269 static void
4270 dp_netdev_set_nonpmd(struct dp_netdev *dp)
4271     OVS_REQUIRES(dp->port_mutex)
4272 {
4273     struct dp_netdev_pmd_thread *non_pmd;
4274
4275     non_pmd = xzalloc(sizeof *non_pmd);
4276     dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
4277 }
4278
4279 /* Caller must have valid pointer to 'pmd'. */
4280 static bool
4281 dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
4282 {
4283     return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
4284 }
4285
4286 static void
4287 dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
4288 {
4289     if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
4290         ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
4291     }
4292 }
4293
4294 /* Given cmap position 'pos', tries to ref the next node.  If try_ref()
4295  * fails, keeps checking for next node until reaching the end of cmap.
4296  *
4297  * Caller must unrefs the returned reference. */
4298 static struct dp_netdev_pmd_thread *
4299 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
4300 {
4301     struct dp_netdev_pmd_thread *next;
4302
4303     do {
4304         struct cmap_node *node;
4305
4306         node = cmap_next_position(&dp->poll_threads, pos);
4307         next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
4308             : NULL;
4309     } while (next && !dp_netdev_pmd_try_ref(next));
4310
4311     return next;
4312 }
4313
4314 /* Configures the 'pmd' based on the input argument. */
4315 static void
4316 dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
4317                         unsigned core_id, int numa_id)
4318 {
4319     pmd->dp = dp;
4320     pmd->core_id = core_id;
4321     pmd->numa_id = numa_id;
4322     pmd->need_reload = false;
4323
4324     ovs_refcount_init(&pmd->ref_cnt);
4325     latch_init(&pmd->exit_latch);
4326     pmd->reload_seq = seq_create();
4327     pmd->last_reload_seq = seq_read(pmd->reload_seq);
4328     atomic_init(&pmd->reload, false);
4329     xpthread_cond_init(&pmd->cond, NULL);
4330     ovs_mutex_init(&pmd->cond_mutex);
4331     ovs_mutex_init(&pmd->flow_mutex);
4332     ovs_mutex_init(&pmd->port_mutex);
4333     cmap_init(&pmd->flow_table);
4334     cmap_init(&pmd->classifiers);
4335     pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
4336     hmap_init(&pmd->poll_list);
4337     hmap_init(&pmd->tx_ports);
4338     hmap_init(&pmd->tnl_port_cache);
4339     hmap_init(&pmd->send_port_cache);
4340     /* init the 'flow_cache' since there is no
4341      * actual thread created for NON_PMD_CORE_ID. */
4342     if (core_id == NON_PMD_CORE_ID) {
4343         emc_cache_init(&pmd->flow_cache);
4344         pmd_alloc_static_tx_qid(pmd);
4345     }
4346     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
4347                 hash_int(core_id, 0));
4348 }
4349
4350 static void
4351 dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
4352 {
4353     struct dpcls *cls;
4354
4355     dp_netdev_pmd_flow_flush(pmd);
4356     hmap_destroy(&pmd->send_port_cache);
4357     hmap_destroy(&pmd->tnl_port_cache);
4358     hmap_destroy(&pmd->tx_ports);
4359     hmap_destroy(&pmd->poll_list);
4360     /* All flows (including their dpcls_rules) have been deleted already */
4361     CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
4362         dpcls_destroy(cls);
4363         ovsrcu_postpone(free, cls);
4364     }
4365     cmap_destroy(&pmd->classifiers);
4366     cmap_destroy(&pmd->flow_table);
4367     ovs_mutex_destroy(&pmd->flow_mutex);
4368     latch_destroy(&pmd->exit_latch);
4369     seq_destroy(pmd->reload_seq);
4370     xpthread_cond_destroy(&pmd->cond);
4371     ovs_mutex_destroy(&pmd->cond_mutex);
4372     ovs_mutex_destroy(&pmd->port_mutex);
4373     free(pmd);
4374 }
4375
4376 /* Stops the pmd thread, removes it from the 'dp->poll_threads',
4377  * and unrefs the struct. */
4378 static void
4379 dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
4380 {
4381     /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
4382      * but extra cleanup is necessary */
4383     if (pmd->core_id == NON_PMD_CORE_ID) {
4384         ovs_mutex_lock(&dp->non_pmd_mutex);
4385         emc_cache_uninit(&pmd->flow_cache);
4386         pmd_free_cached_ports(pmd);
4387         pmd_free_static_tx_qid(pmd);
4388         ovs_mutex_unlock(&dp->non_pmd_mutex);
4389     } else {
4390         latch_set(&pmd->exit_latch);
4391         dp_netdev_reload_pmd__(pmd);
4392         xpthread_join(pmd->thread, NULL);
4393     }
4394
4395     dp_netdev_pmd_clear_ports(pmd);
4396
4397     /* Purges the 'pmd''s flows after stopping the thread, but before
4398      * destroying the flows, so that the flow stats can be collected. */
4399     if (dp->dp_purge_cb) {
4400         dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
4401     }
4402     cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
4403     dp_netdev_pmd_unref(pmd);
4404 }
4405
4406 /* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
4407  * thread. */
4408 static void
4409 dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
4410 {
4411     struct dp_netdev_pmd_thread *pmd;
4412     struct dp_netdev_pmd_thread **pmd_list;
4413     size_t k = 0, n_pmds;
4414
4415     n_pmds = cmap_count(&dp->poll_threads);
4416     pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
4417
4418     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4419         if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
4420             continue;
4421         }
4422         /* We cannot call dp_netdev_del_pmd(), since it alters
4423          * 'dp->poll_threads' (while we're iterating it) and it
4424          * might quiesce. */
4425         ovs_assert(k < n_pmds);
4426         pmd_list[k++] = pmd;
4427     }
4428
4429     for (size_t i = 0; i < k; i++) {
4430         dp_netdev_del_pmd(dp, pmd_list[i]);
4431     }
4432     free(pmd_list);
4433 }
4434
4435 /* Deletes all rx queues from pmd->poll_list and all the ports from
4436  * pmd->tx_ports. */
4437 static void
4438 dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
4439 {
4440     struct rxq_poll *poll;
4441     struct tx_port *port;
4442
4443     ovs_mutex_lock(&pmd->port_mutex);
4444     HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
4445         free(poll);
4446     }
4447     HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
4448         free(port);
4449     }
4450     ovs_mutex_unlock(&pmd->port_mutex);
4451 }
4452
4453 /* Adds rx queue to poll_list of PMD thread, if it's not there already. */
4454 static void
4455 dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
4456                          struct dp_netdev_rxq *rxq)
4457     OVS_REQUIRES(pmd->port_mutex)
4458 {
4459     int qid = netdev_rxq_get_queue_id(rxq->rx);
4460     uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
4461     struct rxq_poll *poll;
4462
4463     HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
4464         if (poll->rxq == rxq) {
4465             /* 'rxq' is already polled by this thread. Do nothing. */
4466             return;
4467         }
4468     }
4469
4470     poll = xmalloc(sizeof *poll);
4471     poll->rxq = rxq;
4472     hmap_insert(&pmd->poll_list, &poll->node, hash);
4473
4474     pmd->need_reload = true;
4475 }
4476
4477 /* Delete 'poll' from poll_list of PMD thread. */
4478 static void
4479 dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
4480                            struct rxq_poll *poll)
4481     OVS_REQUIRES(pmd->port_mutex)
4482 {
4483     hmap_remove(&pmd->poll_list, &poll->node);
4484     free(poll);
4485
4486     pmd->need_reload = true;
4487 }
4488
4489 /* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
4490  * changes to take effect. */
4491 static void
4492 dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
4493                              struct dp_netdev_port *port)
4494     OVS_REQUIRES(pmd->port_mutex)
4495 {
4496     struct tx_port *tx;
4497
4498     tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
4499     if (tx) {
4500         /* 'port' is already on this thread tx cache. Do nothing. */
4501         return;
4502     }
4503
4504     tx = xzalloc(sizeof *tx);
4505
4506     tx->port = port;
4507     tx->qid = -1;
4508
4509     hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
4510     pmd->need_reload = true;
4511 }
4512
4513 /* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
4514  * changes to take effect. */
4515 static void
4516 dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
4517                                struct tx_port *tx)
4518     OVS_REQUIRES(pmd->port_mutex)
4519 {
4520     hmap_remove(&pmd->tx_ports, &tx->node);
4521     free(tx);
4522     pmd->need_reload = true;
4523 }
4524 \f
/* Returns a freshly allocated copy (made with xstrdup()) of the version
 * string of this datapath, which is always "<built-in>". */
static char *
dpif_netdev_get_datapath_version(void)
{
    char *version = xstrdup("<built-in>");

    return version;
}
4530
4531 static void
4532 dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
4533                     uint16_t tcp_flags, long long now)
4534 {
4535     uint16_t flags;
4536
4537     atomic_store_relaxed(&netdev_flow->stats.used, now);
4538     non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
4539     non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
4540     atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
4541     flags |= tcp_flags;
4542     atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
4543 }
4544
/* Adds 'cnt' to the counter of kind 'type' in the statistics of 'pmd'. */
static void
dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
                       enum dp_stat_type type, int cnt)
{
    non_atomic_ullong_add(&pmd->stats.n[type], cnt);
}
4551
4552 static int
4553 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
4554                  struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
4555                  enum dpif_upcall_type type, const struct nlattr *userdata,
4556                  struct ofpbuf *actions, struct ofpbuf *put_actions)
4557 {
4558     struct dp_netdev *dp = pmd->dp;
4559
4560     if (OVS_UNLIKELY(!dp->upcall_cb)) {
4561         return ENODEV;
4562     }
4563
4564     if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
4565         struct ds ds = DS_EMPTY_INITIALIZER;
4566         char *packet_str;
4567         struct ofpbuf key;
4568         struct odp_flow_key_parms odp_parms = {
4569             .flow = flow,
4570             .mask = wc ? &wc->masks : NULL,
4571             .support = dp_netdev_support,
4572         };
4573
4574         ofpbuf_init(&key, 0);
4575         odp_flow_key_from_flow(&odp_parms, &key);
4576         packet_str = ofp_dp_packet_to_string(packet_);
4577
4578         odp_flow_key_format(key.data, key.size, &ds);
4579
4580         VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
4581                  dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
4582
4583         ofpbuf_uninit(&key);
4584         free(packet_str);
4585
4586         ds_destroy(&ds);
4587     }
4588
4589     return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
4590                          actions, wc, put_actions, dp->upcall_aux);
4591 }
4592
4593 static inline uint32_t
4594 dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
4595                                 const struct miniflow *mf)
4596 {
4597     uint32_t hash, recirc_depth;
4598
4599     if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4600         hash = dp_packet_get_rss_hash(packet);
4601     } else {
4602         hash = miniflow_hash_5tuple(mf, 0);
4603         dp_packet_set_rss_hash(packet, hash);
4604     }
4605
4606     /* The RSS hash must account for the recirculation depth to avoid
4607      * collisions in the exact match cache */
4608     recirc_depth = *recirc_depth_get_unsafe();
4609     if (OVS_UNLIKELY(recirc_depth)) {
4610         hash = hash_finish(hash, recirc_depth);
4611         dp_packet_set_rss_hash(packet, hash);
4612     }
4613     return hash;
4614 }
4615
/* Packets from one input batch that matched the same flow, together with the
 * totals accumulated over them by packet_batch_per_flow_update(). */
struct packet_batch_per_flow {
    /* Sum of dp_packet_size() over the packets in 'array'. */
    unsigned int byte_count;
    /* OR of the TCP flags of the packets in 'array'. */
    uint16_t tcp_flags;
    /* Flow that owns this batch; its 'batch' member points back here while
     * the batch is being filled. */
    struct dp_netdev_flow *flow;

    /* The packets themselves. */
    struct dp_packet_batch array;
};
4623
4624 static inline void
4625 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
4626                              struct dp_packet *packet,
4627                              const struct miniflow *mf)
4628 {
4629     batch->byte_count += dp_packet_size(packet);
4630     batch->tcp_flags |= miniflow_get_tcp_flags(mf);
4631     batch->array.packets[batch->array.count++] = packet;
4632 }
4633
4634 static inline void
4635 packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
4636                            struct dp_netdev_flow *flow)
4637 {
4638     flow->batch = batch;
4639
4640     batch->flow = flow;
4641     dp_packet_batch_init(&batch->array);
4642     batch->byte_count = 0;
4643     batch->tcp_flags = 0;
4644 }
4645
4646 static inline void
4647 packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
4648                               struct dp_netdev_pmd_thread *pmd,
4649                               long long now)
4650 {
4651     struct dp_netdev_actions *actions;
4652     struct dp_netdev_flow *flow = batch->flow;
4653
4654     dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
4655                         batch->tcp_flags, now);
4656
4657     actions = dp_netdev_flow_get_actions(flow);
4658
4659     dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
4660                               actions->actions, actions->size, now);
4661 }
4662
4663 static inline void
4664 dp_netdev_queue_batches(struct dp_packet *pkt,
4665                         struct dp_netdev_flow *flow, const struct miniflow *mf,
4666                         struct packet_batch_per_flow *batches,
4667                         size_t *n_batches)
4668 {
4669     struct packet_batch_per_flow *batch = flow->batch;
4670
4671     if (OVS_UNLIKELY(!batch)) {
4672         batch = &batches[(*n_batches)++];
4673         packet_batch_per_flow_init(batch, flow);
4674     }
4675
4676     packet_batch_per_flow_update(batch, pkt, mf);
4677 }
4678
/* Tries to process every packet in 'packets_' using only the exact match
 * cache 'pmd->flow_cache'.
 *
 * A packet whose flow is found in the cache is queued on the batch of that
 * flow in 'batches' ('*n_batches' grows as new batches get used).  A packet
 * whose flow is not found is put back into 'packets_', so that the missed
 * packets end up grouped at the beginning of the batch, and its miniflow and
 * hash are left in 'keys': 'keys[j]' belongs to the j-th missed packet.
 * Packets shorter than ETH_HEADER_LEN are deleted.
 *
 * The function returns the number of packets left in 'packets_', that is,
 * the packets that still need to be processed by the caller.
 *
 * If 'md_is_valid' is false, the metadata in the packets is not valid and is
 * initialized by this function using 'port_no'.
 */
static inline size_t
emc_processing(struct dp_netdev_pmd_thread *pmd,
               struct dp_packet_batch *packets_,
               struct netdev_flow_key *keys,
               struct packet_batch_per_flow batches[], size_t *n_batches,
               bool md_is_valid, odp_port_t port_no)
{
    struct emc_cache *flow_cache = &pmd->flow_cache;
    struct netdev_flow_key *key = &keys[0];
    size_t n_missed = 0, n_dropped = 0;
    struct dp_packet *packet;
    const size_t size = dp_packet_batch_size(packets_);
    uint32_t cur_min;
    int i;

    /* A value of zero means that the EMC is disabled (see below). */
    atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);

    DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets_) {
        struct dp_netdev_flow *flow;

        /* Too short to hold an Ethernet header: drop it. */
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
            dp_packet_delete(packet);
            n_dropped++;
            continue;
        }

        if (i != size - 1) {
            struct dp_packet **packets = packets_->packets;
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
            pkt_metadata_prefetch_init(&packets[i+1]->md);
        }

        if (!md_is_valid) {
            pkt_metadata_init(&packet->md, port_no);
        }
        /* The key is always extracted into the slot that the next missed
         * packet will keep; on a hit the slot is simply reused. */
        miniflow_extract(packet, &key->mf);
        key->len = 0; /* Not computed yet. */
        key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);

        /* If EMC is disabled skip emc_lookup */
        flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
        if (OVS_LIKELY(flow)) {
            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                    n_batches);
        } else {
            /* Exact match cache missed. Group missed packets together at
             * the beginning of the 'packets' array. */
            dp_packet_batch_refill(packets_, packet, i);
            /* 'keys[n_missed]' contains the key of the current packet and it
             * must be returned to the caller. The next key should be extracted
             * to 'keys[n_missed + 1]'. */
            key = &keys[++n_missed];
        }
    }

    /* Every packet that was neither dropped nor missed hit the EMC. */
    dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT,
                           size - n_dropped - n_missed);

    return dp_packet_batch_size(packets_);
}
4751
/* Handles a flow table miss for 'packet', whose key is 'key', by making a
 * DPIF_UC_MISS upcall.
 *
 * 'actions' and 'put_actions' are scratch buffers owned by the caller; both
 * are cleared before the upcall fills them.
 *
 * If the upcall fails with an error other than ENOSPC, 'packet' is deleted
 * and '*lost_cnt' is incremented.  Otherwise the actions returned by the
 * upcall are executed on 'packet' and, unless the error was ENOSPC, a flow
 * for 'key' is added to 'pmd' (if it has none yet) and offered to the exact
 * match cache. */
static inline void
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
                     struct dp_packet *packet,
                     const struct netdev_flow_key *key,
                     struct ofpbuf *actions, struct ofpbuf *put_actions,
                     int *lost_cnt, long long now)
{
    struct ofpbuf *add_actions;
    struct dp_packet_batch b;
    struct match match;
    ovs_u128 ufid;
    int error;

    match.tun_md.valid = false;
    miniflow_expand(&key->mf, &match.flow);

    ofpbuf_clear(actions);
    ofpbuf_clear(put_actions);

    dpif_flow_hash(pmd->dp->dpif, &match.flow, sizeof match.flow, &ufid);
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
                             &ufid, DPIF_UC_MISS, NULL, actions,
                             put_actions);
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
        dp_packet_delete(packet);
        (*lost_cnt)++;
        return;
    }

    /* The Netlink encoding of datapath flow keys cannot express
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
     * tag is interpreted as exact match on the fact that there is no
     * VLAN.  Unless we refactor a lot of code that translates between
     * Netlink and struct flow representations, we have to do the same
     * here. */
    if (!match.wc.masks.vlans[0].tci) {
        match.wc.masks.vlans[0].tci = htons(0xffff);
    }

    /* We can't allow the packet batching in the next loop to execute
     * the actions.  Otherwise, if there are any slow path actions,
     * we'll send the packet up twice. */
    dp_packet_batch_init_packet(&b, packet);
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
                              actions->data, actions->size, now);

    /* Install 'put_actions' when the upcall filled them in, otherwise the
     * actions that were just executed. */
    add_actions = put_actions->size ? put_actions : actions;
    if (OVS_LIKELY(error != ENOSPC)) {
        struct dp_netdev_flow *netdev_flow;

        /* XXX: There's a race window where a flow covering this packet
         * could have already been installed since we last did the flow
         * lookup before upcall.  This could be solved by moving the
         * mutex lock outside the loop, but that's an awful long time
         * to be locking everyone out of making flow installs.  If we
         * move to a per-core classifier, it would be reasonable. */
        ovs_mutex_lock(&pmd->flow_mutex);
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
        if (OVS_LIKELY(!netdev_flow)) {
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
                                             add_actions->data,
                                             add_actions->size);
        }
        ovs_mutex_unlock(&pmd->flow_mutex);
        emc_probabilistic_insert(pmd, key, netdev_flow);
    }
}
4819
/* Classifies the packets in 'packets_' using the classifier of 'pmd' for
 * 'in_port'.  'keys[i]' must hold the miniflow of the i-th packet; its 'len'
 * member is computed here.
 *
 * Packets that match a flow are offered to the exact match cache and queued
 * on the batch of their flow in 'batches' ('*n_batches' grows as needed).
 * Packets that match nothing are handed to handle_packet_upcall() if the
 * read lock on 'dp->upcall_rwlock' can be taken, and are deleted and counted
 * as lost otherwise.  Either way they are not queued on any batch.
 *
 * The masked hit, lookup hit, miss and lost counters of 'pmd' are updated
 * before returning. */
static inline void
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                     struct dp_packet_batch *packets_,
                     struct netdev_flow_key *keys,
                     struct packet_batch_per_flow batches[], size_t *n_batches,
                     odp_port_t in_port,
                     long long now)
{
    int cnt = packets_->count;
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct dp_packet **packets = packets_->packets;
    struct dpcls *cls;
    /* 'rules[i]' is the rule matched by the i-th packet, or NULL on miss. */
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
    struct dp_netdev *dp = pmd->dp;
    int miss_cnt = 0, lost_cnt = 0;
    int lookup_cnt = 0, add_lookup_cnt;
    bool any_miss;
    size_t i;

    for (i = 0; i < cnt; i++) {
        /* Key length is needed in all the cases, hash computed on demand. */
        keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
    }
    /* Get the classifier for the in_port */
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    if (OVS_LIKELY(cls)) {
        any_miss = !dpcls_lookup(cls, keys, rules, cnt, &lookup_cnt);
    } else {
        /* Without a classifier for 'in_port' every packet is a miss. */
        any_miss = true;
        memset(rules, 0, sizeof(rules));
    }
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
        struct ofpbuf actions, put_actions;

        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);

        for (i = 0; i < cnt; i++) {
            struct dp_netdev_flow *netdev_flow;

            if (OVS_LIKELY(rules[i])) {
                continue;
            }

            /* It's possible that an earlier slow path execution installed
             * a rule covering this flow.  In this case, it's a lot cheaper
             * to catch it here than execute a miss. */
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, &keys[i],
                                                    &add_lookup_cnt);
            if (netdev_flow) {
                lookup_cnt += add_lookup_cnt;
                rules[i] = &netdev_flow->cr;
                continue;
            }

            /* 'rules[i]' stays NULL, so the loop that queues the batches
             * below skips this packet. */
            miss_cnt++;
            handle_packet_upcall(pmd, packets[i], &keys[i], &actions,
                                 &put_actions, &lost_cnt, now);
        }

        ofpbuf_uninit(&actions);
        ofpbuf_uninit(&put_actions);
        fat_rwlock_unlock(&dp->upcall_rwlock);
    } else if (OVS_UNLIKELY(any_miss)) {
        /* The upcall lock could not be taken: every missed packet is
         * dropped and counted both as a miss and as lost. */
        for (i = 0; i < cnt; i++) {
            if (OVS_UNLIKELY(!rules[i])) {
                dp_packet_delete(packets[i]);
                lost_cnt++;
                miss_cnt++;
            }
        }
    }

    /* Queue every packet that has a rule on the batch of its flow. */
    for (i = 0; i < cnt; i++) {
        struct dp_packet *packet = packets[i];
        struct dp_netdev_flow *flow;

        if (OVS_UNLIKELY(!rules[i])) {
            continue;
        }

        flow = dp_netdev_flow_cast(rules[i]);

        emc_probabilistic_insert(pmd, &keys[i], flow);
        dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
    }

    dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
    dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
    dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
    dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
}
4918
/* Packets enter the datapath from a port (or from recirculation) here.
 *
 * For performance reasons a caller may choose not to initialize the metadata
 * in 'packets': in this case 'md_is_valid' is false and this function needs
 * to initialize it using 'port_no'.  If the metadata in 'packets' is already
 * valid, 'md_is_valid' must be true and 'port_no' will be ignored.
 *
 * The packets are first looked up in the exact match cache; the ones that
 * miss it go through fast_path_processing().  The packets are grouped into
 * one batch per matched flow, and finally the actions of each flow are
 * executed on its batch. */
static void
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                  struct dp_packet_batch *packets,
                  bool md_is_valid, odp_port_t port_no)
{
    int cnt = packets->count;
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
    long long now = time_msec();
    size_t n_batches;
    odp_port_t in_port;

    n_batches = 0;
    /* After this call 'packets' only holds the packets that missed the
     * exact match cache. */
    emc_processing(pmd, packets, keys, batches, &n_batches,
                            md_is_valid, port_no);
    if (!dp_packet_batch_is_empty(packets)) {
        /* Get ingress port from first packet's metadata. */
        in_port = packets->packets[0]->md.in_port.odp_port;
        fast_path_processing(pmd, packets, keys, batches, &n_batches,
                             in_port, now);
    }

    /* All the flow batches need to be reset before any call to
     * packet_batch_per_flow_execute() as it could potentially trigger
     * recirculation. When a packet matching flow 'j' happens to be
     * recirculated, the nested call to dp_netdev_input__() could potentially
     * classify the packet as matching another flow - say 'k'. It could happen
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
     * already its own batches[k] still waiting to be served.  So if its
     * 'batch' member is not reset, the recirculated packet would be wrongly
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
    size_t i;
    for (i = 0; i < n_batches; i++) {
        batches[i].flow->batch = NULL;
    }

    for (i = 0; i < n_batches; i++) {
        packet_batch_per_flow_execute(&batches[i], pmd, now);
    }
}
4972
4973 static void
4974 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
4975                 struct dp_packet_batch *packets,
4976                 odp_port_t port_no)
4977 {
4978     dp_netdev_input__(pmd, packets, false, port_no);
4979 }
4980
/* Feeds 'packets' to the datapath again.  Their metadata is treated as
 * valid, so it is kept as is and no port number is needed (0 is passed). */
static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets)
{
    const bool md_is_valid = true;

    dp_netdev_input__(pmd, packets, md_is_valid, 0);
}
4987
/* Context that dp_execute_cb() receives through its 'aux_' argument. */
struct dp_netdev_execute_aux {
    /* Thread on whose behalf the actions are executed. */
    struct dp_netdev_pmd_thread *pmd;
    /* Time passed down to the helpers that take a 'now' argument;
     * presumably in milliseconds, as returned by time_msec() -- TODO
     * confirm against the code that fills in this struct. */
    long long now;
    /* NOTE(review): presumably the flow of the packets being executed;
     * not used in the part of the file reviewed here -- confirm. */
    const struct flow *flow;
};
4993
4994 static void
4995 dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
4996                                  void *aux)
4997 {
4998     struct dp_netdev *dp = get_dp_netdev(dpif);
4999     dp->dp_purge_aux = aux;
5000     dp->dp_purge_cb = cb;
5001 }
5002
5003 static void
5004 dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
5005                                void *aux)
5006 {
5007     struct dp_netdev *dp = get_dp_netdev(dpif);
5008     dp->upcall_aux = aux;
5009     dp->upcall_cb = cb;
5010 }
5011
5012 static void
5013 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
5014                                long long now, bool purge)
5015 {
5016     struct tx_port *tx;
5017     struct dp_netdev_port *port;
5018     long long interval;
5019
5020     HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
5021         if (!tx->port->dynamic_txqs) {
5022             continue;
5023         }
5024         interval = now - tx->last_used;
5025         if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
5026             port = tx->port;
5027             ovs_mutex_lock(&port->txq_used_mutex);
5028             port->txq_used[tx->qid]--;
5029             ovs_mutex_unlock(&port->txq_used_mutex);
5030             tx->qid = -1;
5031         }
5032     }
5033 }
5034
5035 static int
5036 dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
5037                            struct tx_port *tx, long long now)
5038 {
5039     struct dp_netdev_port *port;
5040     long long interval;
5041     int i, min_cnt, min_qid;
5042
5043     if (OVS_UNLIKELY(!now)) {
5044         now = time_msec();
5045     }
5046
5047     interval = now - tx->last_used;
5048     tx->last_used = now;
5049
5050     if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
5051         return tx->qid;
5052     }
5053
5054     port = tx->port;
5055
5056     ovs_mutex_lock(&port->txq_used_mutex);
5057     if (tx->qid >= 0) {
5058         port->txq_used[tx->qid]--;
5059         tx->qid = -1;
5060     }
5061
5062     min_cnt = -1;
5063     min_qid = 0;
5064     for (i = 0; i < netdev_n_txq(port->netdev); i++) {
5065         if (port->txq_used[i] < min_cnt || min_cnt == -1) {
5066             min_cnt = port->txq_used[i];
5067             min_qid = i;
5068         }
5069     }
5070
5071     port->txq_used[min_qid]++;
5072     tx->qid = min_qid;
5073
5074     ovs_mutex_unlock(&port->txq_used_mutex);
5075
5076     dpif_netdev_xps_revalidate_pmd(pmd, now, false);
5077
5078     VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
5079              pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
5080     return min_qid;
5081 }
5082
5083 static struct tx_port *
5084 pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5085                           odp_port_t port_no)
5086 {
5087     return tx_port_lookup(&pmd->tnl_port_cache, port_no);
5088 }
5089
5090 static struct tx_port *
5091 pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
5092                            odp_port_t port_no)
5093 {
5094     return tx_port_lookup(&pmd->send_port_cache, port_no);
5095 }
5096
5097 static int
5098 push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
5099                 const struct nlattr *attr,
5100                 struct dp_packet_batch *batch)
5101 {
5102     struct tx_port *tun_port;
5103     const struct ovs_action_push_tnl *data;
5104     int err;
5105
5106     data = nl_attr_get(attr);
5107
5108     tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
5109     if (!tun_port) {
5110         err = -EINVAL;
5111         goto error;
5112     }
5113     err = netdev_push_header(tun_port->port->netdev, batch, data);
5114     if (!err) {
5115         return 0;
5116     }
5117 error:
5118     dp_packet_delete_batch(batch, true);
5119     return err;
5120 }
5121
5122 static void
5123 dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
5124                             struct dp_packet *packet, bool may_steal,
5125                             struct flow *flow, ovs_u128 *ufid,
5126                             struct ofpbuf *actions,
5127                             const struct nlattr *userdata, long long now)
5128 {
5129     struct dp_packet_batch b;
5130     int error;
5131
5132     ofpbuf_clear(actions);
5133
5134     error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
5135                              DPIF_UC_ACTION, userdata, actions,
5136                              NULL);
5137     if (!error || error == ENOSPC) {
5138         dp_packet_batch_init_packet(&b, packet);
5139         dp_netdev_execute_actions(pmd, &b, may_steal, flow,
5140                                   actions->data, actions->size, now);
5141     } else if (may_steal) {
5142         dp_packet_delete(packet);
5143     }
5144 }
5145
5146 static void
5147 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
5148               const struct nlattr *a, bool may_steal)
5149     OVS_NO_THREAD_SAFETY_ANALYSIS
5150 {
5151     struct dp_netdev_execute_aux *aux = aux_;
5152     uint32_t *depth = recirc_depth_get();
5153     struct dp_netdev_pmd_thread *pmd = aux->pmd;
5154     struct dp_netdev *dp = pmd->dp;
5155     int type = nl_attr_type(a);
5156     long long now = aux->now;
5157     struct tx_port *p;
5158
5159     switch ((enum ovs_action_attr)type) {
5160     case OVS_ACTION_ATTR_OUTPUT:
5161         p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
5162         if (OVS_LIKELY(p)) {
5163             int tx_qid;
5164             bool dynamic_txqs;
5165
5166             dynamic_txqs = p->port->dynamic_txqs;
5167             if (dynamic_txqs) {
5168                 tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
5169             } else {
5170                 tx_qid = pmd->static_tx_qid;
5171             }
5172
5173             netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
5174                         dynamic_txqs);
5175             return;
5176         }
5177         break;
5178
5179     case OVS_ACTION_ATTR_TUNNEL_PUSH:
5180         if (*depth < MAX_RECIRC_DEPTH) {
5181             dp_packet_batch_apply_cutlen(packets_);
5182             push_tnl_action(pmd, a, packets_);
5183             return;
5184         }
5185         break;
5186
5187     case OVS_ACTION_ATTR_TUNNEL_POP:
5188         if (*depth < MAX_RECIRC_DEPTH) {
5189             struct dp_packet_batch *orig_packets_ = packets_;
5190             odp_port_t portno = nl_attr_get_odp_port(a);
5191
5192             p = pmd_tnl_port_cache_lookup(pmd, portno);
5193             if (p) {
5194                 struct dp_packet_batch tnl_pkt;
5195
5196                 if (!may_steal) {
5197                     dp_packet_batch_clone(&tnl_pkt, packets_);
5198                     packets_ = &tnl_pkt;
5199                     dp_packet_batch_reset_cutlen(orig_packets_);
5200                 }
5201
5202                 dp_packet_batch_apply_cutlen(packets_);
5203
5204                 netdev_pop_header(p->port->netdev, packets_);
5205                 if (dp_packet_batch_is_empty(packets_)) {
5206                     return;
5207                 }
5208
5209                 struct dp_packet *packet;
5210                 DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5211                     packet->md.in_port.odp_port = portno;
5212                 }
5213
5214                 (*depth)++;
5215                 dp_netdev_recirculate(pmd, packets_);
5216                 (*depth)--;
5217                 return;
5218             }
5219         }
5220         break;
5221
5222     case OVS_ACTION_ATTR_USERSPACE:
5223         if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
5224             struct dp_packet_batch *orig_packets_ = packets_;
5225             const struct nlattr *userdata;
5226             struct dp_packet_batch usr_pkt;
5227             struct ofpbuf actions;
5228             struct flow flow;
5229             ovs_u128 ufid;
5230             bool clone = false;
5231
5232             userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
5233             ofpbuf_init(&actions, 0);
5234
5235             if (packets_->trunc) {
5236                 if (!may_steal) {
5237                     dp_packet_batch_clone(&usr_pkt, packets_);
5238                     packets_ = &usr_pkt;
5239                     clone = true;
5240                     dp_packet_batch_reset_cutlen(orig_packets_);
5241                 }
5242
5243                 dp_packet_batch_apply_cutlen(packets_);
5244             }
5245
5246             struct dp_packet *packet;
5247             DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5248                 flow_extract(packet, &flow);
5249                 dpif_flow_hash(dp->dpif, &flow, sizeof flow, &ufid);
5250                 dp_execute_userspace_action(pmd, packet, may_steal, &flow,
5251                                             &ufid, &actions, userdata, now);
5252             }
5253
5254             if (clone) {
5255                 dp_packet_delete_batch(packets_, true);
5256             }
5257
5258             ofpbuf_uninit(&actions);
5259             fat_rwlock_unlock(&dp->upcall_rwlock);
5260
5261             return;
5262         }
5263         break;
5264
5265     case OVS_ACTION_ATTR_RECIRC:
5266         if (*depth < MAX_RECIRC_DEPTH) {
5267             struct dp_packet_batch recirc_pkts;
5268
5269             if (!may_steal) {
5270                dp_packet_batch_clone(&recirc_pkts, packets_);
5271                packets_ = &recirc_pkts;
5272             }
5273
5274             struct dp_packet *packet;
5275             DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
5276                 packet->md.recirc_id = nl_attr_get_u32(a);
5277             }
5278
5279             (*depth)++;
5280             dp_netdev_recirculate(pmd, packets_);
5281             (*depth)--;
5282
5283             return;
5284         }
5285
5286         VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
5287         break;
5288
5289     case OVS_ACTION_ATTR_CT: {
5290         const struct nlattr *b;
5291         bool force = false;
5292         bool commit = false;
5293         unsigned int left;
5294         uint16_t zone = 0;
5295         const char *helper = NULL;
5296         const uint32_t *setmark = NULL;
5297         const struct ovs_key_ct_labels *setlabel = NULL;
5298         struct nat_action_info_t nat_action_info;
5299         struct nat_action_info_t *nat_action_info_ref = NULL;
5300         bool nat_config = false;
5301
5302         NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
5303                                  nl_attr_get_size(a)) {
5304             enum ovs_ct_attr sub_type = nl_attr_type(b);
5305
5306             switch(sub_type) {
5307             case OVS_CT_ATTR_FORCE_COMMIT:
5308                 force = true;
5309                 /* fall through. */
5310             case OVS_CT_ATTR_COMMIT:
5311                 commit = true;
5312                 break;
5313             case OVS_CT_ATTR_ZONE:
5314                 zone = nl_attr_get_u16(b);
5315                 break;
5316             case OVS_CT_ATTR_HELPER:
5317                 helper = nl_attr_get_string(b);
5318                 break;
5319             case OVS_CT_ATTR_MARK:
5320                 setmark = nl_attr_get(b);
5321                 break;
5322             case OVS_CT_ATTR_LABELS:
5323                 setlabel = nl_attr_get(b);
5324                 break;
5325             case OVS_CT_ATTR_EVENTMASK:
5326                 /* Silently ignored, as userspace datapath does not generate
5327                  * netlink events. */
5328                 break;
5329             case OVS_CT_ATTR_NAT: {
5330                 const struct nlattr *b_nest;
5331                 unsigned int left_nest;
5332                 bool ip_min_specified = false;
5333                 bool proto_num_min_specified = false;
5334                 bool ip_max_specified = false;
5335                 bool proto_num_max_specified = false;
5336                 memset(&nat_action_info, 0, sizeof nat_action_info);
5337                 nat_action_info_ref = &nat_action_info;
5338
5339                 NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
5340                     enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
5341
5342                     switch (sub_type_nest) {
5343                     case OVS_NAT_ATTR_SRC:
5344                     case OVS_NAT_ATTR_DST:
5345                         nat_config = true;
5346                         nat_action_info.nat_action |=
5347                             ((sub_type_nest == OVS_NAT_ATTR_SRC)
5348                                 ? NAT_ACTION_SRC : NAT_ACTION_DST);
5349                         break;
5350                     case OVS_NAT_ATTR_IP_MIN:
5351                         memcpy(&nat_action_info.min_addr,
5352                                nl_attr_get(b_nest),
5353                                nl_attr_get_size(b_nest));
5354                         ip_min_specified = true;
5355                         break;
5356                     case OVS_NAT_ATTR_IP_MAX:
5357                         memcpy(&nat_action_info.max_addr,
5358                                nl_attr_get(b_nest),
5359                                nl_attr_get_size(b_nest));
5360                         ip_max_specified = true;
5361                         break;
5362                     case OVS_NAT_ATTR_PROTO_MIN:
5363                         nat_action_info.min_port =
5364                             nl_attr_get_u16(b_nest);
5365                         proto_num_min_specified = true;
5366                         break;
5367                     case OVS_NAT_ATTR_PROTO_MAX:
5368                         nat_action_info.max_port =
5369                             nl_attr_get_u16(b_nest);
5370                         proto_num_max_specified = true;
5371                         break;
5372                     case OVS_NAT_ATTR_PERSISTENT:
5373                     case OVS_NAT_ATTR_PROTO_HASH:
5374                     case OVS_NAT_ATTR_PROTO_RANDOM:
5375                         break;
5376                     case OVS_NAT_ATTR_UNSPEC:
5377                     case __OVS_NAT_ATTR_MAX:
5378                         OVS_NOT_REACHED();
5379                     }
5380                 }
5381
5382                 if (ip_min_specified && !ip_max_specified) {
5383                     nat_action_info.max_addr = nat_action_info.min_addr;
5384                 }
5385                 if (proto_num_min_specified && !proto_num_max_specified) {
5386                     nat_action_info.max_port = nat_action_info.min_port;
5387                 }
5388                 if (proto_num_min_specified || proto_num_max_specified) {
5389                     if (nat_action_info.nat_action & NAT_ACTION_SRC) {
5390                         nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
5391                     } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
5392                         nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
5393                     }
5394                 }
5395                 break;
5396             }
5397             case OVS_CT_ATTR_UNSPEC:
5398             case __OVS_CT_ATTR_MAX:
5399                 OVS_NOT_REACHED();
5400             }
5401         }
5402
5403         /* We won't be able to function properly in this case, hence
5404          * complain loudly. */
5405         if (nat_config && !commit) {
5406             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
5407             VLOG_WARN_RL(&rl, "NAT specified without commit.");
5408         }
5409
5410         conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
5411                           commit, zone, setmark, setlabel, helper,
5412                           nat_action_info_ref, now);
5413         break;
5414     }
5415
5416     case OVS_ACTION_ATTR_METER:
5417         dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
5418                             time_msec());
5419         break;
5420
5421     case OVS_ACTION_ATTR_PUSH_VLAN:
5422     case OVS_ACTION_ATTR_POP_VLAN:
5423     case OVS_ACTION_ATTR_PUSH_MPLS:
5424     case OVS_ACTION_ATTR_POP_MPLS:
5425     case OVS_ACTION_ATTR_SET:
5426     case OVS_ACTION_ATTR_SET_MASKED:
5427     case OVS_ACTION_ATTR_SAMPLE:
5428     case OVS_ACTION_ATTR_HASH:
5429     case OVS_ACTION_ATTR_UNSPEC:
5430     case OVS_ACTION_ATTR_TRUNC:
5431     case OVS_ACTION_ATTR_PUSH_ETH:
5432     case OVS_ACTION_ATTR_POP_ETH:
5433     case OVS_ACTION_ATTR_CLONE:
5434     case OVS_ACTION_ATTR_ENCAP_NSH:
5435     case OVS_ACTION_ATTR_DECAP_NSH:
5436     case __OVS_ACTION_ATTR_MAX:
5437         OVS_NOT_REACHED();
5438     }
5439
5440     dp_packet_delete_batch(packets_, may_steal);
5441 }
5442
5443 static void
5444 dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
5445                           struct dp_packet_batch *packets,
5446                           bool may_steal, const struct flow *flow,
5447                           const struct nlattr *actions, size_t actions_len,
5448                           long long now)
5449 {
5450     struct dp_netdev_execute_aux aux = { pmd, now, flow };
5451
5452     odp_execute_actions(&aux, packets, may_steal, actions,
5453                         actions_len, dp_execute_cb);
5454 }
5455
/* State of one conntrack dump on a netdev datapath.  Allocated by
 * dpif_netdev_ct_dump_start() and freed by dpif_netdev_ct_dump_done(). */
struct dp_netdev_ct_dump {
    /* Generic dump handle given back to the caller of the start function;
     * the next/done functions recover this structure from it. */
    struct ct_dpif_dump_state up;
    /* Iteration state handed to conntrack_dump_start/next/done(). */
    struct conntrack_dump dump;
    /* Set to the datapath's conntrack instance when the dump starts. */
    struct conntrack *ct;
    /* Datapath whose connections are being dumped. */
    struct dp_netdev *dp;
};
5462
5463 static int
5464 dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
5465                           const uint16_t *pzone, int *ptot_bkts)
5466 {
5467     struct dp_netdev *dp = get_dp_netdev(dpif);
5468     struct dp_netdev_ct_dump *dump;
5469
5470     dump = xzalloc(sizeof *dump);
5471     dump->dp = dp;
5472     dump->ct = &dp->conntrack;
5473
5474     conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
5475
5476     *dump_ = &dump->up;
5477
5478     return 0;
5479 }
5480
5481 static int
5482 dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
5483                          struct ct_dpif_dump_state *dump_,
5484                          struct ct_dpif_entry *entry)
5485 {
5486     struct dp_netdev_ct_dump *dump;
5487
5488     INIT_CONTAINER(dump, dump_, up);
5489
5490     return conntrack_dump_next(&dump->dump, entry);
5491 }
5492
5493 static int
5494 dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
5495                          struct ct_dpif_dump_state *dump_)
5496 {
5497     struct dp_netdev_ct_dump *dump;
5498     int err;
5499
5500     INIT_CONTAINER(dump, dump_, up);
5501
5502     err = conntrack_dump_done(&dump->dump);
5503
5504     free(dump);
5505
5506     return err;
5507 }
5508
5509 static int
5510 dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone)
5511 {
5512     struct dp_netdev *dp = get_dp_netdev(dpif);
5513
5514     return conntrack_flush(&dp->conntrack, zone);
5515 }
5516
/* Provider callback table for the userspace "netdev" datapath.
 *
 * The initializer is positional, so every entry must stay in the order in
 * which 'struct dpif_class' declares its members.  A NULL slot means this
 * provider supplies no callback for that operation.
 * dpif_dummy_register__() copies this table to build the dummy providers. */
const struct dpif_class dpif_netdev_class = {
    "netdev",
    dpif_netdev_init,
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_set_config,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    NULL,                       /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_create,
    dpif_netdev_flow_dump_destroy,
    dpif_netdev_flow_dump_thread_create,
    dpif_netdev_flow_dump_thread_destroy,
    dpif_netdev_flow_dump_next,
    dpif_netdev_operate,
    NULL,                       /* recv_set */
    NULL,                       /* handlers_set */
    dpif_netdev_set_config,
    dpif_netdev_queue_to_priority,
    NULL,                       /* recv */
    NULL,                       /* recv_wait */
    NULL,                       /* recv_purge */
    dpif_netdev_register_dp_purge_cb,
    dpif_netdev_register_upcall_cb,
    dpif_netdev_enable_upcall,
    dpif_netdev_disable_upcall,
    dpif_netdev_get_datapath_version,
    dpif_netdev_ct_dump_start,
    dpif_netdev_ct_dump_next,
    dpif_netdev_ct_dump_done,
    dpif_netdev_ct_flush,
    dpif_netdev_meter_get_features,
    dpif_netdev_meter_set,
    dpif_netdev_meter_get,
    dpif_netdev_meter_del,
};
5567
5568 static void
5569 dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
5570                               const char *argv[], void *aux OVS_UNUSED)
5571 {
5572     struct dp_netdev_port *port;
5573     struct dp_netdev *dp;
5574     odp_port_t port_no;
5575
5576     ovs_mutex_lock(&dp_netdev_mutex);
5577     dp = shash_find_data(&dp_netdevs, argv[1]);
5578     if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
5579         ovs_mutex_unlock(&dp_netdev_mutex);
5580         unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
5581         return;
5582     }
5583     ovs_refcount_ref(&dp->ref_cnt);
5584     ovs_mutex_unlock(&dp_netdev_mutex);
5585
5586     ovs_mutex_lock(&dp->port_mutex);
5587     if (get_port_by_name(dp, argv[2], &port)) {
5588         unixctl_command_reply_error(conn, "unknown port");
5589         goto exit;
5590     }
5591
5592     port_no = u32_to_odp(atoi(argv[3]));
5593     if (!port_no || port_no == ODPP_NONE) {
5594         unixctl_command_reply_error(conn, "bad port number");
5595         goto exit;
5596     }
5597     if (dp_netdev_lookup_port(dp, port_no)) {
5598         unixctl_command_reply_error(conn, "port number already in use");
5599         goto exit;
5600     }
5601
5602     /* Remove port. */
5603     hmap_remove(&dp->ports, &port->node);
5604     reconfigure_datapath(dp);
5605
5606     /* Reinsert with new port number. */
5607     port->port_no = port_no;
5608     hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
5609     reconfigure_datapath(dp);
5610
5611     seq_change(dp->port_seq);
5612     unixctl_command_reply(conn, NULL);
5613
5614 exit:
5615     ovs_mutex_unlock(&dp->port_mutex);
5616     dp_netdev_unref(dp);
5617 }
5618
5619 static void
5620 dpif_dummy_register__(const char *type)
5621 {
5622     struct dpif_class *class;
5623
5624     class = xmalloc(sizeof *class);
5625     *class = dpif_netdev_class;
5626     class->type = xstrdup(type);
5627     dp_register_provider(class);
5628 }
5629
/* Replaces the provider registered for 'type' with a dummy one.
 *
 * The dummy is registered when unregistering the existing provider either
 * succeeds or fails with EAFNOSUPPORT.  The latter is tolerated so that
 * --enable-dummy=system works with a userland-only build, which is useful
 * for the testsuite.  Any other error leaves the registration untouched. */
static void
dpif_dummy_override(const char *type)
{
    int error = dp_unregister_provider(type);

    if (error && error != EAFNOSUPPORT) {
        return;
    }
    dpif_dummy_register__(type);
}
5644
5645 void
5646 dpif_dummy_register(enum dummy_level level)
5647 {
5648     if (level == DUMMY_OVERRIDE_ALL) {
5649         struct sset types;
5650         const char *type;
5651
5652         sset_init(&types);
5653         dp_enumerate_types(&types);
5654         SSET_FOR_EACH (type, &types) {
5655             dpif_dummy_override(type);
5656         }
5657         sset_destroy(&types);
5658     } else if (level == DUMMY_OVERRIDE_SYSTEM) {
5659         dpif_dummy_override("system");
5660     }
5661
5662     dpif_dummy_register__("dummy");
5663
5664     unixctl_command_register("dpif-dummy/change-port-number",
5665                              "dp port new-number",
5666                              3, 3, dpif_dummy_change_port_number, NULL);
5667 }
5668 \f
5669 /* Datapath Classifier. */
5670
/* A set of rules that all have the same fields wildcarded.
 *
 * Instances are variable-sized: dpcls_create_subtable() allocates
 * 'sizeof(struct dpcls_subtable) - sizeof mask.mf + mask->len' bytes, which
 * is why 'mask' has to remain the last member. */
struct dpcls_subtable {
    /* This field is only used by writers. */
    struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */

    /* These fields are accessed by readers. */
    struct cmap rules;           /* Contains "struct dpcls_rule"s. */
    uint32_t hit_cnt;            /* Number of match hits in subtable in current
                                    optimization interval. */
    struct netdev_flow_key mask; /* Wildcards for fields (const). */
    /* 'mask' must be the last field, additional space is allocated here. */
};
5683
5684 /* Initializes 'cls' as a classifier that initially contains no classification
5685  * rules. */
5686 static void
5687 dpcls_init(struct dpcls *cls)
5688 {
5689     cmap_init(&cls->subtables_map);
5690     pvector_init(&cls->subtables);
5691 }
5692
5693 static void
5694 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
5695 {
5696     VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
5697     pvector_remove(&cls->subtables, subtable);
5698     cmap_remove(&cls->subtables_map, &subtable->cmap_node,
5699                 subtable->mask.hash);
5700     cmap_destroy(&subtable->rules);
5701     ovsrcu_postpone(free, subtable);
5702 }
5703
5704 /* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
5705  * caller's responsibility.
5706  * May only be called after all the readers have been terminated. */
5707 static void
5708 dpcls_destroy(struct dpcls *cls)
5709 {
5710     if (cls) {
5711         struct dpcls_subtable *subtable;
5712
5713         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
5714             ovs_assert(cmap_count(&subtable->rules) == 0);
5715             dpcls_destroy_subtable(cls, subtable);
5716         }
5717         cmap_destroy(&cls->subtables_map);
5718         pvector_destroy(&cls->subtables);
5719     }
5720 }
5721
5722 static struct dpcls_subtable *
5723 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
5724 {
5725     struct dpcls_subtable *subtable;
5726
5727     /* Need to add one. */
5728     subtable = xmalloc(sizeof *subtable
5729                        - sizeof subtable->mask.mf + mask->len);
5730     cmap_init(&subtable->rules);
5731     subtable->hit_cnt = 0;
5732     netdev_flow_key_clone(&subtable->mask, mask);
5733     cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
5734     /* Add the new subtable at the end of the pvector (with no hits yet) */
5735     pvector_insert(&cls->subtables, subtable, 0);
5736     VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
5737              cmap_count(&cls->subtables_map), subtable, cls->in_port);
5738     pvector_publish(&cls->subtables);
5739
5740     return subtable;
5741 }
5742
5743 static inline struct dpcls_subtable *
5744 dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
5745 {
5746     struct dpcls_subtable *subtable;
5747
5748     CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
5749                              &cls->subtables_map) {
5750         if (netdev_flow_key_equal(&subtable->mask, mask)) {
5751             return subtable;
5752         }
5753     }
5754     return dpcls_create_subtable(cls, mask);
5755 }
5756
5757
5758 /* Periodically sort the dpcls subtable vectors according to hit counts */
5759 static void
5760 dpcls_sort_subtable_vector(struct dpcls *cls)
5761 {
5762     struct pvector *pvec = &cls->subtables;
5763     struct dpcls_subtable *subtable;
5764
5765     PVECTOR_FOR_EACH (subtable, pvec) {
5766         pvector_change_priority(pvec, subtable, subtable->hit_cnt);
5767         subtable->hit_cnt = 0;
5768     }
5769     pvector_publish(pvec);
5770 }
5771
5772 static inline void
5773 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd)
5774 {
5775     struct dpcls *cls;
5776     long long int now = time_msec();
5777
5778     if (now > pmd->next_optimization) {
5779         /* Try to obtain the flow lock to block out revalidator threads.
5780          * If not possible, just try next time. */
5781         if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
5782             /* Optimize each classifier */
5783             CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
5784                 dpcls_sort_subtable_vector(cls);
5785             }
5786             ovs_mutex_unlock(&pmd->flow_mutex);
5787             /* Start new measuring interval */
5788             pmd->next_optimization = now + DPCLS_OPTIMIZATION_INTERVAL;
5789         }
5790     }
5791 }
5792
5793 /* Insert 'rule' into 'cls'. */
5794 static void
5795 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
5796              const struct netdev_flow_key *mask)
5797 {
5798     struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
5799
5800     /* Refer to subtable's mask, also for later removal. */
5801     rule->mask = &subtable->mask;
5802     cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
5803 }
5804
5805 /* Removes 'rule' from 'cls', also destructing the 'rule'. */
5806 static void
5807 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
5808 {
5809     struct dpcls_subtable *subtable;
5810
5811     ovs_assert(rule->mask);
5812
5813     /* Get subtable from reference in rule->mask. */
5814     INIT_CONTAINER(subtable, rule->mask, mask);
5815     if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
5816         == 0) {
5817         /* Delete empty subtable. */
5818         dpcls_destroy_subtable(cls, subtable);
5819         pvector_publish(&cls->subtables);
5820     }
5821 }
5822
5823 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
5824  * in 'mask' the values in 'key' and 'target' are the same. */
5825 static inline bool
5826 dpcls_rule_matches_key(const struct dpcls_rule *rule,
5827                        const struct netdev_flow_key *target)
5828 {
5829     const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
5830     const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
5831     uint64_t value;
5832
5833     NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
5834         if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
5835             return false;
5836         }
5837     }
5838     return true;
5839 }
5840
/* For each of the 'cnt' miniflows in 'keys' performs a classifier lookup,
 * writing the result into the corresponding slot in 'rules'.  'rules' is
 * zeroed first, so the slot of a key that matches nothing is left NULL.
 *
 * This function is optimized for use in the userspace datapath and therefore
 * does not implement a lot of features available in the standard
 * classifier_lookup() function.  Specifically, it does not implement
 * priorities, instead returning any rule which matches the flow.
 *
 * If 'num_lookups_p' is nonnull, it receives the sum, over all keys that
 * matched, of the 1-based position of the subtable in which the match was
 * found.
 *
 * NOTE(review): 'cnt' is assumed to be in the range 1...MAP_BITS; the shift
 * below is by 'MAP_BITS - cnt' bits, which would be the full width of
 * 'map_type' for a 'cnt' of 0 -- confirm that callers never pass 0.
 *
 * Returns true if all miniflows found a corresponding rule. */
static bool
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
             struct dpcls_rule **rules, const size_t cnt,
             int *num_lookups_p)
{
    /* The received 'cnt' miniflows are the search-keys that will be processed
     * to find a matching entry into the available subtables.
     * Bit i of a map_type value stands for keys[i], so map_type must have at
     * least NETDEV_MAX_BURST bits (checked at build time below). */
    typedef uint32_t map_type;
#define MAP_BITS (sizeof(map_type) * CHAR_BIT)
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

    struct dpcls_subtable *subtable;

    map_type keys_map = TYPE_MAXIMUM(map_type); /* Set all bits. */
    map_type found_map;
    uint32_t hashes[MAP_BITS];
    const struct cmap_node *nodes[MAP_BITS];

    if (cnt != MAP_BITS) {
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
    }
    memset(rules, 0, cnt * sizeof *rules);

    /* 'subtable_pos' is the 1-based position of the subtable currently being
     * searched; 'lookups_match' accumulates it for every key that matches. */
    int lookups_match = 0, subtable_pos = 1;

    /* The Datapath classifier - aka dpcls - is composed of subtables.
     * Subtables are dynamically created as needed when new rules are inserted.
     * Each subtable collects rules with matches on a specific subset of packet
     * fields as defined by the subtable's mask.  We proceed to process every
     * search-key against each subtable, but when a match is found for a
     * search-key, the search for that key can stop because the rules are
     * non-overlapping. */
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
        int i;

        /* Compute hashes for the remaining keys.  Each search-key is
         * masked with the subtable's mask to avoid hashing the wildcarded
         * bits. */
        ULLONG_FOR_EACH_1(i, keys_map) {
            hashes[i] = netdev_flow_key_hash_in_mask(&keys[i],
                                                     &subtable->mask);
        }
        /* Lookup. */
        found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
        /* Check results.  When the i-th bit of found_map is set, it means
         * that a set of nodes with a matching hash value was found for the
         * i-th search-key.  Due to possible hash collisions we need to check
         * which of the found rules, if any, really matches our masked
         * search-key. */
        ULLONG_FOR_EACH_1(i, found_map) {
            struct dpcls_rule *rule;

            CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
                if (OVS_LIKELY(dpcls_rule_matches_key(rule, &keys[i]))) {
                    rules[i] = rule;
                    /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
                     * within one second optimization interval. */
                    subtable->hit_cnt++;
                    lookups_match += subtable_pos;
                    goto next;
                }
            }
            /* None of the found rules was a match.  Reset the i-th bit to
             * keep searching this key in the next subtable. */
            ULLONG_SET0(found_map, i);  /* Did not match. */
        next:
            ;                     /* Keep Sparse happy. */
        }
        keys_map &= ~found_map;             /* Clear the found rules. */
        if (!keys_map) {
            /* Every key has a rule; no need to visit further subtables. */
            if (num_lookups_p) {
                *num_lookups_p = lookups_match;
            }
            return true;              /* All found. */
        }
        subtable_pos++;
    }
    if (num_lookups_p) {
        *num_lookups_p = lookups_match;
    }
    return false;                     /* Some misses. */
}