--- /dev/null
+/*
+ * Copyright (c) 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <string.h>
+
+#include "coverage.h"
+#include "csum.h"
+#include "ipf.h"
+#include "latch.h"
+#include "openvswitch/hmap.h"
+#include "openvswitch/poll-loop.h"
+#include "openvswitch/vlog.h"
+#include "ovs-atomic.h"
+#include "packets.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(ipf);
+COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
+
+enum {
+ IPV4_PACKET_MAX_HDR_SIZE = 60,
+ IPV4_PACKET_MAX_SIZE = 65535,
+ IPV6_PACKET_MAX_DATA = 65535,
+};
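+
+/* IPV4_PACKET_MAX_HDR_SIZE reflects the 4 bit IHL field, which counts 32 bit
+ * words, and the 65535 byte limits come from the 16 bit IPv4 total length
+ * and IPv6 payload length fields. */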
+
+enum ipf_list_state {
+ IPF_LIST_STATE_UNUSED,
+ IPF_LIST_STATE_REASS_FAIL,
+ IPF_LIST_STATE_OTHER_SEEN,
+ IPF_LIST_STATE_FIRST_SEEN,
+ IPF_LIST_STATE_LAST_SEEN,
+ IPF_LIST_STATE_FIRST_LAST_SEEN,
+ IPF_LIST_STATE_COMPLETED,
+ IPF_LIST_STATE_NUM,
+};
+
+static char *ipf_state_name[IPF_LIST_STATE_NUM] =
+ {"unused", "reassemble fail", "other frag", "first frag", "last frag",
+ "first/last frag", "complete"};
+
+enum ipf_list_type {
+ IPF_FRAG_COMPLETED_LIST,
+ IPF_FRAG_EXPIRY_LIST,
+};
+
+enum {
+ IPF_INVALID_IDX = -1,
+ IPF_V4_FRAG_SIZE_LBOUND = 400,
+ IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
+ IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
+ IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
+ IPF_MAX_FRAGS_DEFAULT = 1000,
+ IPF_NFRAG_UBOUND = 5000,
+};
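+
+/* IPF_V6_FRAG_SIZE_MIN_DEF (1280) matches the IPv6 minimum link MTU, so a
+ * well behaved sender should not emit smaller non-final v6 fragments. The
+ * 400 byte lower bounds are mainly useful for testing; a non-final fragment
+ * smaller than the configured minimum is marked invalid and counted under
+ * IPF_NFRAGS_TOO_SMALL. */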
+
+enum ipf_counter_type {
+ IPF_NFRAGS_ACCEPTED,
+ IPF_NFRAGS_COMPL_SENT,
+ IPF_NFRAGS_EXPD_SENT,
+ IPF_NFRAGS_TOO_SMALL,
+ IPF_NFRAGS_OVERLAP,
+ IPF_NFRAGS_PURGED,
+ IPF_NFRAGS_NUM_CNTS,
+};
+
+union ipf_addr {
+ ovs_be32 ipv4;
+ struct in6_addr ipv6;
+};
+
+/* Represents a single fragment; part of a list of fragments. */
+struct ipf_frag {
+ struct dp_packet *pkt;
+ uint16_t start_data_byte;
+ uint16_t end_data_byte;
+ bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */
+};
+
+/* The key for a collection of fragments potentially making up an unfragmented
+ * packet. */
+struct ipf_list_key {
+ /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first
+ * two members. */
+ union ipf_addr src_addr;
+ union ipf_addr dst_addr;
+ uint32_t recirc_id;
+ ovs_be32 ip_id; /* V6 is 32 bits. */
+ ovs_be16 dl_type;
+ uint16_t zone;
+ uint8_t nw_proto;
+};
+
+/* A collection of fragments potentially making up an unfragmented packet. */
+struct ipf_list {
+ struct hmap_node node; /* In struct ipf's 'frag_lists'. */
+ struct ovs_list list_node; /* In struct ipf's 'frag_exp_list' or
+ * 'frag_complete_list'. */
+ struct ipf_frag *frag_list; /* List of fragments for this list. */
+ struct ipf_list_key key; /* The key for the fragment list. */
+ struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
+ long long expiration; /* In milliseconds. */
+ int last_sent_idx; /* Last sent fragment idx. */
+ int last_inuse_idx; /* Last inuse fragment idx. */
+ int size; /* Fragment list size. */
+ uint8_t state; /* Frag list state; see ipf_list_state. */
+};
+
+/* Represents a reassembled packet, which typically is passed through
+ * conntrack. */
+struct reassembled_pkt {
+ struct ovs_list rp_list_node; /* In struct ipf's
+ * 'reassembled_pkt_list'. */
+ struct dp_packet *pkt;
+ struct ipf_list *list;
+};
+
+struct ipf {
+ /* The clean thread is used to clean up fragments in the 'ipf'
+ * module if packet batches are no longer being sent through its user. */
+ pthread_t ipf_clean_thread;
+ struct latch ipf_clean_thread_exit;
+
+ int max_v4_frag_list_size;
+
+ struct ovs_mutex ipf_lock; /* Protects all of the following. */
+ /* These contain 'struct ipf_list's. */
+ struct hmap frag_lists OVS_GUARDED;
+ struct ovs_list frag_exp_list OVS_GUARDED;
+ struct ovs_list frag_complete_list OVS_GUARDED;
+ /* Contains 'struct reassembled_pkt's. */
+ struct ovs_list reassembled_pkt_list OVS_GUARDED;
+
+ /* Used to allow disabling fragmentation reassembly. */
+ atomic_bool ifp_v4_enabled;
+ atomic_bool ifp_v6_enabled;
+
+ /* Lower-bounded at 400 bytes; ipf_set_min_frag() rejects smaller values.
+ * The value chosen should handle alg control packets of interest that use
+ * string encoding of mutable IP fields; meaning, the control packets
+ * should not be fragmented. */
+ atomic_uint min_v4_frag_size;
+ atomic_uint min_v6_frag_size;
+
+ /* Configurable maximum allowable fragments in process. */
+ atomic_uint nfrag_max;
+
+ /* Number of fragments in process. */
+ atomic_count nfrag;
+
+ atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];
+ atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];
+};
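+
+/* A minimal usage sketch; 'batch', 'now', 'dl_type', 'zone' and 'hash_basis'
+ * below are placeholders supplied by the caller:
+ *
+ *     struct ipf *ipf = ipf_init();
+ *     ...
+ *     ipf_preprocess_conntrack(ipf, batch, now, dl_type, zone, hash_basis);
+ *     ... pass 'batch' through conntrack ...
+ *     ipf_postprocess_conntrack(ipf, batch, now, dl_type);
+ *     ...
+ *     ipf_destroy(ipf);
+ */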
+
+static void
+ipf_print_reass_packet(const char *es, const void *pkt)
+{
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+ if (!VLOG_DROP_WARN(&rl)) {
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ ds_put_hex_dump(&ds, pkt, 128, 0, false);
+ VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
+ ds_destroy(&ds);
+ }
+}
+
+static void
+ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
+{
+ atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
+}
+
+static bool
+ipf_get_v4_enabled(struct ipf *ipf)
+{
+ bool ifp_v4_enabled_;
+ atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
+ return ifp_v4_enabled_;
+}
+
+static bool
+ipf_get_v6_enabled(struct ipf *ipf)
+{
+ bool ifp_v6_enabled_;
+ atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
+ return ifp_v6_enabled_;
+}
+
+static bool
+ipf_get_enabled(struct ipf *ipf)
+{
+ return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
+}
+
+static uint32_t
+ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
+{
+ BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
+ return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
+}
+
+/* Adds a fragment list to the list that tracks expiry of not yet completed
+ * reassembled packets, which are hence subject to expiry. */
+static void
+ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
+ long long now)
+ /* OVS_REQUIRES(ipf->ipf_lock) */
+{
+ enum {
+ IPF_FRAG_LIST_TIMEOUT = 15000,
+ };
+
+ ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
+ ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
+}
+
+/* Adds a list of fragments to the list of completed packets, which will be
+ * subsequently transmitted. */
+static void
+ipf_completed_list_add(struct ovs_list *frag_complete_list,
+ struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
+}
+
+/* Adds a reassembled packet to the list of reassembled packets, awaiting
+ * some processing, such as being sent through conntrack. */
+static void
+ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
+ struct reassembled_pkt *rp)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
+}
+
+/* Removes a frag list from the tracking data structures and frees the
+ * list's heap memory. */
+static void
+ipf_list_clean(struct hmap *frag_lists,
+ struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ovs_list_remove(&ipf_list->list_node);
+ hmap_remove(frag_lists, &ipf_list->node);
+ free(ipf_list->frag_list);
+ free(ipf_list);
+}
+
+/* Removes a frag list sitting on the expiry list from the tracking data
+ * structures and frees the list's heap memory. */
+static void
+ipf_expiry_list_clean(struct hmap *frag_lists,
+ struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ipf_list_clean(frag_lists, ipf_list);
+}
+
+/* Removes a frag list sitting on the completed list from the tracking data
+ * structures and frees the list's heap memory. */
+static void
+ipf_completed_list_clean(struct hmap *frag_lists,
+ struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ipf_list_clean(frag_lists, ipf_list);
+}
+
+static void
+ipf_expiry_list_remove(struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ovs_list_remove(&ipf_list->list_node);
+}
+
+static void
+ipf_reassembled_list_remove(struct reassembled_pkt *rp)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ ovs_list_remove(&rp->rp_list_node);
+}
+
+/* Symmetric with respect to 'src_addr' and 'dst_addr': their hashes are
+ * combined with XOR, so swapping the two addresses yields the same hash. */
+static uint32_t
+ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
+{
+ uint32_t hsrc, hdst, hash;
+ hsrc = hdst = basis;
+ hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
+ hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
+ hash = hsrc ^ hdst;
+
+ /* Hash the rest of the key. */
+ return hash_words((uint32_t *) (&key->dst_addr + 1),
+ (uint32_t *) (key + 1) -
+ (uint32_t *) (&key->dst_addr + 1),
+ hash);
+}
+
+static bool
+ipf_is_first_v4_frag(const struct dp_packet *pkt)
+{
+ const struct ip_header *l3 = dp_packet_l3(pkt);
+ if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
+ l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
+ return true;
+ }
+ return false;
+}
+
+static bool
+ipf_is_last_v4_frag(const struct dp_packet *pkt)
+{
+ const struct ip_header *l3 = dp_packet_l3(pkt);
+ if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
+ !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
+ return true;
+ }
+ return false;
+}
+
+static bool
+ipf_is_v6_frag(ovs_be16 ip6f_offlg)
+{
+ if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
+ return true;
+ }
+ return false;
+}
+
+static bool
+ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
+{
+ if (!(ip6f_offlg & IP6F_OFF_MASK) &&
+ ip6f_offlg & IP6F_MORE_FRAG) {
+ return true;
+ }
+ return false;
+}
+
+static bool
+ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
+{
+ if ((ip6f_offlg & IP6F_OFF_MASK) &&
+ !(ip6f_offlg & IP6F_MORE_FRAG)) {
+ return true;
+ }
+ return false;
+}
+
+/* Checks whether the collection of fragments for a packet is complete, that
+ * is, whether the sorted fragments cover a contiguous range of data bytes. */
+static bool
+ipf_list_complete(const struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
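+ /* Sorted fragments are contiguous when each fragment starts exactly one
+ * byte after the previous one ends, e.g. [0,1479] followed by [1480,2959]
+ * (1479 + 1 == 1480). */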
+ for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
+ if (ipf_list->frag_list[i - 1].end_data_byte + 1
+ != ipf_list->frag_list[i].start_data_byte) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Simple insertion sort; runs in O(n) for a sorted or almost sorted list. */
+static void
+ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ for (int li = 1; li <= last_idx; li++) {
+ struct ipf_frag ipf_frag = frag_list[li];
+ int ci = li - 1;
+ while (ci >= 0 &&
+ frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
+ frag_list[ci + 1] = frag_list[ci];
+ ci--;
+ }
+ frag_list[ci + 1] = ipf_frag;
+ }
+}
+
+/* Called on a sorted complete list of v4 fragments to reassemble them into
+ * a single packet that can be processed, such as passing through conntrack.
+ */
+static struct dp_packet *
+ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ struct ipf_frag *frag_list = ipf_list->frag_list;
+ struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
+ struct ip_header *l3 = dp_packet_l3(pkt);
+ int len = ntohs(l3->ip_tot_len);
+
+ int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
+ frag_list[1].start_data_byte + 1;
+
+ if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
+ ipf_print_reass_packet(
+ "Unsupported big reassembled v4 packet; v4 hdr:", l3);
+ dp_packet_delete(pkt);
+ return NULL;
+ }
+
+ dp_packet_prealloc_tailroom(pkt, len + rest_len);
+
+ for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
+ size_t add_len = frag_list[i].end_data_byte -
+ frag_list[i].start_data_byte + 1;
+ len += add_len;
+ const char *l4 = dp_packet_l4(frag_list[i].pkt);
+ dp_packet_put(pkt, l4, add_len);
+ }
+ l3 = dp_packet_l3(pkt);
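+ /* The reassembled packet is no longer a fragment: clear the 'more
+ * fragments' flag and set the new total length, updating the IPv4 header
+ * checksum incrementally for both changed fields. */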
+ ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
+ l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
+ new_ip_frag_off);
+ l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
+ l3->ip_tot_len = htons(len);
+ l3->ip_frag_off = new_ip_frag_off;
+ dp_packet_set_l2_pad_size(pkt, 0);
+
+ return pkt;
+}
+
+/* Called on a sorted complete list of v6 fragments to reassemble them into
+ * a single packet that can be processed, such as passing through conntrack.
+ */
+static struct dp_packet *
+ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ struct ipf_frag *frag_list = ipf_list->frag_list;
+ struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
+ struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
+ int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);
+
+ int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
+ frag_list[1].start_data_byte + 1;
+
+ if (pl + rest_len > IPV6_PACKET_MAX_DATA) {
+ ipf_print_reass_packet(
+ "Unsupported big reassembled v6 packet; v6 hdr:", l3);
+ dp_packet_delete(pkt);
+ return NULL;
+ }
+
+ dp_packet_prealloc_tailroom(pkt, pl + rest_len);
+
+ for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
+ size_t add_len = frag_list[i].end_data_byte -
+ frag_list[i].start_data_byte + 1;
+ pl += add_len;
+ const char *l4 = dp_packet_l4(frag_list[i].pkt);
+ dp_packet_put(pkt, l4, add_len);
+ }
+
+ l3 = dp_packet_l3(pkt);
+
+ uint8_t nw_proto = l3->ip6_nxt;
+ uint8_t nw_frag = 0;
+ const void *data = l3 + 1;
+ size_t datasize = pl;
+
+ const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
+ if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
+ || !nw_frag || !frag_hdr) {
+
+ ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
+ dp_packet_delete(pkt);
+ return NULL;
+ }
+
+ struct ovs_16aligned_ip6_frag *fh =
+ CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
+ fh->ip6f_offlg = 0;
+ l3->ip6_plen = htons(pl);
+ l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
+ dp_packet_set_l2_pad_size(pkt, 0);
+ return pkt;
+}
+
+/* Handles a frag list state transition; this is triggered by a new fragment
+ * for the list being received. */
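+/* For example, if a middle fragment arrives first, then the first fragment,
+ * then the last one, the list moves through the OTHER_SEEN, FIRST_SEEN and
+ * FIRST_LAST_SEEN states; once the sorted list is complete, reassembly is
+ * attempted and the state becomes COMPLETED or REASS_FAIL. */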
+static void
+ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
+ bool ff, bool lf, bool v6)
+ OVS_REQUIRES(ipf->ipf_lock)
+{
+ enum ipf_list_state curr_state = ipf_list->state;
+ enum ipf_list_state next_state;
+ switch (curr_state) {
+ case IPF_LIST_STATE_UNUSED:
+ case IPF_LIST_STATE_OTHER_SEEN:
+ if (ff) {
+ next_state = IPF_LIST_STATE_FIRST_SEEN;
+ } else if (lf) {
+ next_state = IPF_LIST_STATE_LAST_SEEN;
+ } else {
+ next_state = IPF_LIST_STATE_OTHER_SEEN;
+ }
+ break;
+ case IPF_LIST_STATE_FIRST_SEEN:
+ if (ff) {
+ next_state = IPF_LIST_STATE_FIRST_SEEN;
+ } else if (lf) {
+ next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
+ } else {
+ next_state = IPF_LIST_STATE_FIRST_SEEN;
+ }
+ break;
+ case IPF_LIST_STATE_LAST_SEEN:
+ if (ff) {
+ next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
+ } else if (lf) {
+ next_state = IPF_LIST_STATE_LAST_SEEN;
+ } else {
+ next_state = IPF_LIST_STATE_LAST_SEEN;
+ }
+ break;
+ case IPF_LIST_STATE_FIRST_LAST_SEEN:
+ next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
+ break;
+ case IPF_LIST_STATE_COMPLETED:
+ case IPF_LIST_STATE_REASS_FAIL:
+ case IPF_LIST_STATE_NUM:
+ default:
+ OVS_NOT_REACHED();
+ }
+
+ if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
+ ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
+ if (ipf_list_complete(ipf_list)) {
+ struct dp_packet *reass_pkt = v6
+ ? ipf_reassemble_v6_frags(ipf_list)
+ : ipf_reassemble_v4_frags(ipf_list);
+ if (reass_pkt) {
+ struct reassembled_pkt *rp = xzalloc(sizeof *rp);
+ rp->pkt = reass_pkt;
+ rp->list = ipf_list;
+ ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
+ ipf_expiry_list_remove(ipf_list);
+ next_state = IPF_LIST_STATE_COMPLETED;
+ } else {
+ next_state = IPF_LIST_STATE_REASS_FAIL;
+ }
+ }
+ }
+ ipf_list->state = next_state;
+}
+
+/* Some sanity checks are redundant, but prudent, in case code paths for
+ * fragments change in future. The processing cost for fragments is not
+ * important. */
+static bool
+ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
+{
+ if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
+ goto invalid_pkt;
+ }
+
+ const struct eth_header *l2 = dp_packet_eth(pkt);
+ const struct ip_header *l3 = dp_packet_l3(pkt);
+
+ if (OVS_UNLIKELY(!l2 || !l3)) {
+ goto invalid_pkt;
+ }
+
+ size_t l3_size = dp_packet_l3_size(pkt);
+ if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
+ goto invalid_pkt;
+ }
+
+ if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
+ return false;
+ }
+
+ uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
+ if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
+ goto invalid_pkt;
+ }
+
+ size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
+ if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
+ goto invalid_pkt;
+ }
+ if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
+ goto invalid_pkt;
+ }
+
+ if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
+ && csum(l3, ip_hdr_len) != 0)) {
+ goto invalid_pkt;
+ }
+
+ uint32_t min_v4_frag_size_;
+ atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
+ bool lf = ipf_is_last_v4_frag(pkt);
+ if (OVS_UNLIKELY(!lf && dp_packet_size(pkt) < min_v4_frag_size_)) {
+ ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
+ goto invalid_pkt;
+ }
+ return true;
+
+invalid_pkt:
+ pkt->md.ct_state = CS_INVALID;
+ return false;
+}
+
+static bool
+ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
+ struct ipf_list_key *key, uint16_t *start_data_byte,
+ uint16_t *end_data_byte, bool *ff, bool *lf)
+{
+ const struct ip_header *l3 = dp_packet_l3(pkt);
+ uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
+ size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
+
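+ /* The IPv4 fragment offset field counts 8 byte units, so, for instance, an
+ * offset field of 185 in a fragment with a 20 byte header and a total
+ * length of 1500 yields data bytes 1480 through 2959 inclusive. */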
+ *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
+ *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
+ *ff = ipf_is_first_v4_frag(pkt);
+ *lf = ipf_is_last_v4_frag(pkt);
+ memset(key, 0, sizeof *key);
+ key->ip_id = be16_to_be32(l3->ip_id);
+ key->dl_type = dl_type;
+ key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
+ key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
+ key->nw_proto = l3->ip_proto;
+ key->zone = zone;
+ key->recirc_id = pkt->md.recirc_id;
+ return true;
+}
+
+/* Some sanity checks are redundant, but prudent, in case code paths for
+ * fragments change in future. The processing cost for fragments is not
+ * important. */
+static bool
+ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
+{
+ const struct eth_header *l2 = dp_packet_eth(pkt);
+ const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
+ const char *l4 = dp_packet_l4(pkt);
+
+ if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
+ goto invalid_pkt;
+ }
+
+ size_t l3_size = dp_packet_l3_size(pkt);
+ size_t l3_hdr_size = sizeof *l3;
+
+ if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
+ goto invalid_pkt;
+ }
+
+ uint8_t nw_frag = 0;
+ uint8_t nw_proto = l3->ip6_nxt;
+ const void *data = l3 + 1;
+ size_t datasize = l3_size - l3_hdr_size;
+ const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
+ if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
+ &frag_hdr) || !nw_frag || !frag_hdr) {
+ return false;
+ }
+
+ int pl = ntohs(l3->ip6_plen);
+ if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
+ goto invalid_pkt;
+ }
+
+ ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
+ if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
+ return false;
+ }
+
+ uint32_t min_v6_frag_size_;
+ atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
+ bool lf = ipf_is_last_v6_frag(ip6f_offlg);
+
+ if (OVS_UNLIKELY(!lf && dp_packet_size(pkt) < min_v6_frag_size_)) {
+ ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
+ goto invalid_pkt;
+ }
+
+ return true;
+
+invalid_pkt:
+ pkt->md.ct_state = CS_INVALID;
+ return false;
+
+}
+
+static void
+ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
+ struct ipf_list_key *key, uint16_t *start_data_byte,
+ uint16_t *end_data_byte, bool *ff, bool *lf)
+{
+ const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
+ const char *l4 = dp_packet_l4(pkt);
+ const char *tail = dp_packet_tail(pkt);
+ uint8_t pad = dp_packet_l2_pad_size(pkt);
+ size_t l3_size = tail - (char *)l3 - pad;
+ size_t l4_size = tail - (char *)l4 - pad;
+ size_t l3_hdr_size = sizeof *l3;
+ uint8_t nw_frag = 0;
+ uint8_t nw_proto = l3->ip6_nxt;
+ const void *data = l3 + 1;
+ size_t datasize = l3_size - l3_hdr_size;
+ const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
+
+ parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
+ ovs_assert(nw_frag && frag_hdr);
+ ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
+ *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
+ sizeof (struct ovs_16aligned_ip6_frag);
+ *end_data_byte = *start_data_byte + l4_size - 1;
+ *ff = ipf_is_first_v6_frag(ip6f_offlg);
+ *lf = ipf_is_last_v6_frag(ip6f_offlg);
+ memset(key, 0, sizeof *key);
+ key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
+ key->dl_type = dl_type;
+ memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
+ /* We are not supporting parsing of the routing header to use as the
+ * dst address part of the key. */
+ memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
+ key->nw_proto = 0; /* Not used for key for V6. */
+ key->zone = zone;
+ key->recirc_id = pkt->md.recirc_id;
+}
+
+static bool
+ipf_list_key_eq(const struct ipf_list_key *key1,
+ const struct ipf_list_key *key2)
+ /* OVS_REQUIRES(ipf_lock) */
+{
+ if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
+ !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
+ key1->dl_type == key2->dl_type &&
+ key1->ip_id == key2->ip_id &&
+ key1->zone == key2->zone &&
+ key1->nw_proto == key2->nw_proto &&
+ key1->recirc_id == key2->recirc_id) {
+ return true;
+ }
+ return false;
+}
+
+static struct ipf_list *
+ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
+ uint32_t hash)
+ /* OVS_REQUIRES(ipf->ipf_lock) */
+{
+ struct ipf_list *ipf_list;
+ HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
+ if (ipf_list_key_eq(&ipf_list->key, key)) {
+ return ipf_list;
+ }
+ }
+ return NULL;
+}
+
+static bool
+ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
+ size_t start_data_byte, size_t end_data_byte)
+ /* OVS_REQUIRES(ipf_lock) */
+{
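+ /* A new fragment counts as a duplicate if either of its end points falls
+ * within an already accepted fragment, e.g. a fragment covering bytes
+ * [1000,2479] overlaps an existing [0,1479] because byte 1000 is already
+ * covered. */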
+ for (int i = 0; i <= last_inuse_idx; i++) {
+ if ((start_data_byte >= frag_list[i].start_data_byte &&
+ start_data_byte <= frag_list[i].end_data_byte) ||
+ (end_data_byte >= frag_list[i].start_data_byte &&
+ end_data_byte <= frag_list[i].end_data_byte)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/* Adds a fragment to a list of fragments, if the fragment is not a
+ * duplicate. If the fragment is a duplicate, it is marked invalid here,
+ * saving conntrack the work of marking it invalid, which it would do in
+ * all cases. */
+static bool
+ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
+ struct dp_packet *pkt, uint16_t start_data_byte,
+ uint16_t end_data_byte, bool ff, bool lf, bool v6,
+ bool dnsteal)
+ OVS_REQUIRES(ipf->ipf_lock)
+{
+ bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
+ ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
+ int last_inuse_idx = ipf_list->last_inuse_idx;
+
+ if (!duped_frag) {
+ if (last_inuse_idx < ipf_list->size - 1) {
+ /* In the case of DPDK, it would be unfortunate if we had to clone
+ * a fragment outside the DPDK mempool because the mempool size was
+ * too limited. To avoid this, we recommend not setting the mempool
+ * number of buffers too low and we also clamp the number of
+ * fragments. */
+ struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
+ frag->pkt = pkt;
+ frag->start_data_byte = start_data_byte;
+ frag->end_data_byte = end_data_byte;
+ frag->dnsteal = dnsteal;
+ ipf_list->last_inuse_idx++;
+ atomic_count_inc(&ipf->nfrag);
+ ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
+ ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
+ } else {
+ OVS_NOT_REACHED();
+ }
+ } else {
+ ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
+ pkt->md.ct_state = CS_INVALID;
+ return false;
+ }
+ return true;
+}
+
+static void
+ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
+ int max_frag_list_size)
+{
+ ipf_list->key = *key;
+ ipf_list->last_inuse_idx = IPF_INVALID_IDX;
+ ipf_list->last_sent_idx = IPF_INVALID_IDX;
+ ipf_list->reass_execute_ctx = NULL;
+ ipf_list->state = IPF_LIST_STATE_UNUSED;
+ ipf_list->size = max_frag_list_size;
+ ipf_list->frag_list
+ = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
+}
+
+/* Generates a fragment list key from a well formed fragment and either starts
+ * a new fragment list or grows the existing fragment list, while checking
+ * that the maximum number of supported fragments is not exceeded and that the
+ * list size is not impossibly big. Calls 'ipf_process_frag()' to add the
+ * fragment to a list of fragments. */
+static bool
+ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
+ uint16_t zone, long long now, uint32_t hash_basis,
+ bool dnsteal)
+ OVS_REQUIRES(ipf->ipf_lock)
+{
+ struct ipf_list_key key;
+ /* Initialize 4 variables for some versions of GCC. */
+ uint16_t start_data_byte = 0;
+ uint16_t end_data_byte = 0;
+ bool ff = false;
+ bool lf = false;
+ bool v6 = dl_type == htons(ETH_TYPE_IPV6);
+
+ if (v6 && ipf_get_v6_enabled(ipf)) {
+ ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
+ &end_data_byte, &ff, &lf);
+ } else if (!v6 && ipf_get_v4_enabled(ipf)) {
+ ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
+ &end_data_byte, &ff, &lf);
+ } else {
+ return false;
+ }
+
+ unsigned int nfrag_max;
+ atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
+ if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
+ return false;
+ }
+
+ uint32_t hash = ipf_list_key_hash(&key, hash_basis);
+ struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
+ enum {
+ IPF_FRAG_LIST_MIN_INCREMENT = 4,
+ IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
+ };
+
+ int max_frag_list_size;
+ if (v6) {
+ /* Because the calculation with extension headers is variable,
+ * we don't calculate a hard maximum fragment list size upfront. The
+ * fragment list size is practically limited by the code, however. */
+ max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
+ } else {
+ max_frag_list_size = ipf->max_v4_frag_list_size;
+ }
+
+ if (!ipf_list) {
+ ipf_list = xmalloc(sizeof *ipf_list);
+ ipf_list_init(ipf_list, &key,
+ MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
+ hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
+ ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
+ } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL) {
+ /* Bail out as early as possible. */
+ return false;
+ } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
+ int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
+ max_frag_list_size - ipf_list->size);
+ /* Enforce limit. */
+ if (increment > 0) {
+ ipf_list->frag_list =
+ xrealloc(ipf_list->frag_list, (ipf_list->size + increment) *
+ sizeof *ipf_list->frag_list);
+ ipf_list->size += increment;
+ } else {
+ return false;
+ }
+ }
+
+ return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
+ end_data_byte, ff, lf, v6, dnsteal);
+}
+
+/* Filters out fragments from a batch of fragments and adjusts the batch. */
+static void
+ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
+ ovs_be16 dl_type, uint16_t zone, long long now,
+ uint32_t hash_basis)
+{
+ const size_t pb_cnt = dp_packet_batch_size(pb);
+ int pb_idx; /* Index in a packet batch. */
+ struct dp_packet *pkt;
+
+ DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
+ if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
+ ipf_is_valid_v4_frag(ipf, pkt))
+ ||
+ (dl_type == htons(ETH_TYPE_IPV6) &&
+ ipf_is_valid_v6_frag(ipf, pkt)))) {
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+ if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
+ pb->do_not_steal)) {
+ dp_packet_batch_refill(pb, pkt, pb_idx);
+ }
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ } else {
+ dp_packet_batch_refill(pb, pkt, pb_idx);
+ }
+ }
+}
+
+/* In the case of DPDK, a memory source check is done, as DPDK memory pool
+ * management has trouble dealing with multiple source types. The
+ * 'check_source' parameter is used to indicate when this check is needed. */
+static bool
+ipf_dp_packet_batch_add(struct dp_packet_batch *pb, struct dp_packet *pkt,
+ bool check_source OVS_UNUSED)
+{
+#ifdef DPDK_NETDEV
+ if ((dp_packet_batch_is_full(pb)) ||
+ /* DPDK cannot handle multiple sources in a batch. */
+ (check_source && !dp_packet_batch_is_empty(pb)
+ && pb->packets[0]->source != pkt->source)) {
+#else
+ if (dp_packet_batch_is_full(pb)) {
+#endif
+ return false;
+ }
+
+ dp_packet_batch_add(pb, pkt);
+ return true;
+}
+
+/* This would be used in rare cases where a list cannot be sent. One rare
+ * reason known right now is a mempool source check, which exists due to DPDK
+ * support, where packets are no longer being received on any port with a
+ * source matching the fragment. Another reason is a race where all
+ * conntrack rules are unconfigured when some fragments are yet to be
+ * flushed.
+ *
+ * Returns true if the list was purged. */
+static bool
+ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
+ long long now)
+ OVS_REQUIRES(ipf->ipf_lock)
+{
+ enum {
+ IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
+ };
+
+ if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
+ return false;
+ }
+
+ while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
+ struct dp_packet * pkt
+ = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
+ dp_packet_delete(pkt);
+ atomic_count_dec(&ipf->nfrag);
+ COVERAGE_INC(ipf_stuck_frag_list_purged);
+ ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
+ IPF_NFRAGS_PURGED);
+ ipf_list->last_sent_idx++;
+ }
+
+ return true;
+}
+
+/* Does the packet batch management and common accounting work associated
+ * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
+static bool
+ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
+ struct dp_packet_batch *pb,
+ enum ipf_list_type list_type, bool v6, long long now)
+ OVS_REQUIRES(ipf->ipf_lock)
+{
+ if (ipf_purge_list_check(ipf, ipf_list, now)) {
+ return true;
+ }
+
+ while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
+ struct dp_packet *pkt
+ = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
+ if (ipf_dp_packet_batch_add(pb, pkt, true)) {
+ ipf_list->last_sent_idx++;
+ atomic_count_dec(&ipf->nfrag);
+
+ if (list_type == IPF_FRAG_COMPLETED_LIST) {
+ ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
+ } else {
+ ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
+ pkt->md.ct_state = CS_INVALID;
+ }
+
+ if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
+ return true;
+ }
+ } else {
+ return false;
+ }
+ }
+ OVS_NOT_REACHED();
+}
+
+/* Adds fragments associated with a completed fragment list to a packet batch
+ * to be processed by the calling application, typically conntrack. Also
+ * cleans up the list context when it is empty. */
+static void
+ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
+ long long now, bool v6)
+{
+ if (ovs_list_is_empty(&ipf->frag_complete_list)) {
+ return;
+ }
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+ struct ipf_list *ipf_list, *next;
+
+ LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
+ if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
+ v6, now)) {
+ ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
+ } else {
+ break;
+ }
+ }
+
+ ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Conservatively adds fragments associated with an expired fragment list to
+ * a packet batch to be processed by the calling application, typically
+ * conntrack. Also cleans up the list context when it is empty. */
+static void
+ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
+ long long now, bool v6)
+{
+ enum {
+ /* Very conservative, due to DOS probability. */
+ IPF_FRAG_LIST_MAX_EXPIRED = 1,
+ };
+
+ if (ovs_list_is_empty(&ipf->frag_exp_list)) {
+ return;
+ }
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+ struct ipf_list *ipf_list, *next;
+ size_t lists_removed = 0;
+
+ LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
+ if (now <= ipf_list->expiration ||
+ lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
+ break;
+ }
+
+ if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
+ v6, now)) {
+ ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
+ lists_removed++;
+ } else {
+ break;
+ }
+ }
+
+ ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Adds a reassembled packet to a packet batch to be processed by the caller.
+ */
+static void
+ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
+{
+ if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
+ return;
+ }
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+ struct reassembled_pkt *rp, *next;
+
+ LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
+ if (!rp->list->reass_execute_ctx &&
+ ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
+ rp->list->reass_execute_ctx = rp->pkt;
+ }
+ }
+
+ ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Checks for reassembled packets after they have been processed by
+ * conntrack and edits the fragments if needed, based on what conntrack
+ * decided. */
+static void
+ipf_post_execute_reass_pkts(struct ipf *ipf,
+ struct dp_packet_batch *pb, bool v6)
+{
+ if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
+ return;
+ }
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+ struct reassembled_pkt *rp, *next;
+
+ LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
+ const size_t pb_cnt = dp_packet_batch_size(pb);
+ int pb_idx;
+ struct dp_packet *pkt;
+ /* Inner batch loop is constant time since batch size is <=
+ * NETDEV_MAX_BURST. */
+ DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
+ if (pkt == rp->list->reass_execute_ctx) {
+ for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
+ rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
+ rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
+ rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
+ rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
+ rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
+ pkt->md.ct_orig_tuple_ipv6;
+ if (pkt->md.ct_orig_tuple_ipv6) {
+ rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
+ pkt->md.ct_orig_tuple.ipv6;
+ } else {
+ rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
+ pkt->md.ct_orig_tuple.ipv4;
+ }
+ }
+
+ const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
+ const char *tail_frag = dp_packet_tail(frag_0->pkt);
+ uint8_t pad_frag = dp_packet_l2_pad_size(frag_0->pkt);
+ void *l4_frag = dp_packet_l4(frag_0->pkt);
+ void *l4_reass = dp_packet_l4(pkt);
+ memcpy(l4_frag, l4_reass,
+ tail_frag - (char *) l4_frag - pad_frag);
+
+ if (v6) {
+ struct ovs_16aligned_ip6_hdr *l3_frag
+ = dp_packet_l3(frag_0->pkt);
+ struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
+ l3_frag->ip6_src = l3_reass->ip6_src;
+ l3_frag->ip6_dst = l3_reass->ip6_dst;
+ } else {
+ struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
+ struct ip_header *l3_reass = dp_packet_l3(pkt);
+ ovs_be32 reass_ip = get_16aligned_be32(&l3_reass->ip_src);
+ ovs_be32 frag_ip = get_16aligned_be32(&l3_frag->ip_src);
+ l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
+ frag_ip, reass_ip);
+ l3_frag->ip_src = l3_reass->ip_src;
+
+ reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
+ frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
+ l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
+ frag_ip, reass_ip);
+ l3_frag->ip_dst = l3_reass->ip_dst;
+ }
+
+ ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
+ ipf_reassembled_list_remove(rp);
+ dp_packet_delete(rp->pkt);
+ free(rp);
+ } else {
+ dp_packet_batch_refill(pb, pkt, pb_idx);
+ }
+ }
+ }
+
+ ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Extracts any fragments from the batch and reassembles them when a
+ * complete packet is received. Completed packets are added to the batch,
+ * when possible, to be sent through conntrack. */
+void
+ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
+ long long now, ovs_be16 dl_type, uint16_t zone,
+ uint32_t hash_basis)
+{
+ if (ipf_get_enabled(ipf)) {
+ ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
+ }
+
+ if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
+ ipf_execute_reass_pkts(ipf, pb);
+ }
+}
+
+/* Updates fragments based on the processing of the reassembled packet sent
+ * through conntrack and adds these fragments to any batches seen. Expired
+ * fragments are marked as invalid and also added to the batches seen
+ * with low priority. Reassembled packets are freed. */
+void
+ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
+ long long now, ovs_be16 dl_type)
+{
+ if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
+ bool v6 = dl_type == htons(ETH_TYPE_IPV6);
+ ipf_post_execute_reass_pkts(ipf, pb, v6);
+ ipf_send_completed_frags(ipf, pb, now, v6);
+ ipf_send_expired_frags(ipf, pb, now, v6);
+ }
+}
+
+static void *
+ipf_clean_thread_main(void *f)
+{
+ struct ipf *ipf = f;
+
+ enum {
+ IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
+ };
+
+ while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {
+
+ long long now = time_msec();
+
+ if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
+ !ovs_list_is_empty(&ipf->frag_complete_list)) {
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+
+ struct ipf_list *ipf_list, *next;
+ LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
+ &ipf->frag_exp_list) {
+ if (ipf_purge_list_check(ipf, ipf_list, now)) {
+ ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
+ }
+ }
+
+ LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
+ &ipf->frag_complete_list) {
+ if (ipf_purge_list_check(ipf, ipf_list, now)) {
+ ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
+ }
+ }
+
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ }
+
+ poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
+ latch_wait(&ipf->ipf_clean_thread_exit);
+ poll_block();
+ }
+
+ return NULL;
+}
+
+struct ipf *
+ipf_init(void)
+{
+ struct ipf *ipf = xzalloc(sizeof *ipf);
+
+ ovs_mutex_init_adaptive(&ipf->ipf_lock);
+ ovs_mutex_lock(&ipf->ipf_lock);
+ hmap_init(&ipf->frag_lists);
+ ovs_list_init(&ipf->frag_exp_list);
+ ovs_list_init(&ipf->frag_complete_list);
+ ovs_list_init(&ipf->reassembled_pkt_list);
+ atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
+ atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
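+ /* With the defaults above, the following works out to
+ * DIV_ROUND_UP(65535 - 60, 1200 - 60) == 58, an upper bound on how many
+ * fragments a maximum sized v4 packet can span when each fragment is at
+ * least 'min_v4_frag_size' bytes. */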
+ ipf->max_v4_frag_list_size = DIV_ROUND_UP(
+ IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
+ ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ atomic_count_init(&ipf->nfrag, 0);
+ for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
+ atomic_init(&ipf->n4frag_cnt[i], 0);
+ atomic_init(&ipf->n6frag_cnt[i], 0);
+ }
+ atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
+ atomic_init(&ipf->ifp_v4_enabled, true);
+ atomic_init(&ipf->ifp_v6_enabled, true);
+ latch_init(&ipf->ipf_clean_thread_exit);
+ ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
+ ipf_clean_thread_main, ipf);
+
+ return ipf;
+}
+
+void
+ipf_destroy(struct ipf *ipf)
+{
+ ovs_mutex_lock(&ipf->ipf_lock);
+ latch_set(&ipf->ipf_clean_thread_exit);
+ pthread_join(ipf->ipf_clean_thread, NULL);
+ latch_destroy(&ipf->ipf_clean_thread_exit);
+
+ struct ipf_list *ipf_list;
+ HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
+ while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
+ struct dp_packet *pkt
+ = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
+ if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
+ dp_packet_delete(pkt);
+ }
+ atomic_count_dec(&ipf->nfrag);
+ ipf_list->last_sent_idx++;
+ }
+ free(ipf_list->frag_list);
+ free(ipf_list);
+ }
+
+ if (atomic_count_get(&ipf->nfrag)) {
+ VLOG_WARN("ipf destroy with non-zero fragment count. ");
+ }
+
+ struct reassembled_pkt *rp;
+ LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
+ dp_packet_delete(rp->pkt);
+ free(rp);
+ }
+
+ hmap_destroy(&ipf->frag_lists);
+ ovs_list_poison(&ipf->frag_exp_list);
+ ovs_list_poison(&ipf->frag_complete_list);
+ ovs_list_poison(&ipf->reassembled_pkt_list);
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ ovs_mutex_destroy(&ipf->ipf_lock);
+ free(ipf);
+}
+
+int
+ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
+{
+ atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
+ enable);
+ return 0;
+}
+
+int
+ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
+{
+ /* If the user specifies an unreasonably large number, fragmentation
+ * will not work well but it will not blow up. */
+ if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) {
+ return 1;
+ }
+
+ ovs_mutex_lock(&ipf->ipf_lock);
+ if (v6) {
+ atomic_store_relaxed(&ipf->min_v6_frag_size, value);
+ } else {
+ atomic_store_relaxed(&ipf->min_v4_frag_size, value);
+ ipf->max_v4_frag_list_size = DIV_ROUND_UP(
+ IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
+ ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
+ }
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ return 0;
+}
+
+int
+ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
+{
+ if (value > IPF_NFRAG_UBOUND) {
+ return 1;
+ }
+ atomic_store_relaxed(&ipf->nfrag_max, value);
+ return 0;
+}
+
+int
+ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
+{
+ ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
+ atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);
+
+ atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
+ atomic_read_relaxed(&ipf->min_v4_frag_size,
+ &ipf_status->v4.min_frag_size);
+ atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
+ &ipf_status->v4.nfrag_accepted);
+ atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
+ &ipf_status->v4.nfrag_completed_sent);
+ atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
+ &ipf_status->v4.nfrag_expired_sent);
+ atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
+ &ipf_status->v4.nfrag_too_small);
+ atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
+ &ipf_status->v4.nfrag_overlap);
+ atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
+ &ipf_status->v4.nfrag_purged);
+
+ atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
+ atomic_read_relaxed(&ipf->min_v6_frag_size,
+ &ipf_status->v6.min_frag_size);
+ atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
+ &ipf_status->v6.nfrag_accepted);
+ atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
+ &ipf_status->v6.nfrag_completed_sent);
+ atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
+ &ipf_status->v6.nfrag_expired_sent);
+ atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
+ &ipf_status->v6.nfrag_too_small);
+ atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
+ &ipf_status->v6.nfrag_overlap);
+ atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
+ &ipf_status->v6.nfrag_purged);
+ return 0;
+}
+
+struct ipf_dump_ctx {
+ struct hmap_position bucket_pos;
+};
+
+/* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
+ * caller must call ipf_dump_done() when dumping is finished. */
+int
+ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
+{
+ *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
+ return 0;
+}
+
+/* Creates a string representation of the state of an 'ipf_list' and puts
+ * it in 'ds'. */
+static void
+ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
+{
+ ds_put_cstr(ds, "(");
+ if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
+ ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
+ IP_ARGS(ipf_list->key.src_addr.ipv4),
+ IP_ARGS(ipf_list->key.dst_addr.ipv4));
+ } else {
+ ds_put_cstr(ds, "src=");
+ ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
+ ds_put_cstr(ds, ",dst=");
+ ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
+ ds_put_cstr(ds, ",");
+ }
+
+ ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
+ ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
+ ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
+ ipf_list->key.nw_proto);
+
+ ds_put_format(ds, ",num_fragments=%u,state=%s",
+ ipf_list->last_inuse_idx + 1,
+ ipf_state_name[ipf_list->state]);
+
+ ds_put_cstr(ds, ")");
+}
+
+/* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
+ * ipf_dump_create() to create a string representation of the state of that
+ * ipf list, pointing 'dump' at it. Returns EOF when there are no more ipf
+ * lists. */
+int
+ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
+{
+ ovs_mutex_lock(&ipf->ipf_lock);
+
+ struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
+ &ipf_dump_ctx->bucket_pos);
+ if (!node) {
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ return EOF;
+ } else {
+ struct ipf_list *ipf_list_;
+ INIT_CONTAINER(ipf_list_, node, node);
+ struct ipf_list ipf_list = *ipf_list_;
+ ovs_mutex_unlock(&ipf->ipf_lock);
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ ipf_dump_create(&ipf_list, &ds);
+ *dump = ds_steal_cstr(&ds);
+ return 0;
+ }
+}
+
+/* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
+int
+ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
+{
+ free(ipf_dump_ctx);
+ return 0;
+}