1 /*
2 * Copyright (c) 2019 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <sys/types.h>
21 #include <netinet/in.h>
22 #include <netinet/ip6.h>
23 #include <netinet/icmp6.h>
24 #include <string.h>
25
26 #include "coverage.h"
27 #include "csum.h"
28 #include "ipf.h"
29 #include "latch.h"
30 #include "openvswitch/hmap.h"
31 #include "openvswitch/poll-loop.h"
32 #include "openvswitch/vlog.h"
33 #include "ovs-atomic.h"
34 #include "packets.h"
35 #include "util.h"
36
37 VLOG_DEFINE_THIS_MODULE(ipf);
38 COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
39
40 enum {
41 IPV4_PACKET_MAX_HDR_SIZE = 60,
42 IPV4_PACKET_MAX_SIZE = 65535,
43 IPV6_PACKET_MAX_DATA = 65535,
44 };
45
46 enum ipf_list_state {
47 IPF_LIST_STATE_UNUSED,
48 IPF_LIST_STATE_REASS_FAIL,
49 IPF_LIST_STATE_OTHER_SEEN,
50 IPF_LIST_STATE_FIRST_SEEN,
51 IPF_LIST_STATE_LAST_SEEN,
52 IPF_LIST_STATE_FIRST_LAST_SEEN,
53 IPF_LIST_STATE_COMPLETED,
54 IPF_LIST_STATE_NUM,
55 };
56
57 static char *ipf_state_name[IPF_LIST_STATE_NUM] =
58 {"unused", "reassemble fail", "other frag", "first frag", "last frag",
59 "first/last frag", "complete"};
60
61 enum ipf_list_type {
62 IPF_FRAG_COMPLETED_LIST,
63 IPF_FRAG_EXPIRY_LIST,
64 };
65
66 enum {
67 IPF_INVALID_IDX = -1,
68 IPF_V4_FRAG_SIZE_LBOUND = 400,
69 IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
70 IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
71 IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
72 IPF_MAX_FRAGS_DEFAULT = 1000,
73 IPF_NFRAG_UBOUND = 5000,
74 };
75
76 enum ipf_counter_type {
77 IPF_NFRAGS_ACCEPTED,
78 IPF_NFRAGS_COMPL_SENT,
79 IPF_NFRAGS_EXPD_SENT,
80 IPF_NFRAGS_TOO_SMALL,
81 IPF_NFRAGS_OVERLAP,
82 IPF_NFRAGS_PURGED,
83 IPF_NFRAGS_NUM_CNTS,
84 };
85
86 union ipf_addr {
87 ovs_be32 ipv4;
88 struct in6_addr ipv6;
89 };
90
91 /* Represents a single fragment; part of a list of fragments. */
92 struct ipf_frag {
93 struct dp_packet *pkt;
94 uint16_t start_data_byte;
95 uint16_t end_data_byte;
96 bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */
97 };
98
99 /* The key for a collection of fragments potentially making up an unfragmented
100 * packet. */
101 struct ipf_list_key {
102 /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first
103 * two members. */
104 union ipf_addr src_addr;
105 union ipf_addr dst_addr;
106 uint32_t recirc_id;
107 ovs_be32 ip_id; /* V6 is 32 bits. */
108 ovs_be16 dl_type;
109 uint16_t zone;
110 uint8_t nw_proto;
111 };
112
113 /* A collection of fragments potentially making up an unfragmented packet. */
114 struct ipf_list {
115 struct hmap_node node; /* In struct ipf's 'frag_lists'. */
116 struct ovs_list list_node; /* In struct ipf's 'frag_exp_list' or
117 * 'frag_complete_list'. */
118 struct ipf_frag *frag_list; /* List of fragments for this list. */
119 struct ipf_list_key key; /* The key for the fragment list. */
120 struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
121 long long expiration; /* In milliseconds. */
122 int last_sent_idx; /* Last sent fragment idx. */
123 int last_inuse_idx; /* Last inuse fragment idx. */
124 int size; /* Fragment list size. */
125 uint8_t state; /* Frag list state; see ipf_list_state. */
126 };
127
128 /* Represents a reassembled packet which is typically passed through
129 * conntrack. */
130 struct reassembled_pkt {
131 struct ovs_list rp_list_node; /* In struct ipf's
132 * 'reassembled_pkt_list'. */
133 struct dp_packet *pkt;
134 struct ipf_list *list;
135 };
136
137 struct ipf {
138 /* The clean thread is used to clean up fragments in the 'ipf'
139 * module if packet batches are no longer sent through it by its user. */
140 pthread_t ipf_clean_thread;
141 struct latch ipf_clean_thread_exit;
142
143 int max_v4_frag_list_size;
144
145 struct ovs_mutex ipf_lock; /* Protects all of the following. */
146 /* These contain 'struct ipf_list's. */
147 struct hmap frag_lists OVS_GUARDED;
148 struct ovs_list frag_exp_list OVS_GUARDED;
149 struct ovs_list frag_complete_list OVS_GUARDED;
150 /* Contains 'struct reassembled_pkt's. */
151 struct ovs_list reassembled_pkt_list OVS_GUARDED;
152
153 /* Used to allow disabling fragmentation reassembly. */
154 atomic_bool ifp_v4_enabled;
155 atomic_bool ifp_v6_enabled;
156
157 /* Will be clamped above 400 bytes; the value chosen should handle
158 * alg control packets of interest that use string encoding of mutable
159 * IP fields; meaning, the control packets should not be fragmented. */
160 atomic_uint min_v4_frag_size;
161 atomic_uint min_v6_frag_size;
162
163 /* Configurable maximum allowable fragments in process. */
164 atomic_uint nfrag_max;
165
166 /* Number of fragments in process. */
167 atomic_count nfrag;
168
169 atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];
170 atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];
171 };
172
173 static void
174 ipf_print_reass_packet(const char *es, const void *pkt)
175 {
176 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
177 if (!VLOG_DROP_WARN(&rl)) {
178 struct ds ds = DS_EMPTY_INITIALIZER;
179 ds_put_hex_dump(&ds, pkt, 128, 0, false);
180 VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
181 ds_destroy(&ds);
182 }
183 }
184
185 static void
186 ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
187 {
188 atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
189 }
190
191 static bool
192 ipf_get_v4_enabled(struct ipf *ipf)
193 {
194 bool ifp_v4_enabled_;
195 atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
196 return ifp_v4_enabled_;
197 }
198
199 static bool
200 ipf_get_v6_enabled(struct ipf *ipf)
201 {
202 bool ifp_v6_enabled_;
203 atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
204 return ifp_v6_enabled_;
205 }
206
207 static bool
208 ipf_get_enabled(struct ipf *ipf)
209 {
210 return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
211 }
212
213 static uint32_t
214 ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
215 {
216 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
217 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
218 }
219
220 /* Adds a fragment list to the list tracking the expiry of not yet
221 * completed reassembled packets, which are hence subject to expiry. */
222 static void
223 ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
224 long long now)
225 /* OVS_REQUIRES(ipf->ipf_lock) */
226 {
227 enum {
228 IPF_FRAG_LIST_TIMEOUT = 15000,
229 };
230
231 ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
232 ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
233 }
234
235 /* Adds a list of fragments to the list of completed packets, which will be
236 * subsequently transmitted. */
237 static void
238 ipf_completed_list_add(struct ovs_list *frag_complete_list,
239 struct ipf_list *ipf_list)
240 /* OVS_REQUIRES(ipf_lock) */
241 {
242 ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
243 }
244
245 /* Adds a reassembled packet to the list of reassembled packets, awaiting some
246 * processing, such as being sent through conntrack. */
247 static void
248 ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
249 struct reassembled_pkt *rp)
250 /* OVS_REQUIRES(ipf_lock) */
251 {
252 ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
253 }
254
255 /* Removes a frag list from the tracking data structures and frees list heap
256 * memory. */
257 static void
258 ipf_list_clean(struct hmap *frag_lists,
259 struct ipf_list *ipf_list)
260 /* OVS_REQUIRES(ipf_lock) */
261 {
262 ovs_list_remove(&ipf_list->list_node);
263 hmap_remove(frag_lists, &ipf_list->node);
264 free(ipf_list->frag_list);
265 free(ipf_list);
266 }
267
268 /* Removes a frag list sitting on the expiry list from the tracking
269 * data structures and frees list heap memory. */
270 static void
271 ipf_expiry_list_clean(struct hmap *frag_lists,
272 struct ipf_list *ipf_list)
273 /* OVS_REQUIRES(ipf_lock) */
274 {
275 ipf_list_clean(frag_lists, ipf_list);
276 }
277
278 /* Removes a frag list sitting on the completed list from the tracking
279 * data structures and frees list heap memory. */
280 static void
281 ipf_completed_list_clean(struct hmap *frag_lists,
282 struct ipf_list *ipf_list)
283 /* OVS_REQUIRES(ipf_lock) */
284 {
285 ipf_list_clean(frag_lists, ipf_list);
286 }
287
288 static void
289 ipf_expiry_list_remove(struct ipf_list *ipf_list)
290 /* OVS_REQUIRES(ipf_lock) */
291 {
292 ovs_list_remove(&ipf_list->list_node);
293 }
294
295 static void
296 ipf_reassembled_list_remove(struct reassembled_pkt *rp)
297 /* OVS_REQUIRES(ipf_lock) */
298 {
299 ovs_list_remove(&rp->rp_list_node);
300 }
301
302 /* Symmetric in 'src_addr' and 'dst_addr'. */
303 static uint32_t
304 ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
305 {
306 uint32_t hsrc, hdst, hash;
307 hsrc = hdst = basis;
308 hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
309 hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
310 hash = hsrc ^ hdst;
311
312 /* Hash the rest of the key. */
313 return hash_words((uint32_t *) (&key->dst_addr + 1),
314 (uint32_t *) (key + 1) -
315 (uint32_t *) (&key->dst_addr + 1),
316 hash);
317 }
318
319 static bool
320 ipf_is_first_v4_frag(const struct dp_packet *pkt)
321 {
322 const struct ip_header *l3 = dp_packet_l3(pkt);
323 if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
324 l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
325 return true;
326 }
327 return false;
328 }
329
330 static bool
331 ipf_is_last_v4_frag(const struct dp_packet *pkt)
332 {
333 const struct ip_header *l3 = dp_packet_l3(pkt);
334 if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
335 !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
336 return true;
337 }
338 return false;
339 }
340
341 static bool
342 ipf_is_v6_frag(ovs_be16 ip6f_offlg)
343 {
344 if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
345 return true;
346 }
347 return false;
348 }
349
350 static bool
351 ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
352 {
353 if (!(ip6f_offlg & IP6F_OFF_MASK) &&
354 ip6f_offlg & IP6F_MORE_FRAG) {
355 return true;
356 }
357 return false;
358 }
359
360 static bool
361 ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
362 {
363 if ((ip6f_offlg & IP6F_OFF_MASK) &&
364 !(ip6f_offlg & IP6F_MORE_FRAG)) {
365 return true;
366 }
367 return false;
368 }
369
370 /* Checks whether a sorted list of fragments forms a complete packet. */
371 static bool
372 ipf_list_complete(const struct ipf_list *ipf_list)
373 /* OVS_REQUIRES(ipf_lock) */
374 {
375 for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
376 if (ipf_list->frag_list[i - 1].end_data_byte + 1
377 != ipf_list->frag_list[i].start_data_byte) {
378 return false;
379 }
380 }
381 return true;
382 }
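/* A worked example for ipf_list_complete(), added for illustration (the byte
 * ranges are made up): a sorted list whose fragments carry data bytes
 * [0..1479], [1480..2959] and [2960..3003] is complete, since each
 * fragment's 'start_data_byte' equals the previous fragment's
 * 'end_data_byte' + 1.  A list covering only [0..1479] and [1600..2959]
 * leaves a gap and is therefore not complete. */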
383
384 /* Insertion sort; runs in O(n) for a sorted or almost sorted list. */
385 static void
386 ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
387 /* OVS_REQUIRES(ipf_lock) */
388 {
389 for (int li = 1; li <= last_idx; li++) {
390 struct ipf_frag ipf_frag = frag_list[li];
391 int ci = li - 1;
392 while (ci >= 0 &&
393 frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
394 frag_list[ci + 1] = frag_list[ci];
395 ci--;
396 }
397 frag_list[ci + 1] = ipf_frag;
398 }
399 }
400
401 /* Called on a sorted complete list of v4 fragments to reassemble them into
402 * a single packet that can be processed, such as passing through conntrack.
403 */
404 static struct dp_packet *
405 ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
406 /* OVS_REQUIRES(ipf_lock) */
407 {
408 struct ipf_frag *frag_list = ipf_list->frag_list;
409 struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
410 dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
411 struct ip_header *l3 = dp_packet_l3(pkt);
412 int len = ntohs(l3->ip_tot_len);
413
414 int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
415 frag_list[1].start_data_byte + 1;
416
417 if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
418 ipf_print_reass_packet(
419 "Unsupported big reassembled v4 packet; v4 hdr:", l3);
420 dp_packet_delete(pkt);
421 return NULL;
422 }
423
424 dp_packet_prealloc_tailroom(pkt, rest_len);
425
426 for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
427 size_t add_len = frag_list[i].end_data_byte -
428 frag_list[i].start_data_byte + 1;
429 const char *l4 = dp_packet_l4(frag_list[i].pkt);
430 dp_packet_put(pkt, l4, add_len);
431 }
432
433 len += rest_len;
434 l3 = dp_packet_l3(pkt);
435 ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
436 if (!dp_packet_hwol_is_ipv4(pkt)) {
437 l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
438 new_ip_frag_off);
439 l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
440 }
441 l3->ip_tot_len = htons(len);
442 l3->ip_frag_off = new_ip_frag_off;
443 dp_packet_set_l2_pad_size(pkt, 0);
444
445 return pkt;
446 }
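/* A worked example for ipf_reassemble_v4_frags(), with made-up values for
 * illustration: if the first fragment has ip_tot_len 1500 (a 20 byte header
 * plus data bytes [0..1479]) and the remaining fragments cover data bytes
 * [1480..2999], then rest_len = 2999 - 1480 + 1 = 1520 and the reassembled
 * ip_tot_len becomes 1500 + 1520 = 3020. */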
447
448 /* Called on a sorted complete list of v6 fragments to reassemble them into
449 * a single packet that can be processed, such as passing through conntrack.
450 */
451 static struct dp_packet *
452 ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
453 /* OVS_REQUIRES(ipf_lock) */
454 {
455 struct ipf_frag *frag_list = ipf_list->frag_list;
456 struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
457 dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
458 struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
459 int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);
460
461 int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
462 frag_list[1].start_data_byte + 1;
463
464 if (pl + rest_len > IPV6_PACKET_MAX_DATA) {
465 ipf_print_reass_packet(
466 "Unsupported big reassembled v6 packet; v6 hdr:", l3);
467 dp_packet_delete(pkt);
468 return NULL;
469 }
470
471 dp_packet_prealloc_tailroom(pkt, rest_len);
472
473 for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
474 size_t add_len = frag_list[i].end_data_byte -
475 frag_list[i].start_data_byte + 1;
476 const char *l4 = dp_packet_l4(frag_list[i].pkt);
477 dp_packet_put(pkt, l4, add_len);
478 }
479
480 pl += rest_len;
481 l3 = dp_packet_l3(pkt);
482
483 uint8_t nw_proto = l3->ip6_nxt;
484 uint8_t nw_frag = 0;
485 const void *data = l3 + 1;
486 size_t datasize = pl;
487
488 const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
489 if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
490 || !nw_frag || !frag_hdr) {
491
492 ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
493 dp_packet_delete(pkt);
494 return NULL;
495 }
496
497 struct ovs_16aligned_ip6_frag *fh =
498 CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
499 fh->ip6f_offlg = 0;
500 l3->ip6_plen = htons(pl);
501 l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
502 dp_packet_set_l2_pad_size(pkt, 0);
503 return pkt;
504 }
505
506 /* Called when a frag list state transitions to another state. This is
507 * triggered by a new fragment for the list being received. */
508 static void
509 ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
510 bool ff, bool lf, bool v6)
511 OVS_REQUIRES(ipf->ipf_lock)
512 {
513 enum ipf_list_state curr_state = ipf_list->state;
514 enum ipf_list_state next_state;
515 switch (curr_state) {
516 case IPF_LIST_STATE_UNUSED:
517 case IPF_LIST_STATE_OTHER_SEEN:
518 if (ff) {
519 next_state = IPF_LIST_STATE_FIRST_SEEN;
520 } else if (lf) {
521 next_state = IPF_LIST_STATE_LAST_SEEN;
522 } else {
523 next_state = IPF_LIST_STATE_OTHER_SEEN;
524 }
525 break;
526 case IPF_LIST_STATE_FIRST_SEEN:
527 if (lf) {
528 next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
529 } else {
530 next_state = IPF_LIST_STATE_FIRST_SEEN;
531 }
532 break;
533 case IPF_LIST_STATE_LAST_SEEN:
534 if (ff) {
535 next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
536 } else {
537 next_state = IPF_LIST_STATE_LAST_SEEN;
538 }
539 break;
540 case IPF_LIST_STATE_FIRST_LAST_SEEN:
541 next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
542 break;
543 case IPF_LIST_STATE_COMPLETED:
544 case IPF_LIST_STATE_REASS_FAIL:
545 case IPF_LIST_STATE_NUM:
546 default:
547 OVS_NOT_REACHED();
548 }
549
550 if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
551 ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
552 if (ipf_list_complete(ipf_list)) {
553 struct dp_packet *reass_pkt = v6
554 ? ipf_reassemble_v6_frags(ipf_list)
555 : ipf_reassemble_v4_frags(ipf_list);
556 if (reass_pkt) {
557 struct reassembled_pkt *rp = xzalloc(sizeof *rp);
558 rp->pkt = reass_pkt;
559 rp->list = ipf_list;
560 ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
561 ipf_expiry_list_remove(ipf_list);
562 next_state = IPF_LIST_STATE_COMPLETED;
563 } else {
564 next_state = IPF_LIST_STATE_REASS_FAIL;
565 }
566 }
567 }
568 ipf_list->state = next_state;
569 }
570
571 /* Some sanity checks are redundant, but prudent, in case code paths for
572 * fragments change in the future. The processing cost for fragments is not
573 * important. */
574 static bool
575 ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
576 {
577 if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
578 goto invalid_pkt;
579 }
580
581 const struct eth_header *l2 = dp_packet_eth(pkt);
582 const struct ip_header *l3 = dp_packet_l3(pkt);
583
584 if (OVS_UNLIKELY(!l2 || !l3)) {
585 goto invalid_pkt;
586 }
587
588 size_t l3_size = dp_packet_l3_size(pkt);
589 if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
590 goto invalid_pkt;
591 }
592
593 if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
594 return false;
595 }
596
597 uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
598 if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
599 goto invalid_pkt;
600 }
601
602 size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
603 if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
604 goto invalid_pkt;
605 }
606 if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
607 goto invalid_pkt;
608 }
609
610 if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
611 && !dp_packet_hwol_is_ipv4(pkt)
612 && csum(l3, ip_hdr_len) != 0)) {
613 goto invalid_pkt;
614 }
615
616 uint32_t min_v4_frag_size_;
617 atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
618 bool lf = ipf_is_last_v4_frag(pkt);
619 if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v4_frag_size_)) {
620 ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
621 goto invalid_pkt;
622 }
623 return true;
624
625 invalid_pkt:
626 pkt->md.ct_state = CS_INVALID;
627 return false;
628 }
629
630 static bool
631 ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
632 struct ipf_list_key *key, uint16_t *start_data_byte,
633 uint16_t *end_data_byte, bool *ff, bool *lf)
634 {
635 const struct ip_header *l3 = dp_packet_l3(pkt);
636 uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
637 size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
638
639 *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
640 *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
641 *ff = ipf_is_first_v4_frag(pkt);
642 *lf = ipf_is_last_v4_frag(pkt);
643 memset(key, 0, sizeof *key);
644 key->ip_id = be16_to_be32(l3->ip_id);
645 key->dl_type = dl_type;
646 key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
647 key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
648 key->nw_proto = l3->ip_proto;
649 key->zone = zone;
650 key->recirc_id = pkt->md.recirc_id;
651 return true;
652 }
653
654 /* Some sanity checks are redundant, but prudent, in case code paths for
655 * fragments change in the future. The processing cost for fragments is not
656 * important. */
657 static bool
658 ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
659 {
660 const struct eth_header *l2 = dp_packet_eth(pkt);
661 const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
662 const char *l4 = dp_packet_l4(pkt);
663
664 if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
665 goto invalid_pkt;
666 }
667
668 size_t l3_size = dp_packet_l3_size(pkt);
669 size_t l3_hdr_size = sizeof *l3;
670
671 if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
672 goto invalid_pkt;
673 }
674
675 uint8_t nw_frag = 0;
676 uint8_t nw_proto = l3->ip6_nxt;
677 const void *data = l3 + 1;
678 size_t datasize = l3_size - l3_hdr_size;
679 const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
680 if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
681 &frag_hdr) || !nw_frag || !frag_hdr) {
682 return false;
683 }
684
685 int pl = ntohs(l3->ip6_plen);
686 if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
687 goto invalid_pkt;
688 }
689
690 ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
691 if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
692 return false;
693 }
694
695 uint32_t min_v6_frag_size_;
696 atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
697 bool lf = ipf_is_last_v6_frag(ip6f_offlg);
698
699 if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v6_frag_size_)) {
700 ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
701 goto invalid_pkt;
702 }
703
704 return true;
705
706 invalid_pkt:
707 pkt->md.ct_state = CS_INVALID;
708 return false;
709
710 }
711
712 static void
713 ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
714 struct ipf_list_key *key, uint16_t *start_data_byte,
715 uint16_t *end_data_byte, bool *ff, bool *lf)
716 {
717 const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
718 uint8_t nw_frag = 0;
719 uint8_t nw_proto = l3->ip6_nxt;
720 const void *data = l3 + 1;
721 size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3;
722 const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
723
724 parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
725 ovs_assert(nw_frag && frag_hdr);
726 ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
727 *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
728 sizeof (struct ovs_16aligned_ip6_frag);
729 *end_data_byte = *start_data_byte + dp_packet_l4_size(pkt) - 1;
730 *ff = ipf_is_first_v6_frag(ip6f_offlg);
731 *lf = ipf_is_last_v6_frag(ip6f_offlg);
732 memset(key, 0, sizeof *key);
733 key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
734 key->dl_type = dl_type;
735 memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
736 /* We are not supporting parsing of the routing header to use as the
737 * dst address part of the key. */
738 memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
739 key->nw_proto = 0; /* Not used for key for V6. */
740 key->zone = zone;
741 key->recirc_id = pkt->md.recirc_id;
742 }
743
744 static bool
745 ipf_list_key_eq(const struct ipf_list_key *key1,
746 const struct ipf_list_key *key2)
747 /* OVS_REQUIRES(ipf_lock) */
748 {
749 if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
750 !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
751 key1->dl_type == key2->dl_type &&
752 key1->ip_id == key2->ip_id &&
753 key1->zone == key2->zone &&
754 key1->nw_proto == key2->nw_proto &&
755 key1->recirc_id == key2->recirc_id) {
756 return true;
757 }
758 return false;
759 }
760
761 static struct ipf_list *
762 ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
763 uint32_t hash)
764 OVS_REQUIRES(ipf->ipf_lock)
765 {
766 struct ipf_list *ipf_list;
767 HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
768 if (ipf_list_key_eq(&ipf_list->key, key)) {
769 return ipf_list;
770 }
771 }
772 return NULL;
773 }
774
775 static bool
776 ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
777 size_t start_data_byte, size_t end_data_byte)
778 /* OVS_REQUIRES(ipf_lock) */
779 {
780 for (int i = 0; i <= last_inuse_idx; i++) {
781 if ((start_data_byte >= frag_list[i].start_data_byte &&
782 start_data_byte <= frag_list[i].end_data_byte) ||
783 (end_data_byte >= frag_list[i].start_data_byte &&
784 end_data_byte <= frag_list[i].end_data_byte)) {
785 return true;
786 }
787 }
788 return false;
789 }
790
791 /* Adds a fragment to a list of fragments, if the fragment is not a
792 * duplicate. If the fragment is a duplicate, it is marked invalid here,
793 * avoiding the work that conntrack would otherwise do to mark it as
794 * invalid, which it would in all cases. */
795 static bool
796 ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
797 struct dp_packet *pkt, uint16_t start_data_byte,
798 uint16_t end_data_byte, bool ff, bool lf, bool v6,
799 bool dnsteal)
800 OVS_REQUIRES(ipf->ipf_lock)
801 {
802 bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
803 ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
804 int last_inuse_idx = ipf_list->last_inuse_idx;
805
806 if (!duped_frag) {
807 if (last_inuse_idx < ipf_list->size - 1) {
808 /* In the case of DPDK, it would be unfortunate if we had
809 * to create a clone fragment outside the DPDK mempool due to
810 * the mempool size being too limited. We will otherwise need to
811 * recommend not setting the number of mempool buffers too low
812 * and also clamp the number of fragments. */
813 struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
814 frag->pkt = pkt;
815 frag->start_data_byte = start_data_byte;
816 frag->end_data_byte = end_data_byte;
817 frag->dnsteal = dnsteal;
818 ipf_list->last_inuse_idx++;
819 atomic_count_inc(&ipf->nfrag);
820 ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
821 ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
822 } else {
823 OVS_NOT_REACHED();
824 }
825 } else {
826 ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
827 pkt->md.ct_state = CS_INVALID;
828 return false;
829 }
830 return true;
831 }
832
833 static void
834 ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
835 int max_frag_list_size)
836 {
837 ipf_list->key = *key;
838 ipf_list->last_inuse_idx = IPF_INVALID_IDX;
839 ipf_list->last_sent_idx = IPF_INVALID_IDX;
840 ipf_list->reass_execute_ctx = NULL;
841 ipf_list->state = IPF_LIST_STATE_UNUSED;
842 ipf_list->size = max_frag_list_size;
843 ipf_list->frag_list
844 = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
845 }
846
847 /* Generates a fragment list key from a well-formed fragment and either starts
848 * a new fragment list or increases the size of the existing fragment list,
849 * while checking that the maximum number of supported fragments is not
850 * exceeded and that the list size is not impossibly big. Calls
851 * 'ipf_process_frag()' to add a fragment to a list of fragments. */
852 static bool
853 ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
854 uint16_t zone, long long now, uint32_t hash_basis,
855 bool dnsteal)
856 OVS_REQUIRES(ipf->ipf_lock)
857 {
858 struct ipf_list_key key;
859 /* Initialize 4 variables for some versions of GCC. */
860 uint16_t start_data_byte = 0;
861 uint16_t end_data_byte = 0;
862 bool ff = false;
863 bool lf = false;
864 bool v6 = dl_type == htons(ETH_TYPE_IPV6);
865
866 if (v6 && ipf_get_v6_enabled(ipf)) {
867 ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
868 &end_data_byte, &ff, &lf);
869 } else if (!v6 && ipf_get_v4_enabled(ipf)) {
870 ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
871 &end_data_byte, &ff, &lf);
872 } else {
873 return false;
874 }
875
876 unsigned int nfrag_max;
877 atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
878 if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
879 return false;
880 }
881
882 uint32_t hash = ipf_list_key_hash(&key, hash_basis);
883 struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
884 enum {
885 IPF_FRAG_LIST_MIN_INCREMENT = 4,
886 IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
887 };
888
889 int max_frag_list_size;
890 if (v6) {
891 /* Because the calculation with extension headers is variable,
892 * we don't calculate a hard maximum fragment list size upfront. The
893 * fragment list size is practically limited by the code, however. */
894 max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
895 } else {
896 max_frag_list_size = ipf->max_v4_frag_list_size;
897 }
898
899 if (!ipf_list) {
900 ipf_list = xmalloc(sizeof *ipf_list);
901 ipf_list_init(ipf_list, &key,
902 MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
903 hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
904 ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
905 } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL ||
906 ipf_list->state == IPF_LIST_STATE_COMPLETED) {
907 /* Bail out as early as possible. */
908 return false;
909 } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
910 int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
911 max_frag_list_size - ipf_list->size);
912 /* Enforce limit. */
913 if (increment > 0) {
914 ipf_list->frag_list =
915 xrealloc(ipf_list->frag_list, (ipf_list->size + increment) *
916 sizeof *ipf_list->frag_list);
917 ipf_list->size += increment;
918 } else {
919 return false;
920 }
921 }
922
923 return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
924 end_data_byte, ff, lf, v6, dnsteal);
925 }
926
927 /* Filters out fragments from a batch of fragments and adjusts the batch. */
928 static void
929 ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
930 ovs_be16 dl_type, uint16_t zone, long long now,
931 uint32_t hash_basis)
932 {
933 const size_t pb_cnt = dp_packet_batch_size(pb);
934 int pb_idx; /* Index in a packet batch. */
935 struct dp_packet *pkt;
936
937 DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
938 if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
939 ipf_is_valid_v4_frag(ipf, pkt))
940 ||
941 (dl_type == htons(ETH_TYPE_IPV6) &&
942 ipf_is_valid_v6_frag(ipf, pkt)))) {
943
944 ovs_mutex_lock(&ipf->ipf_lock);
945 if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
946 pb->do_not_steal)) {
947 dp_packet_batch_refill(pb, pkt, pb_idx);
948 }
949 ovs_mutex_unlock(&ipf->ipf_lock);
950 } else {
951 dp_packet_batch_refill(pb, pkt, pb_idx);
952 }
953 }
954 }
955
956 /* In the case of DPDK, a memory source check is done, as DPDK memory pool
957 * management has trouble dealing with multiple source types. The
958 * 'check_source' parameter indicates when this check is needed. */
959 static bool
960 ipf_dp_packet_batch_add(struct dp_packet_batch *pb , struct dp_packet *pkt,
961 bool check_source OVS_UNUSED)
962 {
963 #ifdef DPDK_NETDEV
964 if ((dp_packet_batch_is_full(pb)) ||
965 /* DPDK cannot handle multiple sources in a batch. */
966 (check_source && !dp_packet_batch_is_empty(pb)
967 && pb->packets[0]->source != pkt->source)) {
968 #else
969 if (dp_packet_batch_is_full(pb)) {
970 #endif
971 return false;
972 }
973
974 dp_packet_batch_add(pb, pkt);
975 return true;
976 }
977
978 /* This would be used in rare cases where a list cannot be sent. One rare
979 * reason known right now is a mempool source check, which exists due to DPDK
980 * support, where packets are no longer being received on any port with a
981 * source matching the fragment. Another reason is a race where all
982 * conntrack rules are unconfigured when some fragments are yet to be
983 * flushed.
984 *
985 * Returns true if the list was purged. */
986 static bool
987 ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
988 long long now)
989 OVS_REQUIRES(ipf->ipf_lock)
990 {
991 enum {
992 IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
993 };
994
995 if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
996 return false;
997 }
998
999 while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1000 struct dp_packet * pkt
1001 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1002 dp_packet_delete(pkt);
1003 atomic_count_dec(&ipf->nfrag);
1004 COVERAGE_INC(ipf_stuck_frag_list_purged);
1005 ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
1006 IPF_NFRAGS_PURGED);
1007 ipf_list->last_sent_idx++;
1008 }
1009
1010 return true;
1011 }
1012
1013 /* Does the packet batch management and common accounting work associated
1014 * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
1015 static bool
1016 ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
1017 struct dp_packet_batch *pb,
1018 enum ipf_list_type list_type, bool v6, long long now)
1019 OVS_REQUIRES(ipf->ipf_lock)
1020 {
1021 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1022 return true;
1023 }
1024
1025 while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1026 struct dp_packet *pkt
1027 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1028 if (ipf_dp_packet_batch_add(pb, pkt, true)) {
1029 ipf_list->last_sent_idx++;
1030 atomic_count_dec(&ipf->nfrag);
1031
1032 if (list_type == IPF_FRAG_COMPLETED_LIST) {
1033 ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
1034 } else {
1035 ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
1036 pkt->md.ct_state = CS_INVALID;
1037 }
1038
1039 if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
1040 return true;
1041 }
1042 } else {
1043 return false;
1044 }
1045 }
1046 OVS_NOT_REACHED();
1047 }
1048
1049 /* Adds fragments associated with a completed fragment list to a packet batch
1050 * to be processed by the calling application, typically conntrack. Also
1051 * cleans up the list context when it is empty. */
1052 static void
1053 ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
1054 long long now, bool v6)
1055 {
1056 if (ovs_list_is_empty(&ipf->frag_complete_list)) {
1057 return;
1058 }
1059
1060 ovs_mutex_lock(&ipf->ipf_lock);
1061 struct ipf_list *ipf_list, *next;
1062
1063 LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
1064 if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
1065 v6, now)) {
1066 ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
1067 } else {
1068 break;
1069 }
1070 }
1071
1072 ovs_mutex_unlock(&ipf->ipf_lock);
1073 }
1074
1075 /* Conservatively adds fragments associated with an expired fragment list to
1076 * a packet batch to be processed by the calling application, typically
1077 * conntrack. Also cleans up the list context when it is empty. */
1078 static void
1079 ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
1080 long long now, bool v6)
1081 {
1082 enum {
1083 /* Very conservative, due to DoS probability. */
1084 IPF_FRAG_LIST_MAX_EXPIRED = 1,
1085 };
1086
1087
1088 if (ovs_list_is_empty(&ipf->frag_exp_list)) {
1089 return;
1090 }
1091
1092 ovs_mutex_lock(&ipf->ipf_lock);
1093 struct ipf_list *ipf_list, *next;
1094 size_t lists_removed = 0;
1095
1096 LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
1097 if (now <= ipf_list->expiration ||
1098 lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
1099 break;
1100 }
1101
1102 if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
1103 v6, now)) {
1104 ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
1105 lists_removed++;
1106 } else {
1107 break;
1108 }
1109 }
1110
1111 ovs_mutex_unlock(&ipf->ipf_lock);
1112 }
1113
1114 /* Adds a reassembled packet to a packet batch to be processed by the caller.
1115 */
1116 static void
1117 ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
1118 {
1119 if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
1120 return;
1121 }
1122
1123 ovs_mutex_lock(&ipf->ipf_lock);
1124 struct reassembled_pkt *rp, *next;
1125
1126 LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
1127 if (!rp->list->reass_execute_ctx &&
1128 ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
1129 rp->list->reass_execute_ctx = rp->pkt;
1130 }
1131 }
1132
1133 ovs_mutex_unlock(&ipf->ipf_lock);
1134 }
1135
1136 /* Checks for reassembled packets after processing by conntrack and edits
1137 * the fragments if needed, based on what conntrack decided. */
1138 static void
1139 ipf_post_execute_reass_pkts(struct ipf *ipf,
1140 struct dp_packet_batch *pb, bool v6)
1141 {
1142 if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
1143 return;
1144 }
1145
1146 ovs_mutex_lock(&ipf->ipf_lock);
1147 struct reassembled_pkt *rp, *next;
1148
1149 LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
1150 const size_t pb_cnt = dp_packet_batch_size(pb);
1151 int pb_idx;
1152 struct dp_packet *pkt;
1153 /* Inner batch loop is constant time since batch size is <=
1154 * NETDEV_MAX_BURST. */
1155 DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
1156 if (pkt == rp->list->reass_execute_ctx) {
1157 for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
1158 rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
1159 rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
1160 rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
1161 rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
1162 rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
1163 pkt->md.ct_orig_tuple_ipv6;
1164 if (pkt->md.ct_orig_tuple_ipv6) {
1165 rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
1166 pkt->md.ct_orig_tuple.ipv6;
1167 } else {
1168 rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
1169 pkt->md.ct_orig_tuple.ipv4;
1170 }
1171 }
1172
1173 const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
1174 void *l4_frag = dp_packet_l4(frag_0->pkt);
1175 void *l4_reass = dp_packet_l4(pkt);
1176 memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
1177
1178 if (v6) {
1179 struct ovs_16aligned_ip6_hdr *l3_frag
1180 = dp_packet_l3(frag_0->pkt);
1181 struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
1182 l3_frag->ip6_src = l3_reass->ip6_src;
1183 l3_frag->ip6_dst = l3_reass->ip6_dst;
1184 } else {
1185 struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
1186 struct ip_header *l3_reass = dp_packet_l3(pkt);
1187 if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
1188 ovs_be32 reass_ip =
1189 get_16aligned_be32(&l3_reass->ip_src);
1190 ovs_be32 frag_ip =
1191 get_16aligned_be32(&l3_frag->ip_src);
1192
1193 l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
1194 frag_ip, reass_ip);
1195 reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
1196 frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
1197 l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
1198 frag_ip, reass_ip);
1199 }
1200
1201 l3_frag->ip_src = l3_reass->ip_src;
1202 l3_frag->ip_dst = l3_reass->ip_dst;
1203 }
1204
1205 ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
1206 ipf_reassembled_list_remove(rp);
1207 dp_packet_delete(rp->pkt);
1208 free(rp);
1209 } else {
1210 dp_packet_batch_refill(pb, pkt, pb_idx);
1211 }
1212 }
1213 }
1214
1215 ovs_mutex_unlock(&ipf->ipf_lock);
1216 }
1217
1218 /* Extracts any fragments from the batch and reassembles them when a
1219 * complete packet is received. Completed packets are added to the batch
1220 * to be sent through conntrack, when possible. */
1221 void
1222 ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
1223 long long now, ovs_be16 dl_type, uint16_t zone,
1224 uint32_t hash_basis)
1225 {
1226 if (ipf_get_enabled(ipf)) {
1227 ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
1228 }
1229
1230 if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
1231 ipf_execute_reass_pkts(ipf, pb);
1232 }
1233 }
1234
1235 /* Updates fragments based on the processing of the reassembled packet sent
1236 * through conntrack and adds these fragments to any batches seen. Expired
1237 * fragments are marked as invalid and also added to the batches seen
1238 * with low priority. Reassembled packets are freed. */
1239 void
1240 ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
1241 long long now, ovs_be16 dl_type)
1242 {
1243 if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
1244 bool v6 = dl_type == htons(ETH_TYPE_IPV6);
1245 ipf_post_execute_reass_pkts(ipf, pb, v6);
1246 ipf_send_completed_frags(ipf, pb, now, v6);
1247 ipf_send_expired_frags(ipf, pb, now, v6);
1248 }
1249 }
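/* A minimal usage sketch of the public entry points above; this is
 * illustrative only and not taken from an actual caller in this tree.
 * Here 'pb' is the caller's batch, and 'zone' and 'hash_basis' stand for
 * whatever values the calling conntrack instance uses:
 *
 *     struct ipf *ipf = ipf_init();
 *     long long now = time_msec();
 *
 *     ipf_preprocess_conntrack(ipf, pb, now, htons(ETH_TYPE_IP), zone,
 *                              hash_basis);
 *     ... pass 'pb', now carrying reassembled packets in place of
 *     fragments, through conntrack ...
 *     ipf_postprocess_conntrack(ipf, pb, now, htons(ETH_TYPE_IP));
 *
 *     ipf_destroy(ipf);
 */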
1250
1251 static void *
1252 ipf_clean_thread_main(void *f)
1253 {
1254 struct ipf *ipf = f;
1255
1256 enum {
1257 IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
1258 };
1259
1260 while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {
1261
1262 long long now = time_msec();
1263
1264 if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
1265 !ovs_list_is_empty(&ipf->frag_complete_list)) {
1266
1267 ovs_mutex_lock(&ipf->ipf_lock);
1268
1269 struct ipf_list *ipf_list, *next;
1270 LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
1271 &ipf->frag_exp_list) {
1272 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1273 ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
1274 }
1275 }
1276
1277 LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
1278 &ipf->frag_complete_list) {
1279 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1280 ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
1281 }
1282 }
1283
1284 ovs_mutex_unlock(&ipf->ipf_lock);
1285 }
1286
1287 poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
1288 latch_wait(&ipf->ipf_clean_thread_exit);
1289 poll_block();
1290 }
1291
1292 return NULL;
1293 }
1294
1295 struct ipf *
1296 ipf_init(void)
1297 {
1298 struct ipf *ipf = xzalloc(sizeof *ipf);
1299
1300 ovs_mutex_init_adaptive(&ipf->ipf_lock);
1301 ovs_mutex_lock(&ipf->ipf_lock);
1302 hmap_init(&ipf->frag_lists);
1303 ovs_list_init(&ipf->frag_exp_list);
1304 ovs_list_init(&ipf->frag_complete_list);
1305 ovs_list_init(&ipf->reassembled_pkt_list);
1306 atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
1307 atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
1308 ipf->max_v4_frag_list_size = DIV_ROUND_UP(
1309 IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
1310 ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
1311 ovs_mutex_unlock(&ipf->ipf_lock);
1312 atomic_count_init(&ipf->nfrag, 0);
1313 for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
1314 atomic_init(&ipf->n4frag_cnt[i], 0);
1315 atomic_init(&ipf->n6frag_cnt[i], 0);
1316 }
1317 atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
1318 atomic_init(&ipf->ifp_v4_enabled, true);
1319 atomic_init(&ipf->ifp_v6_enabled, true);
1320 latch_init(&ipf->ipf_clean_thread_exit);
1321 ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
1322 ipf_clean_thread_main, ipf);
1323
1324 return ipf;
1325 }
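/* For illustration only: with the defaults initialized above
 * (min_v4_frag_size = IPF_V4_FRAG_SIZE_MIN_DEF = 1200 and
 * IPV4_PACKET_MAX_HDR_SIZE = 60), max_v4_frag_list_size works out to
 * DIV_ROUND_UP(65535 - 60, 1200 - 60) = DIV_ROUND_UP(65475, 1140) = 58
 * fragments per v4 list. */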
1326
1327 void
1328 ipf_destroy(struct ipf *ipf)
1329 {
1330 ovs_mutex_lock(&ipf->ipf_lock);
1331 latch_set(&ipf->ipf_clean_thread_exit);
1332 pthread_join(ipf->ipf_clean_thread, NULL);
1333 latch_destroy(&ipf->ipf_clean_thread_exit);
1334
1335 struct ipf_list *ipf_list;
1336 HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
1337 while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1338 struct dp_packet *pkt
1339 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1340 if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
1341 dp_packet_delete(pkt);
1342 }
1343 atomic_count_dec(&ipf->nfrag);
1344 ipf_list->last_sent_idx++;
1345 }
1346 free(ipf_list->frag_list);
1347 free(ipf_list);
1348 }
1349
1350 if (atomic_count_get(&ipf->nfrag)) {
1351 VLOG_WARN("ipf destroy with non-zero fragment count. ");
1352 }
1353
1354 struct reassembled_pkt *rp;
1355 LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
1356 dp_packet_delete(rp->pkt);
1357 free(rp);
1358 }
1359
1360 hmap_destroy(&ipf->frag_lists);
1361 ovs_list_poison(&ipf->frag_exp_list);
1362 ovs_list_poison(&ipf->frag_complete_list);
1363 ovs_list_poison(&ipf->reassembled_pkt_list);
1364 ovs_mutex_unlock(&ipf->ipf_lock);
1365 ovs_mutex_destroy(&ipf->ipf_lock);
1366 free(ipf);
1367 }
1368
1369 int
1370 ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
1371 {
1372 atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
1373 enable);
1374 return 0;
1375 }
1376
1377 int
1378 ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
1379 {
1380 /* If the user specifies an unreasonably large number, fragmentation
1381 * will not work well but it will not blow up. */
1382 if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) {
1383 return 1;
1384 }
1385
1386 ovs_mutex_lock(&ipf->ipf_lock);
1387 if (v6) {
1388 atomic_store_relaxed(&ipf->min_v6_frag_size, value);
1389 } else {
1390 atomic_store_relaxed(&ipf->min_v4_frag_size, value);
1391 ipf->max_v4_frag_list_size = DIV_ROUND_UP(
1392 IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
1393 ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
1394 }
1395 ovs_mutex_unlock(&ipf->ipf_lock);
1396 return 0;
1397 }
1398
1399 int
1400 ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
1401 {
1402 if (value > IPF_NFRAG_UBOUND) {
1403 return 1;
1404 }
1405 atomic_store_relaxed(&ipf->nfrag_max, value);
1406 return 0;
1407 }
1408
1409 int
1410 ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
1411 {
1412 ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
1413 atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);
1414
1415 atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
1416 atomic_read_relaxed(&ipf->min_v4_frag_size,
1417 &ipf_status->v4.min_frag_size);
1418 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
1419 &ipf_status->v4.nfrag_accepted);
1420 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
1421 &ipf_status->v4.nfrag_completed_sent);
1422 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
1423 &ipf_status->v4.nfrag_expired_sent);
1424 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
1425 &ipf_status->v4.nfrag_too_small);
1426 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
1427 &ipf_status->v4.nfrag_overlap);
1428 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
1429 &ipf_status->v4.nfrag_purged);
1430
1431 atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
1432 atomic_read_relaxed(&ipf->min_v6_frag_size,
1433 &ipf_status->v6.min_frag_size);
1434 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
1435 &ipf_status->v6.nfrag_accepted);
1436 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
1437 &ipf_status->v6.nfrag_completed_sent);
1438 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
1439 &ipf_status->v6.nfrag_expired_sent);
1440 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
1441 &ipf_status->v6.nfrag_too_small);
1442 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
1443 &ipf_status->v6.nfrag_overlap);
1444 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
1445 &ipf_status->v6.nfrag_purged);
1446 return 0;
1447 }
1448
1449 struct ipf_dump_ctx {
1450 struct hmap_position bucket_pos;
1451 };
1452
1453 /* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
1454 * caller must call ipf_dump_done() when dumping is finished. */
1455 int
1456 ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
1457 {
1458 *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
1459 return 0;
1460 }
1461
1462 /* Creates a string representation of the state of an 'ipf_list' and puts
1463 * it in 'ds'. */
1464 static void
1465 ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
1466 {
1467 ds_put_cstr(ds, "(");
1468 if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
1469 ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
1470 IP_ARGS(ipf_list->key.src_addr.ipv4),
1471 IP_ARGS(ipf_list->key.dst_addr.ipv4));
1472 } else {
1473 ds_put_cstr(ds, "src=");
1474 ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
1475 ds_put_cstr(ds, ",dst=");
1476 ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
1477 ds_put_cstr(ds, ",");
1478 }
1479
1480 ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
1481 ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
1482 ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
1483 ipf_list->key.nw_proto);
1484
1485 ds_put_format(ds, ",num_fragments=%u,state=%s",
1486 ipf_list->last_inuse_idx + 1,
1487 ipf_state_name[ipf_list->state]);
1488
1489 ds_put_cstr(ds, ")");
1490 }
1491
1492 /* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
1493 * ipf_dump_create() to create a string representation of the state of an
1494 * ipf list, to which 'dump' is pointed. Returns EOF when there are no
1495 * more ipf lists. */
1496 int
1497 ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
1498 {
1499 ovs_mutex_lock(&ipf->ipf_lock);
1500
1501 struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
1502 &ipf_dump_ctx->bucket_pos);
1503 if (!node) {
1504 ovs_mutex_unlock(&ipf->ipf_lock);
1505 return EOF;
1506 } else {
1507 struct ipf_list *ipf_list_;
1508 INIT_CONTAINER(ipf_list_, node, node);
1509 struct ipf_list ipf_list = *ipf_list_;
1510 ovs_mutex_unlock(&ipf->ipf_lock);
1511 struct ds ds = DS_EMPTY_INITIALIZER;
1512 ipf_dump_create(&ipf_list, &ds);
1513 *dump = ds_steal_cstr(&ds);
1514 return 0;
1515 }
1516 }
1517
1518 /* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
1519 int
1520 ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
1521 {
1522 free(ipf_dump_ctx);
1523 return 0;
1524 }
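/* A minimal usage sketch for the dump interface above, illustrative only
 * (error handling omitted); 'ipf' is assumed to be an existing 'struct ipf *',
 * and each string returned via 'dump' was created with ds_steal_cstr() and
 * must be freed by the caller:
 *
 *     struct ipf_dump_ctx *ctx;
 *     char *dump;
 *
 *     ipf_dump_start(&ctx);
 *     while (ipf_dump_next(ipf, ctx, &dump) != EOF) {
 *         puts(dump);
 *         free(dump);
 *     }
 *     ipf_dump_done(ctx);
 */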