lib/ipf.c

   1 /*
   2  * Copyright (c) 2019 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include <ctype.h>
  19 #include <errno.h>
  20 #include <sys/types.h>
  21 #include <netinet/in.h>
  22 #include <netinet/ip6.h>
  23 #include <netinet/icmp6.h>
  24 #include <string.h>
  25
  26 #include "coverage.h"
  27 #include "csum.h"
  28 #include "ipf.h"
  29 #include "latch.h"
  30 #include "openvswitch/hmap.h"
  31 #include "openvswitch/poll-loop.h"
  32 #include "openvswitch/vlog.h"
  33 #include "ovs-atomic.h"
  34 #include "packets.h"
  35 #include "util.h"
  36
  37 VLOG_DEFINE_THIS_MODULE(ipf);
  38 COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
  39
   40 enum {   /* Packet size limits, in bytes. */
   41     IPV4_PACKET_MAX_HDR_SIZE = 60,
   42     IPV4_PACKET_MAX_SIZE = 65535,  /* Cap on reassembled v4 total length. */
   43     IPV6_PACKET_MAX_DATA = 65535,  /* Cap on reassembled v6 payload. */
   44 };
  45
   46 enum ipf_list_state {   /* Progress of one fragment list. */
   47     IPF_LIST_STATE_UNUSED,          /* Set by ipf_list_init(). */
   48     IPF_LIST_STATE_REASS_FAIL,      /* Reassembly produced no packet. */
   49     IPF_LIST_STATE_OTHER_SEEN,      /* Neither first nor last seen yet. */
   50     IPF_LIST_STATE_FIRST_SEEN,      /* First fragment seen, last not. */
   51     IPF_LIST_STATE_LAST_SEEN,       /* Last fragment seen, first not. */
   52     IPF_LIST_STATE_FIRST_LAST_SEEN, /* Both seen, list not yet complete. */
   53     IPF_LIST_STATE_COMPLETED,       /* Reassembled packet was created. */
   54     IPF_LIST_STATE_NUM,             /* Number of states; sizes arrays. */
   55 };
  56
   57 static char *ipf_state_name[IPF_LIST_STATE_NUM] =   /* One per state. */
   58     {"unused", "reassemble fail", "other frag", "first frag", "last frag",
   59      "first/last frag", "complete"};   /* Same order as ipf_list_state. */
  60
   61 enum ipf_list_type {   /* NOTE(review): presumably names the two lists a
                             * 'struct ipf_list' can be linked on; no user is
                             * visible in this part of the file. */
   62     IPF_FRAG_COMPLETED_LIST,
   63     IPF_FRAG_EXPIRY_LIST,
   64 };
  65
   66 enum {
   67     IPF_INVALID_IDX = -1,          /* "No fragment" index value. */
   68     IPF_V4_FRAG_SIZE_LBOUND = 400,
   69     IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
   70     IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
   71     IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
   72     IPF_MAX_FRAGS_DEFAULT = 1000,  /* Presumably the 'nfrag_max' default. */
   73     IPF_NFRAG_UBOUND = 5000,       /* Presumably the 'nfrag_max' ceiling. */
   74 };
  75
   76 enum ipf_counter_type {   /* Index into 'n4frag_cnt' and 'n6frag_cnt'. */
   77     IPF_NFRAGS_ACCEPTED,      /* Fragments stored in a fragment list. */
   78     IPF_NFRAGS_COMPL_SENT,
   79     IPF_NFRAGS_EXPD_SENT,
   80     IPF_NFRAGS_TOO_SMALL,     /* Non-last fragments below minimum size. */
   81     IPF_NFRAGS_OVERLAP,       /* Fragments overlapping a stored one. */
   82     IPF_NFRAGS_PURGED,
   83     IPF_NFRAGS_NUM_CNTS,      /* Number of counters; sizes the arrays. */
   84 };
  85
   86 union ipf_addr {   /* An IPv4 or IPv6 address used in a fragment list key. */
   87     ovs_be32 ipv4;
   88     struct in6_addr ipv6;
   89 };
  90
   91 /* Represents a single fragment; part of a list of fragments. */
   92 struct ipf_frag {
   93     struct dp_packet *pkt;      /* Packet carrying this fragment. */
   94     uint16_t start_data_byte;   /* Offset of the first data byte. */
   95     uint16_t end_data_byte;     /* Offset of the last data byte, inclusive. */
   96     bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */
   97 };
  98
   99 /* The key for a collection of fragments potentially making up an unfragmented
  100  * packet. */
  101 struct ipf_list_key {
  102     /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first
  103      * two members; it hashes all words after them, padding included. */
  104     union ipf_addr src_addr;
  105     union ipf_addr dst_addr;
  106     uint32_t recirc_id;   /* Taken from the packet's metadata. */
  107     ovs_be32 ip_id;   /* V6 is 32 bits. */
  108     ovs_be16 dl_type;
  109     uint16_t zone;
  110     uint8_t nw_proto;   /* IPv4 protocol; always 0 for IPv6 keys. */
  111 };
 112
  113 /* A collection of fragments potentially making up an unfragmented packet. */
  114 struct ipf_list {
  115     struct hmap_node node;         /* In struct ipf's 'frag_lists'. */
  116     struct ovs_list list_node;     /* In struct ipf's 'frag_exp_list' or
  117                                     * 'frag_complete_list'. */
  118     struct ipf_frag *frag_list;    /* Array of fragments for this list. */
  119     struct ipf_list_key key;       /* The key for the fragment list. */
  120     struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
  121     long long expiration;          /* In milliseconds. */
  122     int last_sent_idx;             /* Last sent fragment idx. */
  123     int last_inuse_idx;            /* Last inuse fragment idx. */
  124     int size;                      /* Number of slots in 'frag_list'. */
  125     uint8_t state;                 /* Frag list state; see ipf_list_state. */
  126 };
 127
  128 /* Represents a reassembled packet which typically is passed through
  129  * conntrack. */
  130 struct reassembled_pkt {
  131     struct ovs_list rp_list_node;  /* In struct ipf's
  132                                     * 'reassembled_pkt_list'. */
  133     struct dp_packet *pkt;         /* The reassembled packet. */
  134     struct ipf_list *list;         /* Fragment list 'pkt' was built from. */
  135 };
 136
  137 struct ipf {   /* State of one IP fragment handling instance. */
  138     /* The clean thread is used to clean up fragments in the 'ipf'
  139      * module if packet batches are no longer being sent through its user. */
  140     pthread_t ipf_clean_thread;
  141     struct latch ipf_clean_thread_exit;
  142
  143     int max_v4_frag_list_size;   /* Cap on slots in a v4 fragment list. */
  144
  145     struct ovs_mutex ipf_lock; /* Protects all of the following. */
  146     /* These contain 'struct ipf_list's. */
  147     struct hmap frag_lists OVS_GUARDED;
  148     struct ovs_list frag_exp_list OVS_GUARDED;
  149     struct ovs_list frag_complete_list OVS_GUARDED;
  150     /* Contains 'struct reassembled_pkt's. */
  151     struct ovs_list reassembled_pkt_list OVS_GUARDED;
  152
  153     /* Used to allow disabling fragmentation reassembly. */
  154     atomic_bool ifp_v4_enabled;
  155     atomic_bool ifp_v6_enabled;
  156
  157     /* Will be clamped above 400 bytes; the value chosen should handle
  158      * alg control packets of interest that use string encoding of mutable
  159      * IP fields; meaning, the control packets should not be fragmented. */
  160     atomic_uint min_v4_frag_size;
  161     atomic_uint min_v6_frag_size;
  162
  163     /* Configurable maximum allowable fragments in process. */
  164     atomic_uint nfrag_max;
  165
  166     /* Number of fragments in process. */
  167     atomic_count nfrag;
  168
  169     atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];   /* IPv4 counters. */
  170     atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];   /* IPv6 counters. */
  171 };
 172
 173 static void
 174 ipf_print_reass_packet(const char *es, const void *pkt)
 175 {
 176     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
 177     if (!VLOG_DROP_WARN(&rl)) {
 178         struct ds ds = DS_EMPTY_INITIALIZER;
 179         ds_put_hex_dump(&ds, pkt, 128, 0, false);
 180         VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
 181         ds_destroy(&ds);
 182     }
 183 }
 184
 185 static void
 186 ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
 187 {
 188     atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
 189 }
 190
 191 static bool
 192 ipf_get_v4_enabled(struct ipf *ipf)
 193 {
 194     bool ifp_v4_enabled_;
 195     atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
 196     return ifp_v4_enabled_;
 197 }
 198
 199 static bool
 200 ipf_get_v6_enabled(struct ipf *ipf)
 201 {
 202     bool ifp_v6_enabled_;
 203     atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
 204     return ifp_v6_enabled_;
 205 }
 206
 207 static bool
 208 ipf_get_enabled(struct ipf *ipf)
 209 {
 210     return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
 211 }
 212
 213 static uint32_t
 214 ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
 215 {
 216     BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
 217     return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
 218 }
 219
 220 /* Adds a list of fragments to the list tracking expiry of yet to be
 221  * completed reassembled packets, hence subject to expirty. */
 222 static void
 223 ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
 224                     long long now)
 225    /* OVS_REQUIRES(ipf->ipf_lock) */
 226 {
 227     enum {
 228         IPF_FRAG_LIST_TIMEOUT = 15000,
 229     };
 230
 231     ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
 232     ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
 233 }
 234
 235 /* Adds a list of fragments to the list of completed packets, which will be
 236  * subsequently transmitted. */
 237 static void
 238 ipf_completed_list_add(struct ovs_list *frag_complete_list,
 239                        struct ipf_list *ipf_list)
 240     /* OVS_REQUIRES(ipf_lock) */
 241 {
 242     ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
 243 }
 244
 245 /* Adds a reassmebled packet to the list of reassembled packets, awaiting some
 246  * processing, such as being sent through conntrack. */
 247 static void
 248 ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
 249                          struct reassembled_pkt *rp)
 250     /* OVS_REQUIRES(ipf_lock) */
 251 {
 252     ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
 253 }
 254
 255 /* Removed a frag list from tracking datastructures and frees list heap
 256  * memory. */
 257 static void
 258 ipf_list_clean(struct hmap *frag_lists,
 259                struct ipf_list *ipf_list)
 260     /* OVS_REQUIRES(ipf_lock) */
 261 {
 262     ovs_list_remove(&ipf_list->list_node);
 263     hmap_remove(frag_lists, &ipf_list->node);
 264     free(ipf_list->frag_list);
 265     free(ipf_list);
 266 }
 267
 268 /* Removed a frag list sitting on the expiry list from tracking
 269  * datastructures and frees list heap memory. */
 270 static void
 271 ipf_expiry_list_clean(struct hmap *frag_lists,
 272                       struct ipf_list *ipf_list)
 273     /* OVS_REQUIRES(ipf_lock) */
 274 {
 275     ipf_list_clean(frag_lists, ipf_list);
 276 }
 277
 278 /* Removed a frag list sitting on the completed list from tracking
 279  * datastructures and frees list heap memory. */
 280 static void
 281 ipf_completed_list_clean(struct hmap *frag_lists,
 282                          struct ipf_list *ipf_list)
 283     /* OVS_REQUIRES(ipf_lock) */
 284 {
 285     ipf_list_clean(frag_lists, ipf_list);
 286 }
 287
 288 static void
 289 ipf_expiry_list_remove(struct ipf_list *ipf_list)
 290     /* OVS_REQUIRES(ipf_lock) */
 291 {
 292     ovs_list_remove(&ipf_list->list_node);
 293 }
 294
 295 static void
 296 ipf_reassembled_list_remove(struct reassembled_pkt *rp)
 297     /* OVS_REQUIRES(ipf_lock) */
 298 {
 299     ovs_list_remove(&rp->rp_list_node);
 300 }
 301
 302 /* Symmetric */
 303 static uint32_t
 304 ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
 305 {
 306     uint32_t hsrc, hdst, hash;
 307     hsrc = hdst = basis;
 308     hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
 309     hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
 310     hash = hsrc ^ hdst;
 311
 312     /* Hash the rest of the key. */
 313     return hash_words((uint32_t *) (&key->dst_addr + 1),
 314                       (uint32_t *) (key + 1) -
 315                       (uint32_t *) (&key->dst_addr + 1),
 316                       hash);
 317 }
 318
 319 static bool
 320 ipf_is_first_v4_frag(const struct dp_packet *pkt)
 321 {
 322     const struct ip_header *l3 = dp_packet_l3(pkt);
 323     if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
 324         l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
 325         return true;
 326     }
 327     return false;
 328 }
 329
 330 static bool
 331 ipf_is_last_v4_frag(const struct dp_packet *pkt)
 332 {
 333     const struct ip_header *l3 = dp_packet_l3(pkt);
 334     if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
 335         !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
 336         return true;
 337     }
 338     return false;
 339 }
 340
 341 static bool
 342 ipf_is_v6_frag(ovs_be16 ip6f_offlg)
 343 {
 344     if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
 345         return true;
 346     }
 347     return false;
 348 }
 349
 350 static bool
 351 ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
 352 {
 353     if (!(ip6f_offlg & IP6F_OFF_MASK) &&
 354         ip6f_offlg & IP6F_MORE_FRAG) {
 355         return true;
 356     }
 357     return false;
 358 }
 359
 360 static bool
 361 ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
 362 {
 363     if ((ip6f_offlg & IP6F_OFF_MASK) &&
 364         !(ip6f_offlg & IP6F_MORE_FRAG)) {
 365         return true;
 366     }
 367     return false;
 368 }
 369
 370 /* Checks for a completed packet collection of fragments. */
 371 static bool
 372 ipf_list_complete(const struct ipf_list *ipf_list)
 373     /* OVS_REQUIRES(ipf_lock) */
 374 {
 375     for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
 376         if (ipf_list->frag_list[i - 1].end_data_byte + 1
 377             != ipf_list->frag_list[i].start_data_byte) {
 378             return false;
 379         }
 380     }
 381     return true;
 382 }
 383
 384 /* Runs O(n) for a sorted or almost sorted list. */
 385 static void
 386 ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
 387     /* OVS_REQUIRES(ipf_lock) */
 388 {
 389     for (int li = 1; li <= last_idx; li++) {
 390         struct ipf_frag ipf_frag = frag_list[li];
 391         int ci = li - 1;
 392         while (ci >= 0 &&
 393                frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
 394             frag_list[ci + 1] = frag_list[ci];
 395             ci--;
 396         }
 397         frag_list[ci + 1] = ipf_frag;
 398     }
 399 }
 400
 401 /* Called on a sorted complete list of v4 fragments to reassemble them into
 402  * a single packet that can be processed, such as passing through conntrack.
 403  */
 404 static struct dp_packet *
 405 ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
 406     /* OVS_REQUIRES(ipf_lock) */
 407 {
 408     struct ipf_frag *frag_list = ipf_list->frag_list;
 409     struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
 410     dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
 411     struct ip_header *l3 = dp_packet_l3(pkt);
 412     int len = ntohs(l3->ip_tot_len);
 413
 414     int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
 415                    frag_list[1].start_data_byte + 1;
 416
 417     if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
 418         ipf_print_reass_packet(
 419             "Unsupported big reassembled v4 packet; v4 hdr:", l3);
 420         dp_packet_delete(pkt);
 421         return NULL;
 422     }
 423
 424     dp_packet_prealloc_tailroom(pkt, rest_len);
 425
 426     for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
 427         size_t add_len = frag_list[i].end_data_byte -
 428                          frag_list[i].start_data_byte + 1;
 429         const char *l4 = dp_packet_l4(frag_list[i].pkt);
 430         dp_packet_put(pkt, l4, add_len);
 431     }
 432
 433     len += rest_len;
 434     l3 = dp_packet_l3(pkt);
 435     ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
 436     l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
 437                                 new_ip_frag_off);
 438     l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
 439     l3->ip_tot_len = htons(len);
 440     l3->ip_frag_off = new_ip_frag_off;
 441     dp_packet_set_l2_pad_size(pkt, 0);
 442
 443     return pkt;
 444 }
 445
 446 /* Called on a sorted complete list of v6 fragments to reassemble them into
 447  * a single packet that can be processed, such as passing through conntrack.
 448  */
 449 static struct dp_packet *
 450 ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
 451     /* OVS_REQUIRES(ipf_lock) */
 452 {
 453     struct ipf_frag *frag_list = ipf_list->frag_list;
 454     struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
 455     dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
 456     struct  ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
 457     int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);
 458
 459     int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
 460                    frag_list[1].start_data_byte + 1;
 461
 462     if (pl + rest_len > IPV6_PACKET_MAX_DATA) {
 463         ipf_print_reass_packet(
 464              "Unsupported big reassembled v6 packet; v6 hdr:", l3);
 465         dp_packet_delete(pkt);
 466         return NULL;
 467     }
 468
 469     dp_packet_prealloc_tailroom(pkt, rest_len);
 470
 471     for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
 472         size_t add_len = frag_list[i].end_data_byte -
 473                           frag_list[i].start_data_byte + 1;
 474         const char *l4 = dp_packet_l4(frag_list[i].pkt);
 475         dp_packet_put(pkt, l4, add_len);
 476     }
 477
 478     pl += rest_len;
 479     l3 = dp_packet_l3(pkt);
 480
 481     uint8_t nw_proto = l3->ip6_nxt;
 482     uint8_t nw_frag = 0;
 483     const void *data = l3 + 1;
 484     size_t datasize = pl;
 485
 486     const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
 487     if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
 488         || !nw_frag || !frag_hdr) {
 489
 490         ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
 491         dp_packet_delete(pkt);
 492         return NULL;
 493     }
 494
 495     struct ovs_16aligned_ip6_frag *fh =
 496         CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
 497     fh->ip6f_offlg = 0;
 498     l3->ip6_plen = htons(pl);
 499     l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
 500     dp_packet_set_l2_pad_size(pkt, 0);
 501     return pkt;
 502 }
 503
 504 /* Called when a frag list state transitions to another state. This is
 505  * triggered by new fragment for the list being received.*/
 506 static void
 507 ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
 508                           bool ff, bool lf, bool v6)
 509     OVS_REQUIRES(ipf->ipf_lock)
 510 {
 511     enum ipf_list_state curr_state = ipf_list->state;
 512     enum ipf_list_state next_state;
 513     switch (curr_state) {
 514     case IPF_LIST_STATE_UNUSED:
 515     case IPF_LIST_STATE_OTHER_SEEN:
 516         if (ff) {
 517             next_state = IPF_LIST_STATE_FIRST_SEEN;
 518         } else if (lf) {
 519             next_state = IPF_LIST_STATE_LAST_SEEN;
 520         } else {
 521             next_state = IPF_LIST_STATE_OTHER_SEEN;
 522         }
 523         break;
 524     case IPF_LIST_STATE_FIRST_SEEN:
 525         if (lf) {
 526             next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
 527         } else {
 528             next_state = IPF_LIST_STATE_FIRST_SEEN;
 529         }
 530         break;
 531     case IPF_LIST_STATE_LAST_SEEN:
 532         if (ff) {
 533             next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
 534         } else {
 535             next_state = IPF_LIST_STATE_LAST_SEEN;
 536         }
 537         break;
 538     case IPF_LIST_STATE_FIRST_LAST_SEEN:
 539         next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
 540         break;
 541     case IPF_LIST_STATE_COMPLETED:
 542     case IPF_LIST_STATE_REASS_FAIL:
 543     case IPF_LIST_STATE_NUM:
 544     default:
 545         OVS_NOT_REACHED();
 546     }
 547
 548     if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
 549         ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
 550         if (ipf_list_complete(ipf_list)) {
 551             struct dp_packet *reass_pkt = v6
 552                 ? ipf_reassemble_v6_frags(ipf_list)
 553                 : ipf_reassemble_v4_frags(ipf_list);
 554             if (reass_pkt) {
 555                 struct reassembled_pkt *rp = xzalloc(sizeof *rp);
 556                 rp->pkt = reass_pkt;
 557                 rp->list = ipf_list;
 558                 ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
 559                 ipf_expiry_list_remove(ipf_list);
 560                 next_state = IPF_LIST_STATE_COMPLETED;
 561             } else {
 562                 next_state = IPF_LIST_STATE_REASS_FAIL;
 563             }
 564         }
 565     }
 566     ipf_list->state = next_state;
 567 }
 568
 569 /* Some sanity checks are redundant, but prudent, in case code paths for
 570  * fragments change in future. The processing cost for fragments is not
 571  * important. */
 572 static bool
 573 ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
 574 {
 575     if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
 576         goto invalid_pkt;
 577     }
 578
 579     const struct eth_header *l2 = dp_packet_eth(pkt);
 580     const struct ip_header *l3 = dp_packet_l3(pkt);
 581
 582     if (OVS_UNLIKELY(!l2 || !l3)) {
 583         goto invalid_pkt;
 584     }
 585
 586     size_t l3_size = dp_packet_l3_size(pkt);
 587     if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
 588         goto invalid_pkt;
 589     }
 590
 591     if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
 592         return false;
 593     }
 594
 595     uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
 596     if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
 597         goto invalid_pkt;
 598     }
 599
 600     size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
 601     if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
 602         goto invalid_pkt;
 603     }
 604     if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
 605         goto invalid_pkt;
 606     }
 607
 608     if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
 609                      && csum(l3, ip_hdr_len) != 0)) {
 610         goto invalid_pkt;
 611     }
 612
 613     uint32_t min_v4_frag_size_;
 614     atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
 615     bool lf = ipf_is_last_v4_frag(pkt);
 616     if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v4_frag_size_)) {
 617         ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
 618         goto invalid_pkt;
 619     }
 620     return true;
 621
 622 invalid_pkt:
 623     pkt->md.ct_state = CS_INVALID;
 624     return false;
 625 }
 626
 627 static bool
 628 ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
 629                    struct ipf_list_key *key, uint16_t *start_data_byte,
 630                    uint16_t *end_data_byte, bool *ff, bool *lf)
 631 {
 632     const struct ip_header *l3 = dp_packet_l3(pkt);
 633     uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
 634     size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
 635
 636     *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
 637     *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
 638     *ff = ipf_is_first_v4_frag(pkt);
 639     *lf = ipf_is_last_v4_frag(pkt);
 640     memset(key, 0, sizeof *key);
 641     key->ip_id = be16_to_be32(l3->ip_id);
 642     key->dl_type = dl_type;
 643     key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
 644     key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
 645     key->nw_proto = l3->ip_proto;
 646     key->zone = zone;
 647     key->recirc_id = pkt->md.recirc_id;
 648     return true;
 649 }
 650
 651 /* Some sanity checks are redundant, but prudent, in case code paths for
 652  * fragments change in future. The processing cost for fragments is not
 653  * important. */
 654 static bool
 655 ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
 656 {
 657     const struct eth_header *l2 = dp_packet_eth(pkt);
 658     const struct  ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
 659     const char *l4 = dp_packet_l4(pkt);
 660
 661     if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
 662         goto invalid_pkt;
 663     }
 664
 665     size_t l3_size = dp_packet_l3_size(pkt);
 666     size_t l3_hdr_size = sizeof *l3;
 667
 668     if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
 669         goto invalid_pkt;
 670     }
 671
 672     uint8_t nw_frag = 0;
 673     uint8_t nw_proto = l3->ip6_nxt;
 674     const void *data = l3 + 1;
 675     size_t datasize = l3_size - l3_hdr_size;
 676     const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
 677     if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
 678                              &frag_hdr) || !nw_frag || !frag_hdr) {
 679         return false;
 680     }
 681
 682     int pl = ntohs(l3->ip6_plen);
 683     if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
 684         goto invalid_pkt;
 685     }
 686
 687     ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
 688     if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
 689         return false;
 690     }
 691
 692     uint32_t min_v6_frag_size_;
 693     atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
 694     bool lf = ipf_is_last_v6_frag(ip6f_offlg);
 695
 696     if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v6_frag_size_)) {
 697         ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
 698         goto invalid_pkt;
 699     }
 700
 701     return true;
 702
 703 invalid_pkt:
 704     pkt->md.ct_state = CS_INVALID;
 705     return false;
 706
 707 }
 708
 709 static void
 710 ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
 711                    struct ipf_list_key *key, uint16_t *start_data_byte,
 712                    uint16_t *end_data_byte, bool *ff, bool *lf)
 713 {
 714     const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
 715     uint8_t nw_frag = 0;
 716     uint8_t nw_proto = l3->ip6_nxt;
 717     const void *data = l3 + 1;
 718     size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3;
 719     const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
 720
 721     parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
 722     ovs_assert(nw_frag && frag_hdr);
 723     ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
 724     *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
 725         sizeof (struct ovs_16aligned_ip6_frag);
 726     *end_data_byte = *start_data_byte + dp_packet_l4_size(pkt) - 1;
 727     *ff = ipf_is_first_v6_frag(ip6f_offlg);
 728     *lf = ipf_is_last_v6_frag(ip6f_offlg);
 729     memset(key, 0, sizeof *key);
 730     key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
 731     key->dl_type = dl_type;
 732     memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
 733     /* We are not supporting parsing of the routing header to use as the
 734      * dst address part of the key. */
 735     memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
 736     key->nw_proto = 0;   /* Not used for key for V6. */
 737     key->zone = zone;
 738     key->recirc_id = pkt->md.recirc_id;
 739 }
 740
 741 static bool
 742 ipf_list_key_eq(const struct ipf_list_key *key1,
 743                 const struct ipf_list_key *key2)
 744     /* OVS_REQUIRES(ipf_lock) */
 745 {
 746     if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
 747         !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
 748         key1->dl_type == key2->dl_type &&
 749         key1->ip_id == key2->ip_id &&
 750         key1->zone == key2->zone &&
 751         key1->nw_proto == key2->nw_proto &&
 752         key1->recirc_id == key2->recirc_id) {
 753         return true;
 754     }
 755     return false;
 756 }
 757
 758 static struct ipf_list *
 759 ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
 760                     uint32_t hash)
 761     OVS_REQUIRES(ipf->ipf_lock)
 762 {
 763     struct ipf_list *ipf_list;
 764     HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
 765         if (ipf_list_key_eq(&ipf_list->key, key)) {
 766             return ipf_list;
 767         }
 768     }
 769     return NULL;
 770 }
 771
 772 static bool
 773 ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
 774                   size_t start_data_byte, size_t end_data_byte)
 775     /* OVS_REQUIRES(ipf_lock) */
 776 {
 777     for (int i = 0; i <= last_inuse_idx; i++) {
 778         if ((start_data_byte >= frag_list[i].start_data_byte &&
 779             start_data_byte <= frag_list[i].end_data_byte) ||
 780             (end_data_byte >= frag_list[i].start_data_byte &&
 781              end_data_byte <= frag_list[i].end_data_byte)) {
 782             return true;
 783         }
 784     }
 785     return false;
 786 }
 787
 788 /* Adds a fragment to a list of fragments, if the fragment is not a
 789  * duplicate. If the fragment is a duplicate, that fragment is marked
 790  * invalid to avoid the work that conntrack would do to mark the fragment
 791  * as invalid, which it will in all cases. */
 792 static bool
 793 ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
 794                  struct dp_packet *pkt, uint16_t start_data_byte,
 795                  uint16_t end_data_byte, bool ff, bool lf, bool v6,
 796                  bool dnsteal)
 797     OVS_REQUIRES(ipf->ipf_lock)
 798 {
 799     bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
 800         ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
 801     int last_inuse_idx = ipf_list->last_inuse_idx;
 802
 803     if (!duped_frag) {
 804         if (last_inuse_idx < ipf_list->size - 1) {
 805             /* In the case of dpdk, it would be unfortunate if we had
 806              * to create a clone fragment outside the dpdk mp due to the
 807              * mempool size being too limited. We will otherwise need to
 808              * recommend not setting the mempool number of buffers too low
 809              * and also clamp the number of fragments. */
 810             struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
 811             frag->pkt = pkt;
 812             frag->start_data_byte = start_data_byte;
 813             frag->end_data_byte = end_data_byte;
 814             frag->dnsteal = dnsteal;
 815             ipf_list->last_inuse_idx++;
 816             atomic_count_inc(&ipf->nfrag);
 817             ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
 818             ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
 819         } else {
 820             OVS_NOT_REACHED();
 821         }
 822     } else {
 823         ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
 824         pkt->md.ct_state = CS_INVALID;
 825         return false;
 826     }
 827     return true;
 828 }
 829
 830 static void
 831 ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
 832               int max_frag_list_size)
 833 {
 834     ipf_list->key = *key;
 835     ipf_list->last_inuse_idx = IPF_INVALID_IDX;
 836     ipf_list->last_sent_idx = IPF_INVALID_IDX;
 837     ipf_list->reass_execute_ctx = NULL;
 838     ipf_list->state = IPF_LIST_STATE_UNUSED;
 839     ipf_list->size = max_frag_list_size;
 840     ipf_list->frag_list
 841         = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
 842 }
 843
 844 /* Generates a fragment list key from a well formed fragment and either starts
 845  * a new fragment list or increases the size of the existing fragment list,
 846  * while checking if the maximum supported fragements are supported or the
 847  * list size is impossibly big. Calls 'ipf_process_frag()' to add a fragment
 848  * to a list of fragemnts. */
 849 static bool
 850 ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
 851                 uint16_t zone, long long now, uint32_t hash_basis,
 852                 bool dnsteal)
 853     OVS_REQUIRES(ipf->ipf_lock)
 854 {
 855     struct ipf_list_key key;
 856     /* Initialize 4 variables for some versions of GCC. */
 857     uint16_t start_data_byte = 0;
 858     uint16_t end_data_byte = 0;
 859     bool ff = false;
 860     bool lf = false;
 861     bool v6 = dl_type == htons(ETH_TYPE_IPV6);
 862
 863     if (v6 && ipf_get_v6_enabled(ipf)) {
 864         ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
 865                            &end_data_byte, &ff, &lf);
 866     } else if (!v6 && ipf_get_v4_enabled(ipf)) {
 867         ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
 868                            &end_data_byte, &ff, &lf);
 869     } else {
 870         return false;
 871     }
 872
 873     unsigned int nfrag_max;
 874     atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
 875     if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
 876         return false;
 877     }
 878
 879     uint32_t hash = ipf_list_key_hash(&key, hash_basis);
 880     struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
 881     enum {
 882         IPF_FRAG_LIST_MIN_INCREMENT = 4,
 883         IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
 884     };
 885
 886     int max_frag_list_size;
 887     if (v6) {
 888         /* Because the calculation with extension headers is variable,
 889          * we don't calculate a hard maximum fragment list size upfront.  The
 890          * fragment list size is practically limited by the code, however. */
 891         max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
 892     } else {
 893         max_frag_list_size = ipf->max_v4_frag_list_size;
 894     }
 895
 896     if (!ipf_list) {
 897         ipf_list = xmalloc(sizeof *ipf_list);
 898         ipf_list_init(ipf_list, &key,
 899                       MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
 900         hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
 901         ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
 902     } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL ||
 903                ipf_list->state == IPF_LIST_STATE_COMPLETED) {
 904         /* Bail out as early as possible. */
 905         return false;
 906     } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
 907         int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
 908                             max_frag_list_size - ipf_list->size);
 909         /* Enforce limit. */
 910         if (increment > 0) {
 911             ipf_list->frag_list =
 912                 xrealloc(ipf_list->frag_list, (ipf_list->size + increment) *
 913                   sizeof *ipf_list->frag_list);
 914             ipf_list->size += increment;
 915         } else {
 916             return false;
 917         }
 918     }
 919
 920     return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
 921                             end_data_byte, ff, lf, v6, dnsteal);
 922 }
 923
 924 /* Filters out fragments from a batch of fragments and adjust the batch. */
 925 static void
 926 ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
 927                              ovs_be16 dl_type, uint16_t zone, long long now,
 928                              uint32_t hash_basis)
 929 {
 930     const size_t pb_cnt = dp_packet_batch_size(pb);
 931     int pb_idx; /* Index in a packet batch. */
 932     struct dp_packet *pkt;
 933
 934     DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
 935         if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
 936                           ipf_is_valid_v4_frag(ipf, pkt))
 937                           ||
 938                           (dl_type == htons(ETH_TYPE_IPV6) &&
 939                           ipf_is_valid_v6_frag(ipf, pkt)))) {
 940
 941             ovs_mutex_lock(&ipf->ipf_lock);
 942             if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
 943                                  pb->do_not_steal)) {
 944                 dp_packet_batch_refill(pb, pkt, pb_idx);
 945             }
 946             ovs_mutex_unlock(&ipf->ipf_lock);
 947         } else {
 948             dp_packet_batch_refill(pb, pkt, pb_idx);
 949         }
 950     }
 951 }
 952
 953 /* In case of DPDK, a memory source check is done, as DPDK memory pool
 954  * management has trouble dealing with multiple source types.  The
 955  * check_source paramater is used to indicate when this check is needed. */
 956 static bool
 957 ipf_dp_packet_batch_add(struct dp_packet_batch *pb , struct dp_packet *pkt,
 958                         bool check_source OVS_UNUSED)
 959 {
 960 #ifdef DPDK_NETDEV
 961     if ((dp_packet_batch_is_full(pb)) ||
 962         /* DPDK cannot handle multiple sources in a batch. */
 963         (check_source && !dp_packet_batch_is_empty(pb)
 964          && pb->packets[0]->source != pkt->source)) {
 965 #else
 966     if (dp_packet_batch_is_full(pb)) {
 967 #endif
 968         return false;
 969     }
 970
 971     dp_packet_batch_add(pb, pkt);
 972     return true;
 973 }
 974
 975 /* This would be used in rare cases where a list cannot be sent. One rare
 976  * reason known right now is a mempool source check, which exists due to DPDK
 977  * support, where packets are no longer being received on any port with a
 978  * source matching the fragment.  Another reason is a race where all
 979  * conntrack rules are unconfigured when some fragments are yet to be
 980  * flushed.
 981  *
 982  * Returns true if the list was purged. */
 983 static bool
 984 ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
 985                      long long now)
 986     OVS_REQUIRES(ipf->ipf_lock)
 987 {
 988     enum {
 989         IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
 990     };
 991
 992     if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
 993         return false;
 994     }
 995
 996     while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
 997         struct dp_packet * pkt
 998             = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
 999         dp_packet_delete(pkt);
1000         atomic_count_dec(&ipf->nfrag);
1001         COVERAGE_INC(ipf_stuck_frag_list_purged);
1002         ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
1003                   IPF_NFRAGS_PURGED);
1004         ipf_list->last_sent_idx++;
1005     }
1006
1007     return true;
1008 }
1009
1010 /* Does the packet batch management and common accounting work associated
1011  * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
1012 static bool
1013 ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
1014                        struct dp_packet_batch *pb,
1015                        enum ipf_list_type list_type, bool v6, long long now)
1016     OVS_REQUIRES(ipf->ipf_lock)
1017 {
1018     if (ipf_purge_list_check(ipf, ipf_list, now)) {
1019         return true;
1020     }
1021
1022     while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1023         struct dp_packet *pkt
1024             = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1025         if (ipf_dp_packet_batch_add(pb, pkt, true)) {
1026             ipf_list->last_sent_idx++;
1027             atomic_count_dec(&ipf->nfrag);
1028
1029             if (list_type == IPF_FRAG_COMPLETED_LIST) {
1030                 ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
1031             } else {
1032                 ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
1033                 pkt->md.ct_state = CS_INVALID;
1034             }
1035
1036             if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
1037                 return true;
1038             }
1039         } else {
1040             return false;
1041         }
1042     }
1043     OVS_NOT_REACHED();
1044 }
1045
1046 /* Adds fragments associated with a completed fragment list to a packet batch
1047  * to be processed by the calling application, typically conntrack. Also
1048  * cleans up the list context when it is empty.*/
1049 static void
1050 ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
1051                          long long now, bool v6)
1052 {
1053     if (ovs_list_is_empty(&ipf->frag_complete_list)) {
1054         return;
1055     }
1056
1057     ovs_mutex_lock(&ipf->ipf_lock);
1058     struct ipf_list *ipf_list, *next;
1059
1060     LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
1061         if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
1062                                    v6, now)) {
1063             ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
1064         } else {
1065             break;
1066         }
1067     }
1068
1069     ovs_mutex_unlock(&ipf->ipf_lock);
1070 }
1071
1072 /* Conservatively adds fragments associated with a expired fragment list to
1073  * a packet batch to be processed by the calling application, typically
1074  * conntrack. Also cleans up the list context when it is empty.*/
1075 static void
1076 ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
1077                        long long now, bool v6)
1078 {
1079     enum {
1080         /* Very conservative, due to DOS probability. */
1081         IPF_FRAG_LIST_MAX_EXPIRED = 1,
1082     };
1083
1084
1085     if (ovs_list_is_empty(&ipf->frag_exp_list)) {
1086         return;
1087     }
1088
1089     ovs_mutex_lock(&ipf->ipf_lock);
1090     struct ipf_list *ipf_list, *next;
1091     size_t lists_removed = 0;
1092
1093     LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
1094         if (now <= ipf_list->expiration ||
1095             lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
1096             break;
1097         }
1098
1099         if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
1100                                    v6, now)) {
1101             ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
1102             lists_removed++;
1103         } else {
1104             break;
1105         }
1106     }
1107
1108     ovs_mutex_unlock(&ipf->ipf_lock);
1109 }
1110
1111 /* Adds a reassmebled packet to a packet batch to be processed by the caller.
1112  */
1113 static void
1114 ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
1115 {
1116     if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
1117         return;
1118     }
1119
1120     ovs_mutex_lock(&ipf->ipf_lock);
1121     struct reassembled_pkt *rp, *next;
1122
1123     LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
1124         if (!rp->list->reass_execute_ctx &&
1125             ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
1126             rp->list->reass_execute_ctx = rp->pkt;
1127         }
1128     }
1129
1130     ovs_mutex_unlock(&ipf->ipf_lock);
1131 }
1132
1133 /* Checks for reassembled packets post processing by conntrack and edits the
1134  * fragments if needed based on what conntrack decided. */
1135 static void
1136 ipf_post_execute_reass_pkts(struct ipf *ipf,
1137                             struct dp_packet_batch *pb, bool v6)
1138 {
1139     if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
1140         return;
1141     }
1142
1143     ovs_mutex_lock(&ipf->ipf_lock);
1144     struct reassembled_pkt *rp, *next;
1145
1146     LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
1147         const size_t pb_cnt = dp_packet_batch_size(pb);
1148         int pb_idx;
1149         struct dp_packet *pkt;
1150         /* Inner batch loop is constant time since batch size is <=
1151          * NETDEV_MAX_BURST. */
1152         DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
1153             if (pkt == rp->list->reass_execute_ctx) {
1154                 for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
1155                     rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
1156                     rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
1157                     rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
1158                     rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
1159                     rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
1160                         pkt->md.ct_orig_tuple_ipv6;
1161                     if (pkt->md.ct_orig_tuple_ipv6) {
1162                         rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
1163                             pkt->md.ct_orig_tuple.ipv6;
1164                     } else {
1165                         rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4  =
1166                             pkt->md.ct_orig_tuple.ipv4;
1167                     }
1168                 }
1169
1170                 const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
1171                 void *l4_frag = dp_packet_l4(frag_0->pkt);
1172                 void *l4_reass = dp_packet_l4(pkt);
1173                 memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
1174
1175                 if (v6) {
1176                     struct ovs_16aligned_ip6_hdr *l3_frag
1177                         = dp_packet_l3(frag_0->pkt);
1178                     struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
1179                     l3_frag->ip6_src = l3_reass->ip6_src;
1180                     l3_frag->ip6_dst = l3_reass->ip6_dst;
1181                 } else {
1182                     struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
1183                     struct ip_header *l3_reass = dp_packet_l3(pkt);
1184                     ovs_be32 reass_ip = get_16aligned_be32(&l3_reass->ip_src);
1185                     ovs_be32 frag_ip = get_16aligned_be32(&l3_frag->ip_src);
1186                     l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
1187                                                      frag_ip, reass_ip);
1188                     l3_frag->ip_src = l3_reass->ip_src;
1189
1190                     reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
1191                     frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
1192                     l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
1193                                                      frag_ip, reass_ip);
1194                     l3_frag->ip_dst = l3_reass->ip_dst;
1195                 }
1196
1197                 ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
1198                 ipf_reassembled_list_remove(rp);
1199                 dp_packet_delete(rp->pkt);
1200                 free(rp);
1201             } else {
1202                 dp_packet_batch_refill(pb, pkt, pb_idx);
1203             }
1204         }
1205     }
1206
1207     ovs_mutex_unlock(&ipf->ipf_lock);
1208 }
1209
1210 /* Extracts any fragments from the batch and reassembles them when a
1211  * complete packet is received.  Completed packets are attempted to
1212  * be added to the batch to be sent through conntrack. */
1213 void
1214 ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
1215                          long long now, ovs_be16 dl_type, uint16_t zone,
1216                          uint32_t hash_basis)
1217 {
1218     if (ipf_get_enabled(ipf)) {
1219         ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
1220     }
1221
1222     if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
1223         ipf_execute_reass_pkts(ipf, pb);
1224     }
1225 }
1226
1227 /* Updates fragments based on the processing of the reassembled packet sent
1228  * through conntrack and adds these fragments to any batches seen.  Expired
1229  * fragments are marked as invalid and also added to the batches seen
1230  * with low priority.  Reassembled packets are freed. */
1231 void
1232 ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
1233                           long long now, ovs_be16 dl_type)
1234 {
1235     if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
1236         bool v6 = dl_type == htons(ETH_TYPE_IPV6);
1237         ipf_post_execute_reass_pkts(ipf, pb, v6);
1238         ipf_send_completed_frags(ipf, pb, now, v6);
1239         ipf_send_expired_frags(ipf, pb, now, v6);
1240     }
1241 }
1242
1243 static void *
1244 ipf_clean_thread_main(void *f)
1245 {
1246     struct ipf *ipf = f;
1247
1248     enum {
1249         IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
1250     };
1251
1252     while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {
1253
1254         long long now = time_msec();
1255
1256         if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
1257             !ovs_list_is_empty(&ipf->frag_complete_list)) {
1258
1259             ovs_mutex_lock(&ipf->ipf_lock);
1260
1261             struct ipf_list *ipf_list, *next;
1262             LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
1263                                 &ipf->frag_exp_list) {
1264                 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1265                     ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
1266                 }
1267             }
1268
1269             LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
1270                                 &ipf->frag_complete_list) {
1271                 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1272                     ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
1273                 }
1274             }
1275
1276             ovs_mutex_unlock(&ipf->ipf_lock);
1277         }
1278
1279         poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
1280         latch_wait(&ipf->ipf_clean_thread_exit);
1281         poll_block();
1282     }
1283
1284     return NULL;
1285 }
1286
1287 struct ipf *
1288 ipf_init(void)
1289 {
1290     struct ipf *ipf = xzalloc(sizeof *ipf);
1291
1292     ovs_mutex_init_adaptive(&ipf->ipf_lock);
1293     ovs_mutex_lock(&ipf->ipf_lock);
1294     hmap_init(&ipf->frag_lists);
1295     ovs_list_init(&ipf->frag_exp_list);
1296     ovs_list_init(&ipf->frag_complete_list);
1297     ovs_list_init(&ipf->reassembled_pkt_list);
1298     atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
1299     atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
1300     ipf->max_v4_frag_list_size = DIV_ROUND_UP(
1301         IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
1302         ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
1303     ovs_mutex_unlock(&ipf->ipf_lock);
1304     atomic_count_init(&ipf->nfrag, 0);
1305     for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
1306         atomic_init(&ipf->n4frag_cnt[i], 0);
1307         atomic_init(&ipf->n6frag_cnt[i], 0);
1308     }
1309     atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
1310     atomic_init(&ipf->ifp_v4_enabled, true);
1311     atomic_init(&ipf->ifp_v6_enabled, true);
1312     latch_init(&ipf->ipf_clean_thread_exit);
1313     ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
1314                                          ipf_clean_thread_main, ipf);
1315
1316     return ipf;
1317 }
1318
1319 void
1320 ipf_destroy(struct ipf *ipf)
1321 {
1322     ovs_mutex_lock(&ipf->ipf_lock);
1323     latch_set(&ipf->ipf_clean_thread_exit);
1324     pthread_join(ipf->ipf_clean_thread, NULL);
1325     latch_destroy(&ipf->ipf_clean_thread_exit);
1326
1327     struct ipf_list *ipf_list;
1328     HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
1329         while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1330             struct dp_packet *pkt
1331                 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1332             if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
1333                 dp_packet_delete(pkt);
1334             }
1335             atomic_count_dec(&ipf->nfrag);
1336             ipf_list->last_sent_idx++;
1337         }
1338         free(ipf_list->frag_list);
1339         free(ipf_list);
1340     }
1341
1342     if (atomic_count_get(&ipf->nfrag)) {
1343         VLOG_WARN("ipf destroy with non-zero fragment count. ");
1344     }
1345
1346     struct reassembled_pkt *rp;
1347     LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
1348         dp_packet_delete(rp->pkt);
1349         free(rp);
1350     }
1351
1352     hmap_destroy(&ipf->frag_lists);
1353     ovs_list_poison(&ipf->frag_exp_list);
1354     ovs_list_poison(&ipf->frag_complete_list);
1355     ovs_list_poison(&ipf->reassembled_pkt_list);
1356     ovs_mutex_unlock(&ipf->ipf_lock);
1357     ovs_mutex_destroy(&ipf->ipf_lock);
1358     free(ipf);
1359 }
1360
1361 int
1362 ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
1363 {
1364     atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
1365                          enable);
1366     return 0;
1367 }
1368
1369 int
1370 ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
1371 {
1372     /* If the user specifies an unreasonably large number, fragmentation
1373      * will not work well but it will not blow up. */
1374     if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND :  IPF_V4_FRAG_SIZE_LBOUND)) {
1375         return 1;
1376     }
1377
1378     ovs_mutex_lock(&ipf->ipf_lock);
1379     if (v6) {
1380         atomic_store_relaxed(&ipf->min_v6_frag_size, value);
1381     } else {
1382         atomic_store_relaxed(&ipf->min_v4_frag_size, value);
1383         ipf->max_v4_frag_list_size = DIV_ROUND_UP(
1384             IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
1385             ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
1386     }
1387     ovs_mutex_unlock(&ipf->ipf_lock);
1388     return 0;
1389 }
1390
1391 int
1392 ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
1393 {
1394     if (value > IPF_NFRAG_UBOUND) {
1395         return 1;
1396     }
1397     atomic_store_relaxed(&ipf->nfrag_max, value);
1398     return 0;
1399 }
1400
1401 int
1402 ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
1403 {
1404     ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
1405     atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);
1406
1407     atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
1408     atomic_read_relaxed(&ipf->min_v4_frag_size,
1409                         &ipf_status->v4.min_frag_size);
1410     atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
1411                         &ipf_status->v4.nfrag_accepted);
1412     atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
1413                         &ipf_status->v4.nfrag_completed_sent);
1414     atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
1415                         &ipf_status->v4.nfrag_expired_sent);
1416     atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
1417                         &ipf_status->v4.nfrag_too_small);
1418     atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
1419                         &ipf_status->v4.nfrag_overlap);
1420     atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
1421                         &ipf_status->v4.nfrag_purged);
1422
1423     atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
1424     atomic_read_relaxed(&ipf->min_v6_frag_size,
1425                         &ipf_status->v6.min_frag_size);
1426     atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
1427                         &ipf_status->v6.nfrag_accepted);
1428     atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
1429                         &ipf_status->v6.nfrag_completed_sent);
1430     atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
1431                         &ipf_status->v6.nfrag_expired_sent);
1432     atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
1433                         &ipf_status->v6.nfrag_too_small);
1434     atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
1435                         &ipf_status->v6.nfrag_overlap);
1436     atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
1437                         &ipf_status->v6.nfrag_purged);
1438     return 0;
1439 }
1440
/* State of a dump of the fragment lists that is in progress.  Allocated by
 * ipf_dump_start(), used by ipf_dump_next() and freed by ipf_dump_done(). */
struct ipf_dump_ctx {
    /* Position in the hash map of fragment lists at which the next call to
     * ipf_dump_next() continues. */
    struct hmap_position bucket_pos;
};
1444
1445 /* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
1446  * caller must call ipf_dump_done() when dumping is finished. */
1447 int
1448 ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
1449 {
1450     *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
1451     return 0;
1452 }
1453
1454 /* Creates a string representation of the state of an 'ipf_list' and puts
1455  * it in 'ds'. */
1456 static void
1457 ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
1458 {
1459     ds_put_cstr(ds, "(");
1460     if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
1461         ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
1462                       IP_ARGS(ipf_list->key.src_addr.ipv4),
1463                       IP_ARGS(ipf_list->key.dst_addr.ipv4));
1464     } else {
1465         ds_put_cstr(ds, "src=");
1466         ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
1467         ds_put_cstr(ds, ",dst=");
1468         ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
1469         ds_put_cstr(ds, ",");
1470     }
1471
1472     ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
1473                   ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
1474                   ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
1475                   ipf_list->key.nw_proto);
1476
1477     ds_put_format(ds, ",num_fragments=%u,state=%s",
1478                   ipf_list->last_inuse_idx + 1,
1479                   ipf_state_name[ipf_list->state]);
1480
1481     ds_put_cstr(ds, ")");
1482 }
1483
1484 /* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
1485  * ipf_dump_create() to create a string representation of the state of an
1486  * ipf list, to which 'dump' is pointed to.  Returns EOF when there are no
1487  * more ipf lists. */
1488 int
1489 ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
1490 {
1491     ovs_mutex_lock(&ipf->ipf_lock);
1492
1493     struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
1494                                               &ipf_dump_ctx->bucket_pos);
1495     if (!node) {
1496         ovs_mutex_unlock(&ipf->ipf_lock);
1497         return EOF;
1498     } else {
1499         struct ipf_list *ipf_list_;
1500         INIT_CONTAINER(ipf_list_, node, node);
1501         struct ipf_list ipf_list = *ipf_list_;
1502         ovs_mutex_unlock(&ipf->ipf_lock);
1503         struct ds ds = DS_EMPTY_INITIALIZER;
1504         ipf_dump_create(&ipf_list, &ds);
1505         *dump = ds_steal_cstr(&ds);
1506         return 0;
1507     }
1508 }
1509
/* Ends a dump: releases the 'ipf_dump_ctx' that ipf_dump_start() allocated.
 * Always returns 0. */
int
ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
{
    free(ipf_dump_ctx);

    return 0;
}