lib/conntrack.c

   1 /*
   2  * Copyright (c) 2015, 2016 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "conntrack.h"
  19
  20 #include <errno.h>
  21 #include <sys/types.h>
  22 #include <netinet/in.h>
  23 #include <netinet/icmp6.h>
  24
  25 #include "bitmap.h"
  26 #include "conntrack-private.h"
  27 #include "coverage.h"
  28 #include "csum.h"
  29 #include "ct-dpif.h"
  30 #include "dp-packet.h"
  31 #include "flow.h"
  32 #include "netdev.h"
  33 #include "odp-netlink.h"
  34 #include "openvswitch/hmap.h"
  35 #include "openvswitch/vlog.h"
  36 #include "ovs-rcu.h"
  37 #include "ovs-thread.h"
  38 #include "poll-loop.h"
  39 #include "random.h"
  40 #include "timeval.h"
  41
  42
  43 VLOG_DEFINE_THIS_MODULE(conntrack);
  44
  45 COVERAGE_DEFINE(conntrack_full);
  46 COVERAGE_DEFINE(conntrack_long_cleanup);
  47
/* Scratch state for handling one packet: the key extracted from it and the
 * outcome of looking that key up in a bucket. */
struct conn_lookup_ctx {
    struct conn_key key;    /* Key to search for. */
    struct conn *conn;      /* Lookup result; callers in this file preset it
                             * to NULL before calling conn_key_lookup(). */
    uint32_t hash;          /* Hash of 'key'; hash_to_bucket() derives the
                             * bucket index from it. */
    bool reply;             /* When set, the packet is flagged CS_REPLY_DIR
                             * and NAT is undone rather than applied. */
    bool related;           /* When set, the packet is flagged CS_RELATED
                             * instead of updating the connection state. */
};
  55
  56 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
  57                              ovs_be16 dl_type, struct conn_lookup_ctx *,
  58                              uint16_t zone);
  59 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
  60 static void conn_key_reverse(struct conn_key *);
  61 static void conn_key_lookup(struct conntrack_bucket *ctb,
  62                             struct conn_lookup_ctx *ctx,
  63                             long long now);
  64 static bool valid_new(struct dp_packet *pkt, struct conn_key *);
  65 static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
  66                              struct conn_key *, long long now);
  67 static void delete_conn(struct conn *);
  68 static enum ct_update_res conn_update(struct conn *,
  69                                       struct conntrack_bucket *ctb,
  70                                       struct dp_packet *, bool reply,
  71                                       long long now);
  72 static bool conn_expired(struct conn *, long long now);
  73 static void set_mark(struct dp_packet *, struct conn *,
  74                      uint32_t val, uint32_t mask);
  75 static void set_label(struct dp_packet *, struct conn *,
  76                       const struct ovs_key_ct_labels *val,
  77                       const struct ovs_key_ct_labels *mask);
  78 static void *clean_thread_main(void *f_);
  79
  80 static struct nat_conn_key_node *
  81 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
  82                      const struct conn_key *key,
  83                      uint32_t basis);
  84
  85 static void
  86 nat_conn_keys_remove(struct hmap *nat_conn_keys,
  87                      const struct conn_key *key,
  88                      uint32_t basis);
  89
  90 static bool
  91 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
  92                        struct conn *nat_conn);
  93
  94 static uint8_t
  95 reverse_icmp_type(uint8_t type);
  96 static uint8_t
  97 reverse_icmp6_type(uint8_t type);
  98 static inline bool
  99 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
 100                 const char **new_data, bool validate_checksum);
 101 static inline bool
 102 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
 103                 const char **new_data);
 104
/* L4 protocol handlers, indexed by IP protocol number.  Only the protocols
 * listed here have a handler; every other slot up to the highest listed index
 * is implicitly NULL.  UDP is served by the generic 'ct_proto_other'
 * handler. */
static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_icmp4,
    [IPPROTO_ICMPV6] = &ct_proto_icmp6,
};
 111
/* Timeout value for each CT_TM_* class.  The table is generated from the
 * CT_TIMEOUTS list: every CT_TIMEOUT(NAME, VAL) entry in it expands to
 * '[CT_TM_NAME] = VAL,'. */
long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};
 117
 118 /* If the total number of connections goes above this value, no new connections
 119  * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
 120 #define DEFAULT_N_CONN_LIMIT 3000000
 121
 122 /* Initializes the connection tracker 'ct'.  The caller is responsible for
 123  * calling 'conntrack_destroy()', when the instance is not needed anymore */
 124 void
 125 conntrack_init(struct conntrack *ct)
 126 {
 127     unsigned i, j;
 128     long long now = time_msec();
 129
 130     ct_rwlock_init(&ct->nat_resources_lock);
 131     ct_rwlock_wrlock(&ct->nat_resources_lock);
 132     hmap_init(&ct->nat_conn_keys);
 133     ct_rwlock_unlock(&ct->nat_resources_lock);
 134
 135     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
 136         struct conntrack_bucket *ctb = &ct->buckets[i];
 137
 138         ct_lock_init(&ctb->lock);
 139         ct_lock_lock(&ctb->lock);
 140         hmap_init(&ctb->connections);
 141         for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
 142             ovs_list_init(&ctb->exp_lists[j]);
 143         }
 144         ct_lock_unlock(&ctb->lock);
 145         ovs_mutex_init(&ctb->cleanup_mutex);
 146         ovs_mutex_lock(&ctb->cleanup_mutex);
 147         ctb->next_cleanup = now + CT_TM_MIN;
 148         ovs_mutex_unlock(&ctb->cleanup_mutex);
 149     }
 150     ct->hash_basis = random_uint32();
 151     atomic_count_init(&ct->n_conn, 0);
 152     atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
 153     latch_init(&ct->clean_thread_exit);
 154     ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
 155 }
 156
 157 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
 158 void
 159 conntrack_destroy(struct conntrack *ct)
 160 {
 161     unsigned i;
 162
 163     latch_set(&ct->clean_thread_exit);
 164     pthread_join(ct->clean_thread, NULL);
 165     latch_destroy(&ct->clean_thread_exit);
 166     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
 167         struct conntrack_bucket *ctb = &ct->buckets[i];
 168         struct conn *conn;
 169
 170         ovs_mutex_destroy(&ctb->cleanup_mutex);
 171         ct_lock_lock(&ctb->lock);
 172         HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
 173             if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
 174                 atomic_count_dec(&ct->n_conn);
 175             }
 176             delete_conn(conn);
 177         }
 178         hmap_destroy(&ctb->connections);
 179         ct_lock_unlock(&ctb->lock);
 180         ct_lock_destroy(&ctb->lock);
 181     }
 182     ct_rwlock_wrlock(&ct->nat_resources_lock);
 183     struct nat_conn_key_node *nat_conn_key_node;
 184     HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
 185         free(nat_conn_key_node);
 186     }
 187     hmap_destroy(&ct->nat_conn_keys);
 188     ct_rwlock_unlock(&ct->nat_resources_lock);
 189     ct_rwlock_destroy(&ct->nat_resources_lock);
 190 }
 191 \f
 192 static unsigned hash_to_bucket(uint32_t hash)
 193 {
 194     /* Extracts the most significant bits in hash. The least significant bits
 195      * are already used internally by the hmap implementation. */
 196     BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
 197
 198     return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
 199 }
 200
 201 static void
 202 write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
 203             const struct conn_key *key)
 204 {
 205     pkt->md.ct_state |= CS_TRACKED;
 206     pkt->md.ct_zone = zone;
 207     pkt->md.ct_mark = conn ? conn->mark : 0;
 208     pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
 209
 210     /* Use the original direction tuple if we have it. */
 211     if (conn) {
 212         key = &conn->key;
 213     }
 214     pkt->md.ct_orig_tuple_ipv6 = false;
 215     if (key) {
 216         if (key->dl_type == htons(ETH_TYPE_IP)) {
 217             pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
 218                 key->src.addr.ipv4_aligned,
 219                 key->dst.addr.ipv4_aligned,
 220                 key->nw_proto != IPPROTO_ICMP
 221                 ? key->src.port : htons(key->src.icmp_type),
 222                 key->nw_proto != IPPROTO_ICMP
 223                 ? key->dst.port : htons(key->src.icmp_code),
 224                 key->nw_proto,
 225             };
 226         } else {
 227             pkt->md.ct_orig_tuple_ipv6 = true;
 228             pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
 229                 key->src.addr.ipv6_aligned,
 230                 key->dst.addr.ipv6_aligned,
 231                 key->nw_proto != IPPROTO_ICMPV6
 232                 ? key->src.port : htons(key->src.icmp_type),
 233                 key->nw_proto != IPPROTO_ICMPV6
 234                 ? key->dst.port : htons(key->src.icmp_code),
 235                 key->nw_proto,
 236             };
 237         }
 238     } else {
 239         memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
 240     }
 241
 242 }
 243
 244 static void
 245 pat_packet(struct dp_packet *pkt, const struct conn *conn)
 246 {
 247     if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 248         if (conn->key.nw_proto == IPPROTO_TCP) {
 249             struct tcp_header *th = dp_packet_l4(pkt);
 250             packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
 251         } else if (conn->key.nw_proto == IPPROTO_UDP) {
 252             struct udp_header *uh = dp_packet_l4(pkt);
 253             packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
 254         }
 255     } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 256         if (conn->key.nw_proto == IPPROTO_TCP) {
 257             struct tcp_header *th = dp_packet_l4(pkt);
 258             packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
 259         } else if (conn->key.nw_proto == IPPROTO_UDP) {
 260             struct udp_header *uh = dp_packet_l4(pkt);
 261             packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
 262         }
 263     }
 264 }
 265
 266 static void
 267 nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
 268 {
 269     if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 270         pkt->md.ct_state |= CS_SRC_NAT;
 271         if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
 272             struct ip_header *nh = dp_packet_l3(pkt);
 273             packet_set_ipv4_addr(pkt, &nh->ip_src,
 274                                  conn->rev_key.dst.addr.ipv4_aligned);
 275         } else {
 276             struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
 277             packet_set_ipv6_addr(pkt, conn->key.nw_proto,
 278                                  nh6->ip6_src.be32,
 279                                  &conn->rev_key.dst.addr.ipv6_aligned,
 280                                  true);
 281         }
 282         if (!related) {
 283             pat_packet(pkt, conn);
 284         }
 285     } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 286         pkt->md.ct_state |= CS_DST_NAT;
 287         if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
 288             struct ip_header *nh = dp_packet_l3(pkt);
 289             packet_set_ipv4_addr(pkt, &nh->ip_dst,
 290                                  conn->rev_key.src.addr.ipv4_aligned);
 291         } else {
 292             struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
 293             packet_set_ipv6_addr(pkt, conn->key.nw_proto,
 294                                  nh6->ip6_dst.be32,
 295                                  &conn->rev_key.src.addr.ipv6_aligned,
 296                                  true);
 297         }
 298         if (!related) {
 299             pat_packet(pkt, conn);
 300         }
 301     }
 302 }
 303
 304 static void
 305 un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
 306 {
 307     if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 308         if (conn->key.nw_proto == IPPROTO_TCP) {
 309             struct tcp_header *th = dp_packet_l4(pkt);
 310             packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
 311         } else if (conn->key.nw_proto == IPPROTO_UDP) {
 312             struct udp_header *uh = dp_packet_l4(pkt);
 313             packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
 314         }
 315     } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 316         if (conn->key.nw_proto == IPPROTO_TCP) {
 317             struct tcp_header *th = dp_packet_l4(pkt);
 318             packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
 319         } else if (conn->key.nw_proto == IPPROTO_UDP) {
 320             struct udp_header *uh = dp_packet_l4(pkt);
 321             packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
 322         }
 323     }
 324 }
 325
 326 static void
 327 reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
 328 {
 329     if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 330         if (conn->key.nw_proto == IPPROTO_TCP) {
 331             struct tcp_header *th_in = dp_packet_l4(pkt);
 332             packet_set_tcp_port(pkt, conn->key.src.port,
 333                                 th_in->tcp_dst);
 334         } else if (conn->key.nw_proto == IPPROTO_UDP) {
 335             struct udp_header *uh_in = dp_packet_l4(pkt);
 336             packet_set_udp_port(pkt, conn->key.src.port,
 337                                 uh_in->udp_dst);
 338         }
 339     } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 340         if (conn->key.nw_proto == IPPROTO_TCP) {
 341             struct tcp_header *th_in = dp_packet_l4(pkt);
 342             packet_set_tcp_port(pkt, th_in->tcp_src,
 343                                 conn->key.dst.port);
 344         } else if (conn->key.nw_proto == IPPROTO_UDP) {
 345             struct udp_header *uh_in = dp_packet_l4(pkt);
 346             packet_set_udp_port(pkt, uh_in->udp_src,
 347                                 conn->key.dst.port);
 348         }
 349     }
 350 }
 351
 352 static void
 353 reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
 354 {
 355     char *tail = dp_packet_tail(pkt);
 356     char pad = dp_packet_l2_pad_size(pkt);
 357     struct conn_key inner_key;
 358     const char *inner_l4 = NULL;
 359     uint16_t orig_l3_ofs = pkt->l3_ofs;
 360     uint16_t orig_l4_ofs = pkt->l4_ofs;
 361
 362     if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
 363         struct ip_header *nh = dp_packet_l3(pkt);
 364         struct icmp_header *icmp = dp_packet_l4(pkt);
 365         struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
 366         extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
 367                         -pad, &inner_l4, false);
 368
 369         pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
 370         pkt->l4_ofs += inner_l4 - (char *) icmp;
 371
 372         if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 373             packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
 374                                  conn->key.src.addr.ipv4_aligned);
 375         } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 376             packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
 377                                  conn->key.dst.addr.ipv4_aligned);
 378         }
 379         reverse_pat_packet(pkt, conn);
 380         icmp->icmp_csum = 0;
 381         icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
 382     } else {
 383         struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
 384         struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
 385         struct ovs_16aligned_ip6_hdr *inner_l3_6 =
 386             (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
 387         extract_l3_ipv6(&inner_key, inner_l3_6,
 388                         tail - ((char *)inner_l3_6) - pad,
 389                         &inner_l4);
 390         pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
 391         pkt->l4_ofs += inner_l4 - (char *) icmp6;
 392
 393         if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 394             packet_set_ipv6_addr(pkt, conn->key.nw_proto,
 395                                  inner_l3_6->ip6_src.be32,
 396                                  &conn->key.src.addr.ipv6_aligned,
 397                                  true);
 398         } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 399             packet_set_ipv6_addr(pkt, conn->key.nw_proto,
 400                                  inner_l3_6->ip6_dst.be32,
 401                                  &conn->key.dst.addr.ipv6_aligned,
 402                                  true);
 403         }
 404         reverse_pat_packet(pkt, conn);
 405         uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
 406         icmp6->icmp6_base.icmp6_cksum = 0;
 407         icmp6->icmp6_base.icmp6_cksum = csum_finish(
 408             csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
 409     }
 410     pkt->l3_ofs = orig_l3_ofs;
 411     pkt->l4_ofs = orig_l4_ofs;
 412 }
 413
 414 static void
 415 un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
 416               bool related)
 417 {
 418     if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
 419         pkt->md.ct_state |= CS_DST_NAT;
 420         if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
 421             struct ip_header *nh = dp_packet_l3(pkt);
 422             packet_set_ipv4_addr(pkt, &nh->ip_dst,
 423                                  conn->key.src.addr.ipv4_aligned);
 424         } else {
 425             struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
 426             packet_set_ipv6_addr(pkt, conn->key.nw_proto,
 427                                  nh6->ip6_dst.be32,
 428                                  &conn->key.src.addr.ipv6_aligned, true);
 429         }
 430
 431         if (OVS_UNLIKELY(related)) {
 432             reverse_nat_packet(pkt, conn);
 433         } else {
 434             un_pat_packet(pkt, conn);
 435         }
 436     } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
 437         pkt->md.ct_state |= CS_SRC_NAT;
 438         if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
 439             struct ip_header *nh = dp_packet_l3(pkt);
 440             packet_set_ipv4_addr(pkt, &nh->ip_src,
 441                                  conn->key.dst.addr.ipv4_aligned);
 442         } else {
 443             struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
 444             packet_set_ipv6_addr(pkt, conn->key.nw_proto,
 445                                  nh6->ip6_src.be32,
 446                                  &conn->key.dst.addr.ipv6_aligned, true);
 447         }
 448
 449         if (OVS_UNLIKELY(related)) {
 450             reverse_nat_packet(pkt, conn);
 451         } else {
 452             un_pat_packet(pkt, conn);
 453         }
 454     }
 455 }
 456
 457 /* Typical usage of this helper is in non per-packet code;
 458  * this is because the bucket lock needs to be held for lookup
 459  * and a hash would have already been needed. Hence, this function
 460  * is just intended for code clarity. */
 461 static struct conn *
 462 conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
 463 {
 464     struct conn_lookup_ctx ctx;
 465     ctx.conn = NULL;
 466     ctx.key = *key;
 467     ctx.hash = conn_key_hash(key, ct->hash_basis);
 468     unsigned bucket = hash_to_bucket(ctx.hash);
 469     conn_key_lookup(&ct->buckets[bucket], &ctx, now);
 470     return ctx.conn;
 471 }
 472
 473 static void
 474 nat_clean(struct conntrack *ct, struct conn *conn,
 475           struct conntrack_bucket *ctb)
 476     OVS_REQUIRES(ctb->lock)
 477 {
 478     long long now = time_msec();
 479     ct_rwlock_wrlock(&ct->nat_resources_lock);
 480     nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
 481     ct_rwlock_unlock(&ct->nat_resources_lock);
 482     ct_lock_unlock(&ctb->lock);
 483
 484     uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
 485     unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
 486
 487     ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
 488     ct_rwlock_wrlock(&ct->nat_resources_lock);
 489
 490     struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
 491
 492     struct nat_conn_key_node *nat_conn_key_node =
 493         nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
 494                              ct->hash_basis);
 495
 496     /* In the unlikely event, rev conn was recreated, then skip
 497      * rev_conn cleanup. */
 498     if (rev_conn && (!nat_conn_key_node ||
 499                      memcmp(&nat_conn_key_node->value, &rev_conn->rev_key,
 500                             sizeof nat_conn_key_node->value))) {
 501         hmap_remove(&ct->buckets[bucket_rev_conn].connections,
 502                     &rev_conn->node);
 503         free(rev_conn);
 504     }
 505     delete_conn(conn);
 506
 507     ct_rwlock_unlock(&ct->nat_resources_lock);
 508     ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
 509     ct_lock_lock(&ctb->lock);
 510 }
 511
 512 static void
 513 conn_clean(struct conntrack *ct, struct conn *conn,
 514            struct conntrack_bucket *ctb)
 515     OVS_REQUIRES(ctb->lock)
 516 {
 517     ovs_list_remove(&conn->exp_node);
 518     hmap_remove(&ctb->connections, &conn->node);
 519     atomic_count_dec(&ct->n_conn);
 520     if (conn->nat_info) {
 521         nat_clean(ct, conn, ctb);
 522     } else {
 523         delete_conn(conn);
 524     }
 525 }
 526
 527 static struct conn *
 528 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
 529                struct conn_lookup_ctx *ctx, bool commit, long long now,
 530                const struct nat_action_info_t *nat_action_info,
 531                struct conn *conn_for_un_nat_copy)
 532 {
 533     unsigned bucket = hash_to_bucket(ctx->hash);
 534     struct conn *nc = NULL;
 535
 536     if (!valid_new(pkt, &ctx->key)) {
 537         pkt->md.ct_state = CS_INVALID;
 538         return nc;
 539     }
 540     pkt->md.ct_state = CS_NEW;
 541
 542     if (commit) {
 543         unsigned int n_conn_limit;
 544
 545         atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
 546
 547         if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
 548             COVERAGE_INC(conntrack_full);
 549             return nc;
 550         }
 551
 552         nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
 553         ctx->conn = nc;
 554         nc->rev_key = nc->key;
 555         conn_key_reverse(&nc->rev_key);
 556
 557         if (nat_action_info) {
 558             nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
 559             ct_rwlock_wrlock(&ct->nat_resources_lock);
 560
 561             bool nat_res = nat_select_range_tuple(ct, nc,
 562                                                   conn_for_un_nat_copy);
 563
 564             if (!nat_res) {
 565                 free(nc->nat_info);
 566                 nc->nat_info = NULL;
 567                 free (nc);
 568                 ct_rwlock_unlock(&ct->nat_resources_lock);
 569                 return NULL;
 570             }
 571
 572             if (conn_for_un_nat_copy &&
 573                 nc->conn_type == CT_CONN_TYPE_DEFAULT) {
 574                 *nc = *conn_for_un_nat_copy;
 575                 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
 576             }
 577             ct_rwlock_unlock(&ct->nat_resources_lock);
 578
 579             nat_packet(pkt, nc, ctx->related);
 580         }
 581         hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
 582         atomic_count_inc(&ct->n_conn);
 583     }
 584     return nc;
 585 }
 586
 587 static bool
 588 conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
 589                   struct conn_lookup_ctx *ctx, struct conn **conn,
 590                   long long now, unsigned bucket)
 591     OVS_REQUIRES(ct->buckets[bucket].lock)
 592 {
 593     bool create_new_conn = false;
 594
 595     if (ctx->related) {
 596         pkt->md.ct_state |= CS_RELATED;
 597         if (ctx->reply) {
 598             pkt->md.ct_state |= CS_REPLY_DIR;
 599         }
 600     } else {
 601         enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
 602                                              pkt, ctx->reply, now);
 603
 604         switch (res) {
 605         case CT_UPDATE_VALID:
 606             pkt->md.ct_state |= CS_ESTABLISHED;
 607             pkt->md.ct_state &= ~CS_NEW;
 608             if (ctx->reply) {
 609                 pkt->md.ct_state |= CS_REPLY_DIR;
 610             }
 611             break;
 612         case CT_UPDATE_INVALID:
 613             pkt->md.ct_state = CS_INVALID;
 614             break;
 615         case CT_UPDATE_NEW:
 616             conn_clean(ct, *conn, &ct->buckets[bucket]);
 617             create_new_conn = true;
 618             break;
 619         default:
 620             OVS_NOT_REACHED();
 621         }
 622     }
 623     return create_new_conn;
 624 }
 625
 626 static void
 627 create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
 628                    long long now)
 629 {
 630     struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
 631     nc->key = conn_for_un_nat_copy->rev_key;
 632     nc->rev_key = conn_for_un_nat_copy->key;
 633     uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
 634     unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
 635     ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
 636     ct_rwlock_rdlock(&ct->nat_resources_lock);
 637
 638     struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
 639
 640     struct nat_conn_key_node *nat_conn_key_node =
 641         nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
 642     if (nat_conn_key_node
 643         && !memcmp(&nat_conn_key_node->value, &nc->rev_key,
 644                    sizeof nat_conn_key_node->value)
 645         && !rev_conn) {
 646         hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
 647                     &nc->node, un_nat_hash);
 648     } else {
 649         free(nc);
 650     }
 651     ct_rwlock_unlock(&ct->nat_resources_lock);
 652     ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
 653 }
 654
 655 static void
 656 handle_nat(struct dp_packet *pkt, struct conn *conn,
 657            uint16_t zone, bool reply, bool related)
 658 {
 659     if (conn->nat_info &&
 660         (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
 661           (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
 662            zone != pkt->md.ct_zone))) {
 663         if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
 664             pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
 665         }
 666         if (reply) {
 667             un_nat_packet(pkt, conn, related);
 668         } else {
 669             nat_packet(pkt, conn, related);
 670         }
 671     }
 672 }
 673
 674 static void
 675 process_one(struct conntrack *ct, struct dp_packet *pkt,
 676             struct conn_lookup_ctx *ctx, uint16_t zone,
 677             bool force, bool commit, long long now, const uint32_t *setmark,
 678             const struct ovs_key_ct_labels *setlabel,
 679             const struct nat_action_info_t *nat_action_info)
 680 {
 681     struct conn *conn;
 682     unsigned bucket = hash_to_bucket(ctx->hash);
 683     ct_lock_lock(&ct->buckets[bucket].lock);
 684     conn_key_lookup(&ct->buckets[bucket], ctx, now);
 685     conn = ctx->conn;
 686
 687     /* Delete found entry if in wrong direction. 'force' implies commit. */
 688     if (conn && force && ctx->reply) {
 689         conn_clean(ct, conn, &ct->buckets[bucket]);
 690         conn = NULL;
 691     }
 692
 693     if (OVS_LIKELY(conn)) {
 694         if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
 695
 696             ctx->reply = true;
 697
 698             struct conn_lookup_ctx ctx2;
 699             ctx2.conn = NULL;
 700             ctx2.key = conn->rev_key;
 701             ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
 702
 703             ct_lock_unlock(&ct->buckets[bucket].lock);
 704             bucket = hash_to_bucket(ctx2.hash);
 705
 706             ct_lock_lock(&ct->buckets[bucket].lock);
 707             conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
 708
 709             if (ctx2.conn) {
 710                 conn = ctx2.conn;
 711             } else {
 712                 /* It is a race condition where conn has timed out and removed
 713                  * between unlock of the rev_conn and lock of the forward conn;
 714                  * nothing to do. */
 715                 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
 716                 ct_lock_unlock(&ct->buckets[bucket].lock);
 717                 return;
 718             }
 719         }
 720     }
 721
 722     bool create_new_conn = false;
 723     struct conn conn_for_un_nat_copy;
 724     conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
 725     if (OVS_LIKELY(conn)) {
 726         create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
 727         if (nat_action_info && !create_new_conn) {
 728             handle_nat(pkt, conn, zone, ctx->reply, ctx->related);
 729         }
 730     } else {
 731         if (ctx->related) {
 732             pkt->md.ct_state = CS_INVALID;
 733         } else {
 734             create_new_conn = true;
 735         }
 736     }
 737
 738     if (OVS_UNLIKELY(create_new_conn)) {
 739         conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
 740                               &conn_for_un_nat_copy);
 741     }
 742
 743     write_ct_md(pkt, zone, conn, &ctx->key);
 744     if (conn && setmark) {
 745         set_mark(pkt, conn, setmark[0], setmark[1]);
 746     }
 747
 748     if (conn && setlabel) {
 749         set_label(pkt, conn, &setlabel[0], &setlabel[1]);
 750     }
 751
 752     ct_lock_unlock(&ct->buckets[bucket].lock);
 753
 754     if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
 755         create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
 756     }
 757 }
 758
 759 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 760  * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 761  * the l3 and and l4 offset properly set.
 762  *
 763  * If 'commit' is true, the packets are allowed to create new entries in the
 764  * connection tables.  'setmark', if not NULL, should point to a two
 765  * elements array containing a value and a mask to set the connection mark.
 766  * 'setlabel' behaves similarly for the connection label.*/
 767 int
 768 conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
 769                   ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
 770                   const uint32_t *setmark,
 771                   const struct ovs_key_ct_labels *setlabel,
 772                   const char *helper,
 773                   const struct nat_action_info_t *nat_action_info)
 774 {
 775     struct dp_packet **pkts = pkt_batch->packets;
 776     size_t cnt = pkt_batch->count;
 777     long long now = time_msec();
 778     struct conn_lookup_ctx ctx;
 779
 780     if (helper) {
 781         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
 782
 783         VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
 784         /* Continue without the helper */
 785     }
 786
 787     for (size_t i = 0; i < cnt; i++) {
 788         if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
 789             pkts[i]->md.ct_state = CS_INVALID;
 790             write_ct_md(pkts[i], zone, NULL, NULL);
 791             continue;
 792         }
 793         process_one(ct, pkts[i], &ctx, zone, force, commit,
 794                     now, setmark, setlabel, nat_action_info);
 795     }
 796
 797     return 0;
 798 }
 799
 800 static void
 801 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
 802 {
 803     pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
 804     conn->mark = pkt->md.ct_mark;
 805 }
 806
 807 static void
 808 set_label(struct dp_packet *pkt, struct conn *conn,
 809           const struct ovs_key_ct_labels *val,
 810           const struct ovs_key_ct_labels *mask)
 811 {
 812     ovs_u128 v, m;
 813
 814     memcpy(&v, val, sizeof v);
 815     memcpy(&m, mask, sizeof m);
 816
 817     pkt->md.ct_label.u64.lo = v.u64.lo
 818                               | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
 819     pkt->md.ct_label.u64.hi = v.u64.hi
 820                               | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
 821     conn->label = pkt->md.ct_label;
 822 }
 823
 824 \f
 825 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
 826  * earliest expiration time among the remaining connections in 'ctb'.  Returns
 827  * LLONG_MAX if 'ctb' is empty.  The return value might be smaller than 'now',
 828  * if 'limit' is reached */
 829 static long long
 830 sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
 831              size_t limit)
 832     OVS_REQUIRES(ctb->lock)
 833 {
 834     struct conn *conn, *next;
 835     long long min_expiration = LLONG_MAX;
 836     unsigned i;
 837     size_t count = 0;
 838
 839     for (i = 0; i < N_CT_TM; i++) {
 840         LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
 841             if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
 842                 if (!conn_expired(conn, now) || count >= limit) {
 843                     min_expiration = MIN(min_expiration, conn->expiration);
 844                     if (count >= limit) {
 845                         /* Do not check other lists. */
 846                         COVERAGE_INC(conntrack_long_cleanup);
 847                         return min_expiration;
 848                     }
 849                     break;
 850                 }
 851                 conn_clean(ct, conn, ctb);
 852                 count++;
 853             }
 854         }
 855     }
 856
 857     return min_expiration;
 858 }
 859
 860 /* Cleans up old connection entries from 'ct'.  Returns the time when the
 861  * next expiration might happen.  The return value might be smaller than
 862  * 'now', meaning that an internal limit has been reached, and some expired
 863  * connections have not been deleted. */
 864 static long long
 865 conntrack_clean(struct conntrack *ct, long long now)
 866 {
 867     long long next_wakeup = now + CT_TM_MIN;
 868     unsigned int n_conn_limit;
 869     size_t clean_count = 0;
 870     unsigned i;
 871
 872     atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
 873
 874     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
 875         struct conntrack_bucket *ctb = &ct->buckets[i];
 876         size_t prev_count;
 877         long long min_exp;
 878
 879         ovs_mutex_lock(&ctb->cleanup_mutex);
 880         if (ctb->next_cleanup > now) {
 881             goto next_bucket;
 882         }
 883
 884         ct_lock_lock(&ctb->lock);
 885         prev_count = hmap_count(&ctb->connections);
 886         /* If the connections are well distributed among buckets, we want to
 887          * limit to 10% of the global limit equally split among buckets. If
 888          * the bucket is busier than the others, we limit to 10% of its
 889          * current size. */
 890         min_exp = sweep_bucket(ct, ctb, now,
 891                 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
 892         clean_count += prev_count - hmap_count(&ctb->connections);
 893
 894         if (min_exp > now) {
 895             /* We call hmap_shrink() only if sweep_bucket() managed to delete
 896              * every expired connection. */
 897             hmap_shrink(&ctb->connections);
 898         }
 899
 900         ct_lock_unlock(&ctb->lock);
 901
 902         ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
 903
 904 next_bucket:
 905         next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
 906         ovs_mutex_unlock(&ctb->cleanup_mutex);
 907     }
 908
 909     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
 910              clean_count, time_msec() - now);
 911
 912     return next_wakeup;
 913 }
 914
 915 /* Cleanup:
 916  *
 917  * We must call conntrack_clean() periodically.  conntrack_clean() return
 918  * value gives an hint on when the next cleanup must be done (either because
 919  * there is an actual connection that expires, or because a new connection
 920  * might be created with the minimum timeout).
 921  *
 922  * The logic below has two goals:
 923  *
 924  * - We want to reduce the number of wakeups and batch connection cleanup
 925  *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 926  *   are coping with the current cleanup tasks, then we wait at least
 927  *   5 seconds to do further cleanup.
 928  *
 929  * - We don't want to keep the buckets locked too long, as we might prevent
 930  *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 931  *   behind, there is at least some 200ms blocks of time when buckets will be
 932  *   left alone, so the datapath can operate unhindered.
 933  */
 934 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
 935 #define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */
 936
 937 static void *
 938 clean_thread_main(void *f_)
 939 {
 940     struct conntrack *ct = f_;
 941
 942     while (!latch_is_set(&ct->clean_thread_exit)) {
 943         long long next_wake;
 944         long long now = time_msec();
 945
 946         next_wake = conntrack_clean(ct, now);
 947
 948         if (next_wake < now) {
 949             poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
 950         } else {
 951             poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
 952         }
 953         latch_wait(&ct->clean_thread_exit);
 954         poll_block();
 955     }
 956
 957     return NULL;
 958 }
 959 \f
 960 /* Key extraction */
 961
 962 /* The function stores a pointer to the first byte after the header in
 963  * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 964  * not interested in the header's tail,  meaning that the header has
 965  * already been parsed (e.g. by flow_extract): we take this as a hint to
 966  * save a few checks.  If 'validate_checksum' is true, the function returns
 967  * false if the IPv4 checksum is invalid. */
 968 static inline bool
 969 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
 970                 const char **new_data, bool validate_checksum)
 971 {
 972     const struct ip_header *ip = data;
 973     size_t ip_len;
 974
 975     if (new_data) {
 976         if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
 977             return false;
 978         }
 979     }
 980
 981     ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
 982
 983     if (new_data) {
 984         if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
 985             return false;
 986         }
 987         if (OVS_UNLIKELY(size < ip_len)) {
 988             return false;
 989         }
 990
 991         *new_data = (char *) data + ip_len;
 992     }
 993
 994     if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
 995         return false;
 996     }
 997
 998     if (validate_checksum && csum(data, ip_len) != 0) {
 999         return false;
1000     }
1001
1002     key->src.addr.ipv4 = ip->ip_src;
1003     key->dst.addr.ipv4 = ip->ip_dst;
1004     key->nw_proto = ip->ip_proto;
1005
1006     return true;
1007 }
1008
1009 /* The function stores a pointer to the first byte after the header in
1010  * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
1011  * not interested in the header's tail,  meaning that the header has
1012  * already been parsed (e.g. by flow_extract): we take this as a hint to
1013  * save a few checks. */
1014 static inline bool
1015 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1016                 const char **new_data)
1017 {
1018     const struct ovs_16aligned_ip6_hdr *ip6 = data;
1019
1020     if (new_data) {
1021         if (OVS_UNLIKELY(size < sizeof *ip6)) {
1022             return false;
1023         }
1024     }
1025
1026     uint8_t nw_proto = ip6->ip6_nxt;
1027     uint8_t nw_frag = 0;
1028
1029     data = ip6 + 1;
1030     size -=  sizeof *ip6;
1031
1032     if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
1033         return false;
1034     }
1035
1036     if (new_data) {
1037         *new_data = data;
1038     }
1039
1040     if (nw_frag) {
1041         return false;
1042     }
1043
1044     key->src.addr.ipv6 = ip6->ip6_src;
1045     key->dst.addr.ipv6 = ip6->ip6_dst;
1046     key->nw_proto = nw_proto;
1047
1048     return true;
1049 }
1050
1051 static inline bool
1052 checksum_valid(const struct conn_key *key, const void *data, size_t size,
1053                const void *l3)
1054 {
1055     uint32_t csum = 0;
1056
1057     if (key->dl_type == htons(ETH_TYPE_IP)) {
1058         csum = packet_csum_pseudoheader(l3);
1059     } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1060         csum = packet_csum_pseudoheader6(l3);
1061     } else {
1062         return false;
1063     }
1064
1065     csum = csum_continue(csum, data, size);
1066
1067     return csum_finish(csum) == 0;
1068 }
1069
1070 static inline bool
1071 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
1072              const void *l3)
1073 {
1074     const struct tcp_header *tcp = data;
1075     if (size < sizeof *tcp) {
1076         return false;
1077     }
1078
1079     size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
1080     if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1081         return false;
1082     }
1083
1084     return checksum_valid(key, data, size, l3);
1085 }
1086
1087 static inline bool
1088 check_l4_udp(const struct conn_key *key, const void *data, size_t size,
1089              const void *l3)
1090 {
1091     const struct udp_header *udp = data;
1092     if (size < sizeof *udp) {
1093         return false;
1094     }
1095
1096     size_t udp_len = ntohs(udp->udp_len);
1097     if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1098         return false;
1099     }
1100
1101     /* Validation must be skipped if checksum is 0 on IPv4 packets */
1102     return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
1103            || checksum_valid(key, data, size, l3);
1104 }
1105
/* Returns true if the checksum computed over the 'size' bytes of the ICMP
 * message at 'data' is 0, i.e. if the message checksum is correct. */
static inline bool
check_l4_icmp(const void *data, size_t size)
{
    bool sum_is_zero = csum(data, size) == 0;

    return sum_is_zero;
}
1111
/* Returns true if the checksum of the 'size' bytes of the ICMPv6 message at
 * 'data' is correct.  The whole work is delegated to checksum_valid(), which
 * takes the pseudo-header from the l3 header 'l3'. */
static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    bool sum_ok = checksum_valid(key, data, size, l3);

    return sum_ok;
}
1118
1119 static inline bool
1120 extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1121 {
1122     const struct tcp_header *tcp = data;
1123
1124     if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1125         return false;
1126     }
1127
1128     key->src.port = tcp->tcp_src;
1129     key->dst.port = tcp->tcp_dst;
1130
1131     /* Port 0 is invalid */
1132     return key->src.port && key->dst.port;
1133 }
1134
1135 static inline bool
1136 extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1137 {
1138     const struct udp_header *udp = data;
1139
1140     if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1141         return false;
1142     }
1143
1144     key->src.port = udp->udp_src;
1145     key->dst.port = udp->udp_dst;
1146
1147     /* Port 0 is invalid */
1148     return key->src.port && key->dst.port;
1149 }
1150
/* Forward declaration: extract_l4() dispatches to the ICMP extractors below,
 * which in turn call extract_l4() to parse the header nested in an ICMP
 * error message. */
static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3);
1153
1154 static uint8_t
1155 reverse_icmp_type(uint8_t type)
1156 {
1157     switch (type) {
1158     case ICMP4_ECHO_REQUEST:
1159         return ICMP4_ECHO_REPLY;
1160     case ICMP4_ECHO_REPLY:
1161         return ICMP4_ECHO_REQUEST;
1162
1163     case ICMP4_TIMESTAMP:
1164         return ICMP4_TIMESTAMPREPLY;
1165     case ICMP4_TIMESTAMPREPLY:
1166         return ICMP4_TIMESTAMP;
1167
1168     case ICMP4_INFOREQUEST:
1169         return ICMP4_INFOREPLY;
1170     case ICMP4_INFOREPLY:
1171         return ICMP4_INFOREQUEST;
1172     default:
1173         OVS_NOT_REACHED();
1174     }
1175 }
1176
1177 /* If 'related' is not NULL and the function is processing an ICMP
1178  * error packet, extract the l3 and l4 fields from the nested header
1179  * instead and set *related to true.  If 'related' is NULL we're
1180  * already processing a nested header and no such recursion is
1181  * possible */
1182 static inline int
1183 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1184                 bool *related)
1185 {
1186     const struct icmp_header *icmp = data;
1187
1188     if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1189         return false;
1190     }
1191
1192     switch (icmp->icmp_type) {
1193     case ICMP4_ECHO_REQUEST:
1194     case ICMP4_ECHO_REPLY:
1195     case ICMP4_TIMESTAMP:
1196     case ICMP4_TIMESTAMPREPLY:
1197     case ICMP4_INFOREQUEST:
1198     case ICMP4_INFOREPLY:
1199         if (icmp->icmp_code != 0) {
1200             return false;
1201         }
1202         /* Separate ICMP connection: identified using id */
1203         key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1204         key->src.icmp_type = icmp->icmp_type;
1205         key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
1206         break;
1207     case ICMP4_DST_UNREACH:
1208     case ICMP4_TIME_EXCEEDED:
1209     case ICMP4_PARAM_PROB:
1210     case ICMP4_SOURCEQUENCH:
1211     case ICMP4_REDIRECT: {
1212         /* ICMP packet part of another connection. We should
1213          * extract the key from embedded packet header */
1214         struct conn_key inner_key;
1215         const char *l3 = (const char *) (icmp + 1);
1216         const char *tail = (const char *) data + size;
1217         const char *l4;
1218         bool ok;
1219
1220         if (!related) {
1221             return false;
1222         }
1223
1224         memset(&inner_key, 0, sizeof inner_key);
1225         inner_key.dl_type = htons(ETH_TYPE_IP);
1226         ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
1227         if (!ok) {
1228             return false;
1229         }
1230
1231         if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
1232             || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
1233             return false;
1234         }
1235
1236         key->src = inner_key.src;
1237         key->dst = inner_key.dst;
1238         key->nw_proto = inner_key.nw_proto;
1239
1240         ok = extract_l4(key, l4, tail - l4, NULL, l3);
1241         if (ok) {
1242             conn_key_reverse(key);
1243             *related = true;
1244         }
1245         return ok;
1246     }
1247     default:
1248         return false;
1249     }
1250
1251     return true;
1252 }
1253
/* Returns the ICMPv6 type paired with 'type': an echo request maps to an
 * echo reply and vice versa.  Any other 'type' must never be passed in. */
static uint8_t
reverse_icmp6_type(uint8_t type)
{
    uint8_t peer;

    switch (type) {
    case ICMP6_ECHO_REQUEST:
        peer = ICMP6_ECHO_REPLY;
        break;
    case ICMP6_ECHO_REPLY:
        peer = ICMP6_ECHO_REQUEST;
        break;
    default:
        OVS_NOT_REACHED();
    }

    return peer;
}
1266
1267 /* If 'related' is not NULL and the function is processing an ICMP
1268  * error packet, extract the l3 and l4 fields from the nested header
1269  * instead and set *related to true.  If 'related' is NULL we're
1270  * already processing a nested header and no such recursion is
1271  * possible */
1272 static inline bool
1273 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1274                  bool *related)
1275 {
1276     const struct icmp6_header *icmp6 = data;
1277
1278     /* All the messages that we support need at least 4 bytes after
1279      * the header */
1280     if (size < sizeof *icmp6 + 4) {
1281         return false;
1282     }
1283
1284     switch (icmp6->icmp6_type) {
1285     case ICMP6_ECHO_REQUEST:
1286     case ICMP6_ECHO_REPLY:
1287         if (icmp6->icmp6_code != 0) {
1288             return false;
1289         }
1290         /* Separate ICMP connection: identified using id */
1291         key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1292         key->src.icmp_type = icmp6->icmp6_type;
1293         key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
1294         break;
1295     case ICMP6_DST_UNREACH:
1296     case ICMP6_PACKET_TOO_BIG:
1297     case ICMP6_TIME_EXCEEDED:
1298     case ICMP6_PARAM_PROB: {
1299         /* ICMP packet part of another connection. We should
1300          * extract the key from embedded packet header */
1301         struct conn_key inner_key;
1302         const char *l3 = (const char *) icmp6 + 8;
1303         const char *tail = (const char *) data + size;
1304         const char *l4 = NULL;
1305         bool ok;
1306
1307         if (!related) {
1308             return false;
1309         }
1310
1311         memset(&inner_key, 0, sizeof inner_key);
1312         inner_key.dl_type = htons(ETH_TYPE_IPV6);
1313         ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
1314         if (!ok) {
1315             return false;
1316         }
1317
1318         /* pf doesn't do this, but it seems a good idea */
1319         if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
1320                               &key->dst.addr.ipv6_aligned)
1321             || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
1322                                  &key->src.addr.ipv6_aligned)) {
1323             return false;
1324         }
1325
1326         key->src = inner_key.src;
1327         key->dst = inner_key.dst;
1328         key->nw_proto = inner_key.nw_proto;
1329
1330         ok = extract_l4(key, l4, tail - l4, NULL, l3);
1331         if (ok) {
1332             conn_key_reverse(key);
1333             *related = true;
1334         }
1335         return ok;
1336     }
1337     default:
1338         return false;
1339     }
1340
1341     return true;
1342 }
1343
1344 /* Extract l4 fields into 'key', which must already contain valid l3
1345  * members.
1346  *
1347  * If 'related' is not NULL and an ICMP error packet is being
1348  * processed, the function will extract the key from the packet nested
1349  * in the ICMP paylod and set '*related' to true.
1350  *
1351  * If 'related' is NULL, it means that we're already parsing a header nested
1352  * in an ICMP error.  In this case, we skip checksum and length validation. */
1353 static inline bool
1354 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
1355            const void *l3)
1356 {
1357     if (key->nw_proto == IPPROTO_TCP) {
1358         return (!related || check_l4_tcp(key, data, size, l3))
1359                && extract_l4_tcp(key, data, size);
1360     } else if (key->nw_proto == IPPROTO_UDP) {
1361         return (!related || check_l4_udp(key, data, size, l3))
1362                && extract_l4_udp(key, data, size);
1363     } else if (key->dl_type == htons(ETH_TYPE_IP)
1364                && key->nw_proto == IPPROTO_ICMP) {
1365         return (!related || check_l4_icmp(data, size))
1366                && extract_l4_icmp(key, data, size, related);
1367     } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1368                && key->nw_proto == IPPROTO_ICMPV6) {
1369         return (!related || check_l4_icmp6(key, data, size, l3))
1370                && extract_l4_icmp6(key, data, size, related);
1371     } else {
1372         return false;
1373     }
1374 }
1375
1376 static bool
1377 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
1378                  struct conn_lookup_ctx *ctx, uint16_t zone)
1379 {
1380     const struct eth_header *l2 = dp_packet_eth(pkt);
1381     const struct ip_header *l3 = dp_packet_l3(pkt);
1382     const char *l4 = dp_packet_l4(pkt);
1383     const char *tail = dp_packet_tail(pkt);
1384     bool ok;
1385
1386     memset(ctx, 0, sizeof *ctx);
1387
1388     if (!l2 || !l3 || !l4) {
1389         return false;
1390     }
1391
1392     ctx->key.zone = zone;
1393
1394     /* XXX In this function we parse the packet (again, it has already
1395      * gone through miniflow_extract()) for two reasons:
1396      *
1397      * 1) To extract the l3 addresses and l4 ports.
1398      *    We already have the l3 and l4 headers' pointers.  Extracting
1399      *    the l3 addresses and the l4 ports is really cheap, since they
1400      *    can be found at fixed locations.
1401      * 2) To extract the l4 type.
1402      *    Extracting the l4 types, for IPv6 can be quite expensive, because
1403      *    it's not at a fixed location.
1404      *
1405      * Here's a way to avoid (2) with the help of the datapath.
1406      * The datapath doesn't keep the packet's extracted flow[1], so
1407      * using that is not an option.  We could use the packet's matching
1408      * megaflow, but we have to make sure that the l4 type (nw_proto)
1409      * is unwildcarded.  This means either:
1410      *
1411      * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1412      *    if the actions contains ct().
1413      *
1414      * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1415      *    action.  This is already done in different actions, but it's
1416      *    unnecessary for the kernel.
1417      *
1418      * ---
1419      * [1] The reasons for this are that keeping the flow increases
1420      *     (slightly) the cache footprint and increases computation
1421      *     time as we move the packet around. Most importantly, the flow
1422      *     should be updated by the actions and this can be slow, as
1423      *     we use a sparse representation (miniflow).
1424      *
1425      */
1426     ctx->key.dl_type = dl_type;
1427     if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1428         ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
1429     } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1430         ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1431     } else {
1432         ok = false;
1433     }
1434
1435     if (ok) {
1436         if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
1437             ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1438             return true;
1439         }
1440     }
1441
1442     return false;
1443 }
1444 \f
1445 /* Symmetric */
1446 static uint32_t
1447 conn_key_hash(const struct conn_key *key, uint32_t basis)
1448 {
1449     uint32_t hsrc, hdst, hash;
1450     int i;
1451
1452     hsrc = hdst = basis;
1453
1454     for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
1455         hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
1456         hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
1457     }
1458
1459     /* Even if source and destination are swapped the hash will be the same. */
1460     hash = hsrc ^ hdst;
1461
1462     /* Hash the rest of the key(L3 and L4 types and zone). */
1463     hash = hash_words((uint32_t *) (&key->dst + 1),
1464                       (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1465                       hash);
1466
1467     return hash;
1468 }
1469
1470 static void
1471 conn_key_reverse(struct conn_key *key)
1472 {
1473     struct ct_endpoint tmp;
1474
1475     tmp = key->src;
1476     key->src = key->dst;
1477     key->dst = tmp;
1478 }
1479
1480 static uint32_t
1481 nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
1482                      struct in6_addr *ipv6_aligned_max)
1483 {
1484     uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
1485     uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] +  sizeof(uint64_t);
1486     uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
1487     uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
1488
1489     ovs_be64 addr6_64_min_hi;
1490     ovs_be64 addr6_64_min_lo;
1491     memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
1492     memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
1493
1494     ovs_be64 addr6_64_max_hi;
1495     ovs_be64 addr6_64_max_lo;
1496     memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
1497     memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
1498
1499     uint64_t diff;
1500     if (addr6_64_min_hi == addr6_64_max_hi &&
1501         ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
1502         diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
1503     } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
1504                ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
1505         diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
1506                              ntohll(addr6_64_max_lo) - 1);
1507     } else {
1508         /* Limit address delta supported to 32 bits or 4 billion approximately.
1509          * Possibly, this should be visible to the user through a datapath
1510          * support check, however the practical impact is probably nil. */
1511         diff = 0xfffffffe;
1512     }
1513     if (diff > 0xfffffffe) {
1514         diff = 0xfffffffe;
1515     }
1516     return diff;
1517 }
1518
1519 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1520  * restricts the input parameters. */
1521 static void
1522 nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
1523 {
1524     uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
1525     uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
1526     ovs_be64 addr6_64_hi;
1527     ovs_be64 addr6_64_lo;
1528     memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
1529     memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
1530
1531     if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
1532         addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
1533     } else if (addr6_64_hi != OVS_BE64_MAX) {
1534         addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
1535         addr6_64_lo = htonll(increment - (UINT64_MAX -
1536                                           ntohll(addr6_64_lo) + 1));
1537     } else {
1538         OVS_NOT_REACHED();
1539     }
1540
1541     memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
1542     memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
1543
1544     return;
1545 }
1546
1547 static uint32_t
1548 nat_range_hash(const struct conn *conn, uint32_t basis)
1549 {
1550     uint32_t hash = basis;
1551     int i;
1552     uint16_t port;
1553
1554     for (i = 0;
1555          i < sizeof(conn->nat_info->min_addr) / sizeof(uint32_t);
1556          i++) {
1557         hash = hash_add(hash, ((uint32_t *) &conn->nat_info->min_addr)[i]);
1558         hash = hash_add(hash, ((uint32_t *) &conn->nat_info->max_addr)[i]);
1559     }
1560
1561     memcpy(&port, &conn->nat_info->min_port, sizeof port);
1562     hash = hash_add(hash, port);
1563
1564     for (i = 0; i < sizeof(conn->key.src.addr) / sizeof(uint32_t); i++) {
1565         hash = hash_add(hash, ((uint32_t *) &conn->key.src)[i]);
1566         hash = hash_add(hash, ((uint32_t *) &conn->key.dst)[i]);
1567     }
1568
1569     memcpy(&port, &conn->key.src.port, sizeof port);
1570     hash = hash_add(hash, port);
1571     memcpy(&port, &conn->key.dst.port, sizeof port);
1572     hash = hash_add(hash, port);
1573
1574     hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
1575     hash = hash_add(hash, conn->key.nw_proto);
1576     hash = hash_add(hash, conn->key.zone);
1577     return hash;
1578 }
1579
1580 static bool
1581 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
1582                        struct conn *nat_conn)
1583 {
1584 #define MIN_NAT_EPHEMERAL_PORT 1024
1585 #define MAX_NAT_EPHEMERAL_PORT 65535
1586
1587     uint16_t min_port;
1588     uint16_t max_port;
1589     uint16_t first_port;
1590
1591     uint32_t hash = nat_range_hash(conn, ct->hash_basis);
1592
1593     if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
1594         (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
1595         min_port = ntohs(conn->key.src.port);
1596         max_port = ntohs(conn->key.src.port);
1597         first_port = min_port;
1598     } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
1599                (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
1600         min_port = ntohs(conn->key.dst.port);
1601         max_port = ntohs(conn->key.dst.port);
1602         first_port = min_port;
1603     } else {
1604         uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
1605         uint32_t port_index = hash % (deltap + 1);
1606         first_port = conn->nat_info->min_port + port_index;
1607         min_port = conn->nat_info->min_port;
1608         max_port = conn->nat_info->max_port;
1609     }
1610
1611     uint32_t deltaa = 0;
1612     uint32_t address_index;
1613     struct ct_addr ct_addr;
1614     memset(&ct_addr, 0, sizeof ct_addr);
1615     struct ct_addr max_ct_addr;
1616     memset(&max_ct_addr, 0, sizeof max_ct_addr);
1617     max_ct_addr = conn->nat_info->max_addr;
1618
1619     if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1620         deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
1621                  ntohl(conn->nat_info->min_addr.ipv4_aligned);
1622         address_index = hash % (deltaa + 1);
1623         ct_addr.ipv4_aligned = htonl(
1624             ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
1625     } else {
1626         deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
1627                                       &conn->nat_info->max_addr.ipv6_aligned);
1628         /* deltaa must be within 32 bits for full hash coverage. A 64 or
1629          * 128 bit hash is unnecessary and hence not used here. Most code
1630          * is kept common with V4; nat_ipv6_addrs_delta() will do the
1631          * enforcement via max_ct_addr. */
1632         max_ct_addr = conn->nat_info->min_addr;
1633         nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
1634
1635         address_index = hash % (deltaa + 1);
1636         ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
1637         nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
1638     }
1639
1640     uint16_t port = first_port;
1641     bool all_ports_tried = false;
1642     bool original_ports_tried = false;
1643     struct ct_addr first_addr = ct_addr;
1644     *nat_conn = *conn;
1645
1646     while (true) {
1647         if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1648             nat_conn->rev_key.dst.addr = ct_addr;
1649         } else {
1650             nat_conn->rev_key.src.addr = ct_addr;
1651         }
1652
1653         if ((conn->key.nw_proto == IPPROTO_ICMP) ||
1654             (conn->key.nw_proto == IPPROTO_ICMPV6)) {
1655             all_ports_tried = true;
1656         } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1657             nat_conn->rev_key.dst.port = htons(port);
1658         } else {
1659             nat_conn->rev_key.src.port = htons(port);
1660         }
1661
1662         struct nat_conn_key_node *nat_conn_key_node =
1663             nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
1664                                  ct->hash_basis);
1665
1666         if (!nat_conn_key_node) {
1667             struct nat_conn_key_node *nat_conn_key =
1668                 xzalloc(sizeof *nat_conn_key);
1669             nat_conn_key->key = nat_conn->rev_key;
1670             nat_conn_key->value = nat_conn->key;
1671             uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key,
1672                                                        ct->hash_basis);
1673             hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node,
1674                         nat_conn_key_hash);
1675             return true;
1676         } else if (!all_ports_tried) {
1677             if (min_port == max_port) {
1678                 all_ports_tried = true;
1679             } else if (port == max_port) {
1680                 port = min_port;
1681             } else {
1682                 port++;
1683             }
1684             if (port == first_port) {
1685                 all_ports_tried = true;
1686             }
1687         } else {
1688             if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
1689                 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1690                     ct_addr.ipv4_aligned = htonl(
1691                         ntohl(ct_addr.ipv4_aligned) + 1);
1692                 } else {
1693                     nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
1694                 }
1695             } else {
1696                 ct_addr = conn->nat_info->min_addr;
1697             }
1698             if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
1699                 if (!original_ports_tried) {
1700                     original_ports_tried = true;
1701                     ct_addr = conn->nat_info->min_addr;
1702                     min_port = MIN_NAT_EPHEMERAL_PORT;
1703                     max_port = MAX_NAT_EPHEMERAL_PORT;
1704                 } else {
1705                     break;
1706                 }
1707             }
1708             first_port = min_port;
1709             port = first_port;
1710             all_ports_tried = false;
1711         }
1712     }
1713     return false;
1714 }
1715
1716 static struct nat_conn_key_node *
1717 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
1718                      const struct conn_key *key,
1719                      uint32_t basis)
1720 {
1721     struct nat_conn_key_node *nat_conn_key_node;
1722     uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1723
1724     HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1725                              nat_conn_keys) {
1726         if (!memcmp(&nat_conn_key_node->key, key,
1727                     sizeof nat_conn_key_node->key)) {
1728             return nat_conn_key_node;
1729         }
1730     }
1731     return NULL;
1732 }
1733
1734 static void
1735 nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
1736                      uint32_t basis)
1737 {
1738     struct nat_conn_key_node *nat_conn_key_node;
1739     uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1740
1741     HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1742                              nat_conn_keys) {
1743         if (!memcmp(&nat_conn_key_node->key, key,
1744                     sizeof nat_conn_key_node->key)) {
1745             hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
1746             free(nat_conn_key_node);
1747             return;
1748         }
1749     }
1750 }
1751
1752 static void
1753 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
1754                 long long now)
1755 {
1756     uint32_t hash = ctx->hash;
1757     struct conn *conn;
1758
1759     ctx->conn = NULL;
1760
1761     HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
1762         if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)
1763                 && !conn_expired(conn, now)) {
1764             ctx->conn = conn;
1765             ctx->reply = false;
1766             break;
1767         }
1768         if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)
1769                 && !conn_expired(conn, now)) {
1770             ctx->conn = conn;
1771             ctx->reply = true;
1772             break;
1773         }
1774     }
1775 }
1776
1777 static enum ct_update_res
1778 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
1779             struct dp_packet *pkt, bool reply, long long now)
1780 {
1781     return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
1782                                                       reply, now);
1783 }
1784
1785 static bool
1786 conn_expired(struct conn *conn, long long now)
1787 {
1788     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1789         return now >= conn->expiration;
1790     }
1791     return false;
1792 }
1793
1794 static bool
1795 valid_new(struct dp_packet *pkt, struct conn_key *key)
1796 {
1797     return l4_protos[key->nw_proto]->valid_new(pkt);
1798 }
1799
1800 static struct conn *
1801 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
1802          struct conn_key *key, long long now)
1803 {
1804     struct conn *newconn;
1805
1806     newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
1807
1808     if (newconn) {
1809         newconn->key = *key;
1810     }
1811
1812     return newconn;
1813 }
1814
/* Frees 'conn' together with the 'nat_info' it points to.  free() ignores a
 * null pointer, so 'conn->nat_info' may be null.
 *
 * This only releases memory: it does not unlink 'conn' from any container.
 * 'conn' must not be dereferenced after this call. */
static void
delete_conn(struct conn *conn)
{
    /* 'nat_info' must go first, while 'conn' is still valid to read. */
    free(conn->nat_info);
    free(conn);
}
1821 \f
1822 static void
1823 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
1824                                  union ct_dpif_inet_addr *b,
1825                                  ovs_be16 dl_type)
1826 {
1827     if (dl_type == htons(ETH_TYPE_IP)) {
1828         b->ip = a->ipv4_aligned;
1829     } else if (dl_type == htons(ETH_TYPE_IPV6)){
1830         b->in6 = a->ipv6_aligned;
1831     }
1832 }
1833
1834 static void
1835 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
1836 {
1837     if (key->dl_type == htons(ETH_TYPE_IP)) {
1838         tuple->l3_type = AF_INET;
1839     } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1840         tuple->l3_type = AF_INET6;
1841     }
1842     tuple->ip_proto = key->nw_proto;
1843     ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
1844                                      key->dl_type);
1845     ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
1846                                      key->dl_type);
1847
1848     if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
1849         tuple->icmp_id = key->src.icmp_id;
1850         tuple->icmp_type = key->src.icmp_type;
1851         tuple->icmp_code = key->src.icmp_code;
1852     } else {
1853         tuple->src_port = key->src.port;
1854         tuple->dst_port = key->dst.port;
1855     }
1856 }
1857
1858 static void
1859 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
1860                       long long now)
1861 {
1862     struct ct_l4_proto *class;
1863     long long expiration;
1864     memset(entry, 0, sizeof *entry);
1865     conn_key_to_tuple(&conn->key, &entry->tuple_orig);
1866     conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
1867
1868     entry->zone = conn->key.zone;
1869     entry->mark = conn->mark;
1870
1871     memcpy(&entry->labels, &conn->label, sizeof entry->labels);
1872     /* Not implemented yet */
1873     entry->timestamp.start = 0;
1874     entry->timestamp.stop = 0;
1875
1876     expiration = conn->expiration - now;
1877     entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
1878
1879     class = l4_protos[conn->key.nw_proto];
1880     if (class->conn_get_protoinfo) {
1881         class->conn_get_protoinfo(conn, &entry->protoinfo);
1882     }
1883 }
1884
1885 int
1886 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
1887                      const uint16_t *pzone)
1888 {
1889     memset(dump, 0, sizeof(*dump));
1890     if (pzone) {
1891         dump->zone = *pzone;
1892         dump->filter_zone = true;
1893     }
1894     dump->ct = ct;
1895
1896     return 0;
1897 }
1898
1899 int
1900 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
1901 {
1902     struct conntrack *ct = dump->ct;
1903     long long now = time_msec();
1904
1905     while (dump->bucket < CONNTRACK_BUCKETS) {
1906         struct hmap_node *node;
1907
1908         ct_lock_lock(&ct->buckets[dump->bucket].lock);
1909         for (;;) {
1910             struct conn *conn;
1911
1912             node = hmap_at_position(&ct->buckets[dump->bucket].connections,
1913                                     &dump->bucket_pos);
1914             if (!node) {
1915                 break;
1916             }
1917             INIT_CONTAINER(conn, node, node);
1918             if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
1919                  (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
1920                 conn_to_ct_dpif_entry(conn, entry, now);
1921                 break;
1922             }
1923             /* Else continue, until we find an entry in the appropriate zone
1924              * or the bucket has been scanned completely. */
1925         }
1926         ct_lock_unlock(&ct->buckets[dump->bucket].lock);
1927
1928         if (!node) {
1929             memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
1930             dump->bucket++;
1931         } else {
1932             return 0;
1933         }
1934     }
1935     return EOF;
1936 }
1937
/* Ends a dump.  'dump' is not used: this function releases nothing and
 * touches no state.
 *
 * Always returns 0. */
int
conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
{
    return 0;
}
1943
1944 int
1945 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
1946 {
1947     unsigned i;
1948
1949     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
1950         struct conn *conn, *next;
1951
1952         ct_lock_lock(&ct->buckets[i].lock);
1953         HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
1954             if ((!zone || *zone == conn->key.zone) &&
1955                 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
1956                 conn_clean(ct, conn, &ct->buckets[i]);
1957             }
1958         }
1959         ct_lock_unlock(&ct->buckets[i].lock);
1960     }
1961     return 0;
1962 }