lib/conntrack.c

   1 /*
   2  * Copyright (c) 2015, 2016 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "conntrack.h"
  19
  20 #include <errno.h>
  21 #include <sys/types.h>
  22 #include <netinet/in.h>
  23 #include <netinet/icmp6.h>
  24
  25 #include "bitmap.h"
  26 #include "conntrack-private.h"
  27 #include "coverage.h"
  28 #include "csum.h"
  29 #include "dp-packet.h"
  30 #include "flow.h"
  31 #include "netdev.h"
  32 #include "odp-netlink.h"
  33 #include "openvswitch/hmap.h"
  34 #include "openvswitch/vlog.h"
  35 #include "ovs-rcu.h"
  36 #include "ovs-thread.h"
  37 #include "poll-loop.h"
  38 #include "random.h"
  39 #include "timeval.h"
  40
  41 VLOG_DEFINE_THIS_MODULE(conntrack);
  42
  43 COVERAGE_DEFINE(conntrack_full);
  44 COVERAGE_DEFINE(conntrack_long_cleanup);
  45
/* Per-packet scratch state used while a packet goes through the tracker. */
struct conn_lookup_ctx {
    struct conn_key key;    /* Key describing the packet being processed. */
    struct conn *conn;      /* Connection entry for the packet.  When NULL,
                             * process_one() takes the "not found" path. */
    uint32_t hash;          /* Selects the bucket (hash_to_bucket()) and is
                             * the hmap hash used when inserting. */
    bool reply;             /* If set, process_one() adds CS_REPLY_DIR. */
    bool related;           /* If set, process_one() reports CS_RELATED
                             * instead of updating the connection. */
};
  53
  54 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
  55                              struct conn_lookup_ctx *, uint16_t zone);
  56 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
  57 static void conn_key_reverse(struct conn_key *);
  58 static void conn_key_lookup(struct conntrack_bucket *ctb,
  59                             struct conn_lookup_ctx *ctx,
  60                             long long now);
  61 static bool valid_new(struct dp_packet *pkt, struct conn_key *);
  62 static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
  63                              struct conn_key *, long long now);
  64 static void delete_conn(struct conn *);
  65 static enum ct_update_res conn_update(struct conn *,
  66                                       struct conntrack_bucket *ctb,
  67                                       struct dp_packet *, bool reply,
  68                                       long long now);
  69 static bool conn_expired(struct conn *, long long now);
  70 static void set_mark(struct dp_packet *, struct conn *,
  71                      uint32_t val, uint32_t mask);
  72 static void set_label(struct dp_packet *, struct conn *,
  73                       const struct ovs_key_ct_labels *val,
  74                       const struct ovs_key_ct_labels *mask);
  75 static void *clean_thread_main(void *f_);
  76
/* 'struct ct_l4_proto' entries indexed by IP protocol number.  TCP has its
 * own entry; UDP, ICMP and ICMPv6 all share 'ct_proto_other'.  Slots that are
 * not listed are left NULL by the designated initializers. */
static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_other,
    [IPPROTO_ICMPV6] = &ct_proto_other,
};
  83
/* Timeout table indexed by CT_TM_<NAME>: one entry is generated for each
 * CT_TIMEOUT(NAME, VAL) item of the CT_TIMEOUTS list.
 * NOTE(review): values are presumably in milliseconds -- confirm against the
 * CT_TIMEOUTS definition. */
long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};
  89
  90 /* If the total number of connections goes above this value, no new connections
  91  * are accepted */
  92 #define DEFAULT_N_CONN_LIMIT 3000000
  93
  94 /* Initializes the connection tracker 'ct'.  The caller is responsible for
  95  * calling 'conntrack_destroy()', when the instance is not needed anymore */
  96 void
  97 conntrack_init(struct conntrack *ct)
  98 {
  99     unsigned i, j;
 100     long long now = time_msec();
 101
 102     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
 103         struct conntrack_bucket *ctb = &ct->buckets[i];
 104
 105         ct_lock_init(&ctb->lock);
 106         ct_lock_lock(&ctb->lock);
 107         hmap_init(&ctb->connections);
 108         for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
 109             ovs_list_init(&ctb->exp_lists[j]);
 110         }
 111         ct_lock_unlock(&ctb->lock);
 112         ovs_mutex_init(&ctb->cleanup_mutex);
 113         ovs_mutex_lock(&ctb->cleanup_mutex);
 114         ctb->next_cleanup = now + CT_TM_MIN;
 115         ovs_mutex_unlock(&ctb->cleanup_mutex);
 116     }
 117     ct->hash_basis = random_uint32();
 118     atomic_count_init(&ct->n_conn, 0);
 119     atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
 120     latch_init(&ct->clean_thread_exit);
 121     ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
 122 }
 123
 124 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
 125 void
 126 conntrack_destroy(struct conntrack *ct)
 127 {
 128     unsigned i;
 129
 130     latch_set(&ct->clean_thread_exit);
 131     pthread_join(ct->clean_thread, NULL);
 132     latch_destroy(&ct->clean_thread_exit);
 133     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
 134         struct conntrack_bucket *ctb = &ct->buckets[i];
 135         struct conn *conn;
 136
 137         ovs_mutex_destroy(&ctb->cleanup_mutex);
 138         ct_lock_lock(&ctb->lock);
 139         HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
 140             atomic_count_dec(&ct->n_conn);
 141             delete_conn(conn);
 142         }
 143         hmap_destroy(&ctb->connections);
 144         ct_lock_unlock(&ctb->lock);
 145         ct_lock_destroy(&ctb->lock);
 146     }
 147 }
 148 \f
 149 static unsigned hash_to_bucket(uint32_t hash)
 150 {
 151     /* Extracts the most significant bits in hash. The least significant bits
 152      * are already used internally by the hmap implementation. */
 153     BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
 154
 155     return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
 156 }
 157
 158 static void
 159 write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
 160             uint32_t mark, ovs_u128 label)
 161 {
 162     pkt->md.ct_state = state | CS_TRACKED;
 163     pkt->md.ct_zone = zone;
 164     pkt->md.ct_mark = mark;
 165     pkt->md.ct_label = label;
 166 }
 167
 168 static struct conn *
 169 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
 170                struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
 171                long long now)
 172 {
 173     unsigned bucket = hash_to_bucket(ctx->hash);
 174     struct conn *nc = NULL;
 175
 176     if (!valid_new(pkt, &ctx->key)) {
 177         *state |= CS_INVALID;
 178         return nc;
 179     }
 180
 181     *state |= CS_NEW;
 182
 183     if (commit) {
 184         unsigned int n_conn_limit;
 185
 186         atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
 187
 188         if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
 189             COVERAGE_INC(conntrack_full);
 190             return nc;
 191         }
 192
 193         nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
 194
 195         memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
 196
 197         conn_key_reverse(&nc->rev_key);
 198         hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
 199         atomic_count_inc(&ct->n_conn);
 200     }
 201
 202     return nc;
 203 }
 204
 205 static struct conn *
 206 process_one(struct conntrack *ct, struct dp_packet *pkt,
 207             struct conn_lookup_ctx *ctx, uint16_t zone,
 208             bool commit, long long now)
 209 {
 210     unsigned bucket = hash_to_bucket(ctx->hash);
 211     struct conn *conn = ctx->conn;
 212     uint16_t state = 0;
 213
 214     if (conn) {
 215         if (ctx->related) {
 216             state |= CS_RELATED;
 217             if (ctx->reply) {
 218                 state |= CS_REPLY_DIR;
 219             }
 220         } else {
 221             enum ct_update_res res;
 222
 223             res = conn_update(conn, &ct->buckets[bucket], pkt,
 224                               ctx->reply, now);
 225
 226             switch (res) {
 227             case CT_UPDATE_VALID:
 228                 state |= CS_ESTABLISHED;
 229                 if (ctx->reply) {
 230                     state |= CS_REPLY_DIR;
 231                 }
 232                 break;
 233             case CT_UPDATE_INVALID:
 234                 state |= CS_INVALID;
 235                 break;
 236             case CT_UPDATE_NEW:
 237                 ovs_list_remove(&conn->exp_node);
 238                 hmap_remove(&ct->buckets[bucket].connections, &conn->node);
 239                 atomic_count_dec(&ct->n_conn);
 240                 delete_conn(conn);
 241                 conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
 242                 break;
 243             default:
 244                 OVS_NOT_REACHED();
 245             }
 246         }
 247     } else {
 248         conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
 249     }
 250
 251     write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
 252                 conn ? conn->label : OVS_U128_ZERO);
 253
 254     return conn;
 255 }
 256
 257 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 258  * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 259  * the l3 and and l4 offset properly set.
 260  *
 261  * If 'commit' is true, the packets are allowed to create new entries in the
 262  * connection tables.  'setmark', if not NULL, should point to a two
 263  * elements array containing a value and a mask to set the connection mark.
 264  * 'setlabel' behaves similarly for the connection label.*/
 265 int
 266 conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
 267                   bool commit, uint16_t zone, const uint32_t *setmark,
 268                   const struct ovs_key_ct_labels *setlabel,
 269                   const char *helper)
 270 {
 271     struct dp_packet **pkts = pkt_batch->packets;
 272     size_t cnt = pkt_batch->count;
 273 #if !defined(__CHECKER__) && !defined(_WIN32)
 274     const size_t KEY_ARRAY_SIZE = cnt;
 275 #else
 276     enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
 277 #endif
 278     struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
 279     int8_t bucket_list[CONNTRACK_BUCKETS];
 280     struct {
 281         unsigned bucket;
 282         unsigned long maps;
 283     } arr[KEY_ARRAY_SIZE];
 284     long long now = time_msec();
 285     size_t i = 0;
 286     uint8_t arrcnt = 0;
 287
 288     BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
 289
 290     if (helper) {
 291         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
 292
 293         VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
 294         /* Continue without the helper */
 295     }
 296
 297     memset(bucket_list, INT8_C(-1), sizeof bucket_list);
 298     for (i = 0; i < cnt; i++) {
 299         unsigned bucket;
 300
 301         if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
 302             write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
 303             continue;
 304         }
 305
 306         bucket = hash_to_bucket(ctxs[i].hash);
 307         if (bucket_list[bucket] == INT8_C(-1)) {
 308             bucket_list[bucket] = arrcnt;
 309
 310             arr[arrcnt].maps = 0;
 311             ULLONG_SET1(arr[arrcnt].maps, i);
 312             arr[arrcnt++].bucket = bucket;
 313         } else {
 314             ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
 315             arr[bucket_list[bucket]].maps |= 1UL << i;
 316         }
 317     }
 318
 319     for (i = 0; i < arrcnt; i++) {
 320         struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
 321         size_t j;
 322
 323         ct_lock_lock(&ctb->lock);
 324
 325         ULLONG_FOR_EACH_1(j, arr[i].maps) {
 326             struct conn *conn;
 327
 328             conn_key_lookup(ctb, &ctxs[j], now);
 329
 330             conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
 331
 332             if (conn && setmark) {
 333                 set_mark(pkts[j], conn, setmark[0], setmark[1]);
 334             }
 335
 336             if (conn && setlabel) {
 337                 set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
 338             }
 339         }
 340         ct_lock_unlock(&ctb->lock);
 341     }
 342
 343     return 0;
 344 }
 345
 346 static void
 347 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
 348 {
 349     pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
 350     conn->mark = pkt->md.ct_mark;
 351 }
 352
 353 static void
 354 set_label(struct dp_packet *pkt, struct conn *conn,
 355           const struct ovs_key_ct_labels *val,
 356           const struct ovs_key_ct_labels *mask)
 357 {
 358     ovs_u128 v, m;
 359
 360     memcpy(&v, val, sizeof v);
 361     memcpy(&m, mask, sizeof m);
 362
 363     pkt->md.ct_label.u64.lo = v.u64.lo
 364                               | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
 365     pkt->md.ct_label.u64.hi = v.u64.hi
 366                               | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
 367     conn->label = pkt->md.ct_label;
 368 }
 369 \f
 370 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
 371  * earliest expiration time among the remaining connections in 'ctb'.  Returns
 372  * LLONG_MAX if 'ctb' is empty.  The return value might be smaller than 'now',
 373  * if 'limit' is reached */
 374 static long long
 375 sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
 376              size_t limit)
 377     OVS_REQUIRES(ctb->lock)
 378 {
 379     struct conn *conn, *next;
 380     long long min_expiration = LLONG_MAX;
 381     unsigned i;
 382     size_t count = 0;
 383
 384     for (i = 0; i < N_CT_TM; i++) {
 385         LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
 386             if (!conn_expired(conn, now) || count >= limit) {
 387                 min_expiration = MIN(min_expiration, conn->expiration);
 388                 if (count >= limit) {
 389                     /* Do not check other lists. */
 390                     COVERAGE_INC(conntrack_long_cleanup);
 391                     return min_expiration;
 392                 }
 393                 break;
 394             }
 395             ovs_list_remove(&conn->exp_node);
 396             hmap_remove(&ctb->connections, &conn->node);
 397             atomic_count_dec(&ct->n_conn);
 398             delete_conn(conn);
 399             count++;
 400         }
 401     }
 402
 403     return min_expiration;
 404 }
 405
 406 /* Cleans up old connection entries from 'ct'.  Returns the time when the
 407  * next expiration might happen.  The return value might be smaller than
 408  * 'now', meaning that an internal limit has been reached, and some expired
 409  * connections have not been deleted. */
 410 static long long
 411 conntrack_clean(struct conntrack *ct, long long now)
 412 {
 413     long long next_wakeup = now + CT_TM_MIN;
 414     unsigned int n_conn_limit;
 415     size_t clean_count = 0;
 416     unsigned i;
 417
 418     atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
 419
 420     for (i = 0; i < CONNTRACK_BUCKETS; i++) {
 421         struct conntrack_bucket *ctb = &ct->buckets[i];
 422         size_t prev_count;
 423         long long min_exp;
 424
 425         ovs_mutex_lock(&ctb->cleanup_mutex);
 426         if (ctb->next_cleanup > now) {
 427             goto next_bucket;
 428         }
 429
 430         ct_lock_lock(&ctb->lock);
 431         prev_count = hmap_count(&ctb->connections);
 432         /* If the connections are well distributed among buckets, we want to
 433          * limit to 10% of the global limit equally split among buckets. If
 434          * the bucket is busier than the others, we limit to 10% of its
 435          * current size. */
 436         min_exp = sweep_bucket(ct, ctb, now,
 437                 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
 438         clean_count += prev_count - hmap_count(&ctb->connections);
 439
 440         if (min_exp > now) {
 441             /* We call hmap_shrink() only if sweep_bucket() managed to delete
 442              * every expired connection. */
 443             hmap_shrink(&ctb->connections);
 444         }
 445
 446         ct_lock_unlock(&ctb->lock);
 447
 448         ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
 449
 450 next_bucket:
 451         next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
 452         ovs_mutex_unlock(&ctb->cleanup_mutex);
 453     }
 454
 455     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
 456              clean_count, time_msec() - now);
 457
 458     return next_wakeup;
 459 }
 460
 461 /* Cleanup:
 462  *
 463  *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint on when the next cleanup must be done (either because
 * there is an actual connection that expires, or because a new connection
 * might be created with the minimum timeout).
 468  *
 469  * The logic below has two goals:
 470  *
 471  * - Avoid calling conntrack_clean() too often.  If we call conntrack_clean()
 472  *   each time a connection expires, the thread will consume 100% CPU, so we
 473  *   try to call the function _at most_ once every CT_CLEAN_INTERVAL, to batch
 474  *   removal.
 475  *
 476  * - On the other hand, it's not a good idea to keep the buckets locked for
 477  *   too long, as we might prevent traffic from flowing.  If conntrack_clean()
 478  *   returns a value which is in the past, it means that the internal limit
 479  *   has been reached and more cleanup is required.  In this case, just wait
 480  *   CT_CLEAN_MIN_INTERVAL before the next call.
 481  */
 482 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
 483 #define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */
 484
 485 static void *
 486 clean_thread_main(void *f_)
 487 {
 488     struct conntrack *ct = f_;
 489
 490     while (!latch_is_set(&ct->clean_thread_exit)) {
 491         long long next_wake;
 492         long long now = time_msec();
 493
 494         next_wake = conntrack_clean(ct, now);
 495
 496         if (next_wake < now) {
 497             poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
 498         } else {
 499             poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
 500         }
 501         latch_wait(&ct->clean_thread_exit);
 502         poll_block();
 503     }
 504
 505     return NULL;
 506 }
 507 \f
 508 /* Key extraction */
 509
 510 /* The function stores a pointer to the first byte after the header in
 511  * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 512  * not interested in the header's tail,  meaning that the header has
 513  * already been parsed (e.g. by flow_extract): we take this as a hint to
 514  * save a few checks.  If 'validate_checksum' is true, the function returns
 515  * false if the IPv4 checksum is invalid. */
 516 static inline bool
 517 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
 518                 const char **new_data, bool validate_checksum)
 519 {
 520     const struct ip_header *ip = data;
 521     size_t ip_len;
 522
 523     if (new_data) {
 524         if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
 525             return false;
 526         }
 527     }
 528
 529     ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
 530
 531     if (new_data) {
 532         if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
 533             return false;
 534         }
 535         if (OVS_UNLIKELY(size < ip_len)) {
 536             return false;
 537         }
 538
 539         *new_data = (char *) data + ip_len;
 540     }
 541
 542     if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
 543         return false;
 544     }
 545
 546     if (validate_checksum && csum(data, ip_len) != 0) {
 547         return false;
 548     }
 549
 550     key->src.addr.ipv4 = ip->ip_src;
 551     key->dst.addr.ipv4 = ip->ip_dst;
 552     key->nw_proto = ip->ip_proto;
 553
 554     return true;
 555 }
 556
 557 /* The function stores a pointer to the first byte after the header in
 558  * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 559  * not interested in the header's tail,  meaning that the header has
 560  * already been parsed (e.g. by flow_extract): we take this as a hint to
 561  * save a few checks. */
 562 static inline bool
 563 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
 564                 const char **new_data)
 565 {
 566     const struct ovs_16aligned_ip6_hdr *ip6 = data;
 567     uint8_t nw_proto = ip6->ip6_nxt;
 568     uint8_t nw_frag = 0;
 569
 570     if (new_data) {
 571         if (OVS_UNLIKELY(size < sizeof *ip6)) {
 572             return false;
 573         }
 574     }
 575
 576     data = ip6 + 1;
 577     size -=  sizeof *ip6;
 578
 579     if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
 580         return false;
 581     }
 582
 583     if (new_data) {
 584         *new_data = data;
 585     }
 586
 587     if (nw_frag) {
 588         return false;
 589     }
 590
 591     key->src.addr.ipv6 = ip6->ip6_src;
 592     key->dst.addr.ipv6 = ip6->ip6_dst;
 593     key->nw_proto = nw_proto;
 594
 595     return true;
 596 }
 597
 598 static inline bool
 599 checksum_valid(const struct conn_key *key, const void *data, size_t size,
 600                const void *l3)
 601 {
 602     uint32_t csum = 0;
 603
 604     if (key->dl_type == htons(ETH_TYPE_IP)) {
 605         csum = packet_csum_pseudoheader(l3);
 606     } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
 607         csum = packet_csum_pseudoheader6(l3);
 608     } else {
 609         return false;
 610     }
 611
 612     csum = csum_continue(csum, data, size);
 613
 614     return csum_finish(csum) == 0;
 615 }
 616
 617 static inline bool
 618 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
 619              const void *l3)
 620 {
 621     const struct tcp_header *tcp = data;
 622     size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
 623
 624     if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
 625         return false;
 626     }
 627
 628     return checksum_valid(key, data, size, l3);
 629 }
 630
 631 static inline bool
 632 check_l4_udp(const struct conn_key *key, const void *data, size_t size,
 633              const void *l3)
 634 {
 635     const struct udp_header *udp = data;
 636     size_t udp_len = ntohs(udp->udp_len);
 637
 638     if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
 639         return false;
 640     }
 641
 642     /* Validation must be skipped if checksum is 0 on IPv4 packets */
 643     return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
 644            || checksum_valid(key, data, size, l3);
 645 }
 646
 647 static inline bool
 648 check_l4_icmp(const void *data, size_t size)
 649 {
 650     return csum(data, size) == 0;
 651 }
 652
 653 static inline bool
 654 check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
 655                const void *l3)
 656 {
 657     return checksum_valid(key, data, size, l3);
 658 }
 659
 660 static inline bool
 661 extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
 662 {
 663     const struct tcp_header *tcp = data;
 664
 665     if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
 666         return false;
 667     }
 668
 669     key->src.port = tcp->tcp_src;
 670     key->dst.port = tcp->tcp_dst;
 671
 672     /* Port 0 is invalid */
 673     return key->src.port && key->dst.port;
 674 }
 675
 676 static inline bool
 677 extract_l4_udp(struct conn_key *key, const void *data, size_t size)
 678 {
 679     const struct udp_header *udp = data;
 680
 681     if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
 682         return false;
 683     }
 684
 685     key->src.port = udp->udp_src;
 686     key->dst.port = udp->udp_dst;
 687
 688     /* Port 0 is invalid */
 689     return key->src.port && key->dst.port;
 690 }
 691
 692 static inline bool extract_l4(struct conn_key *key, const void *data,
 693                               size_t size, bool *related, const void *l3);
 694
 695 /* If 'related' is not NULL and the function is processing an ICMP
 696  * error packet, extract the l3 and l4 fields from the nested header
 697  * instead and set *related to true.  If 'related' is NULL we're
 698  * already processing a nested header and no such recursion is
 699  * possible */
 700 static inline int
 701 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
 702                 bool *related)
 703 {
 704     const struct icmp_header *icmp = data;
 705
 706     if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
 707         return false;
 708     }
 709
 710     switch (icmp->icmp_type) {
 711     case ICMP4_ECHO_REQUEST:
 712     case ICMP4_ECHO_REPLY:
 713     case ICMP4_TIMESTAMP:
 714     case ICMP4_TIMESTAMPREPLY:
 715     case ICMP4_INFOREQUEST:
 716     case ICMP4_INFOREPLY:
 717         /* Separate ICMP connection: identified using id */
 718         key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
 719         break;
 720     case ICMP4_DST_UNREACH:
 721     case ICMP4_TIME_EXCEEDED:
 722     case ICMP4_PARAM_PROB:
 723     case ICMP4_SOURCEQUENCH:
 724     case ICMP4_REDIRECT: {
 725         /* ICMP packet part of another connection. We should
 726          * extract the key from embedded packet header */
 727         struct conn_key inner_key;
 728         const char *l3 = (const char *) (icmp + 1);
 729         const char *tail = (const char *) data + size;
 730         const char *l4;
 731         bool ok;
 732
 733         if (!related) {
 734             return false;
 735         }
 736
 737         memset(&inner_key, 0, sizeof inner_key);
 738         inner_key.dl_type = htons(ETH_TYPE_IP);
 739         ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
 740         if (!ok) {
 741             return false;
 742         }
 743
 744         /* pf doesn't do this, but it seems a good idea */
 745         if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
 746             || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
 747             return false;
 748         }
 749
 750         key->src = inner_key.src;
 751         key->dst = inner_key.dst;
 752         key->nw_proto = inner_key.nw_proto;
 753
 754         ok = extract_l4(key, l4, tail - l4, NULL, l3);
 755         if (ok) {
 756             conn_key_reverse(key);
 757             *related = true;
 758         }
 759         return ok;
 760     }
 761     default:
 762         return false;
 763     }
 764
 765     return true;
 766 }
 767
 768 /* If 'related' is not NULL and the function is processing an ICMP
 769  * error packet, extract the l3 and l4 fields from the nested header
 770  * instead and set *related to true.  If 'related' is NULL we're
 771  * already processing a nested header and no such recursion is
 772  * possible */
 773 static inline bool
 774 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
 775                  bool *related)
 776 {
 777     const struct icmp6_header *icmp6 = data;
 778
 779     /* All the messages that we support need at least 4 bytes after
 780      * the header */
 781     if (size < sizeof *icmp6 + 4) {
 782         return false;
 783     }
 784
 785     switch (icmp6->icmp6_type) {
 786     case ICMP6_ECHO_REQUEST:
 787     case ICMP6_ECHO_REPLY:
 788         /* Separate ICMP connection: identified using id */
 789         key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
 790         break;
 791     case ICMP6_DST_UNREACH:
 792     case ICMP6_PACKET_TOO_BIG:
 793     case ICMP6_TIME_EXCEEDED:
 794     case ICMP6_PARAM_PROB: {
 795         /* ICMP packet part of another connection. We should
 796          * extract the key from embedded packet header */
 797         struct conn_key inner_key;
 798         const char *l3 = (const char *) icmp6 + 8;
 799         const char *tail = (const char *) data + size;
 800         const char *l4 = NULL;
 801         bool ok;
 802
 803         if (!related) {
 804             return false;
 805         }
 806
 807         memset(&inner_key, 0, sizeof inner_key);
 808         inner_key.dl_type = htons(ETH_TYPE_IPV6);
 809         ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
 810         if (!ok) {
 811             return false;
 812         }
 813
 814         /* pf doesn't do this, but it seems a good idea */
 815         if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
 816                               &key->dst.addr.ipv6_aligned)
 817             || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
 818                                  &key->src.addr.ipv6_aligned)) {
 819             return false;
 820         }
 821
 822         key->src = inner_key.src;
 823         key->dst = inner_key.dst;
 824         key->nw_proto = inner_key.nw_proto;
 825
 826         ok = extract_l4(key, l4, tail - l4, NULL, l3);
 827         if (ok) {
 828             conn_key_reverse(key);
 829             *related = true;
 830         }
 831         return ok;
 832     }
 833     default:
 834         return false;
 835     }
 836
 837     return true;
 838 }
 839
 840 /* Extract l4 fields into 'key', which must already contain valid l3
 841  * members.
 842  *
 843  * If 'related' is not NULL and an ICMP error packet is being
 844  * processed, the function will extract the key from the packet nested
 845  * in the ICMP paylod and set '*related' to true.
 846  *
 847  * If 'related' is NULL, it means that we're already parsing a header nested
 848  * in an ICMP error.  In this case, we skip checksum and length validation. */
 849 static inline bool
 850 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
 851            const void *l3)
 852 {
 853     if (key->nw_proto == IPPROTO_TCP) {
 854         return (!related || check_l4_tcp(key, data, size, l3))
 855                && extract_l4_tcp(key, data, size);
 856     } else if (key->nw_proto == IPPROTO_UDP) {
 857         return (!related || check_l4_udp(key, data, size, l3))
 858                && extract_l4_udp(key, data, size);
 859     } else if (key->dl_type == htons(ETH_TYPE_IP)
 860                && key->nw_proto == IPPROTO_ICMP) {
 861         return (!related || check_l4_icmp(data, size))
 862                && extract_l4_icmp(key, data, size, related);
 863     } else if (key->dl_type == htons(ETH_TYPE_IPV6)
 864                && key->nw_proto == IPPROTO_ICMPV6) {
 865         return (!related || check_l4_icmp6(key, data, size, l3))
 866                && extract_l4_icmp6(key, data, size, related);
 867     } else {
 868         return false;
 869     }
 870 }
 871
 872 static bool
 873 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
 874                  struct conn_lookup_ctx *ctx, uint16_t zone)
 875 {
 876     const struct eth_header *l2 = dp_packet_l2(pkt);
 877     const struct ip_header *l3 = dp_packet_l3(pkt);
 878     const char *l4 = dp_packet_l4(pkt);
 879     const char *tail = dp_packet_tail(pkt);
 880     bool ok;
 881
 882     memset(ctx, 0, sizeof *ctx);
 883
 884     if (!l2 || !l3 || !l4) {
 885         return false;
 886     }
 887
 888     ctx->key.zone = zone;
 889
 890     /* XXX In this function we parse the packet (again, it has already
 891      * gone through miniflow_extract()) for two reasons:
 892      *
 893      * 1) To extract the l3 addresses and l4 ports.
 894      *    We already have the l3 and l4 headers' pointers.  Extracting
 895      *    the l3 addresses and the l4 ports is really cheap, since they
 896      *    can be found at fixed locations.
 897      * 2) To extract the l3 and l4 types.
 898      *    Extracting the l3 and l4 types (especially the l3[1]) on the
 899      *    other hand is quite expensive, because they're not at a
 900      *    fixed location.
 901      *
 902      * Here's a way to avoid (2) with the help of the datapath.
 903      * The datapath doesn't keep the packet's extracted flow[2], so
 904      * using that is not an option.  We could use the packet's matching
 905      * megaflow for l3 type (it's always unwildcarded), and for l4 type
 906      * (we have to unwildcard it first).  This means either:
 907      *
 908      * a) dpif-netdev passes the matching megaflow to dp_execute_cb(), which
 909      *    is used to extract the l3 type.  Unfortunately, dp_execute_cb() is
 910      *    used also in dpif_netdev_execute(), which doesn't have a matching
 911      *    megaflow.
 912      *
 913      * b) We define an alternative OVS_ACTION_ATTR_CT, used only by the
 914      *    userspace datapath, which includes l3 (and l4) type.  The
 915      *    alternative action could be generated by ofproto-dpif specifically
 916      *    for the userspace datapath. Having a different interface for
 917      *    userspace and kernel doesn't seem very clean, though.
 918      *
 919      * ---
 920      * [1] A simple benchmark (running only the connection tracker
 921      *     over and over on the same packets) shows that if the
 922      *     l3 type is already provided we are 15% faster (running the
 923      *     connection tracker over a couple of DPDK devices with a
 924      *     stream of UDP 64-bytes packets shows that we are 4% faster).
 925      *
 926      * [2] The reasons for this are that keeping the flow increases
 927      *     (slightly) the cache footprint and increases computation
 928      *     time as we move the packet around. Most importantly, the flow
 929      *     should be updated by the actions and this can be slow, as
 930      *     we use a sparse representation (miniflow).
 931      *
 932      */
 933     ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
 934     if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
 935         ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
 936     } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
 937         ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
 938     } else {
 939         ok = false;
 940     }
 941
 942     if (ok) {
 943         if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
 944             ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
 945             return true;
 946         }
 947     }
 948
 949     return false;
 950 }
 951 \f
 952 /* Symmetric */
 953 static uint32_t
 954 conn_key_hash(const struct conn_key *key, uint32_t basis)
 955 {
 956     uint32_t hsrc, hdst, hash;
 957     int i;
 958
 959     hsrc = hdst = basis;
 960
 961     /* Hash the source and destination tuple */
 962     for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
 963         hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
 964         hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
 965     }
 966
 967     /* Even if source and destination are swapped the hash will be the same. */
 968     hash = hsrc ^ hdst;
 969
 970     /* Hash the rest of the key(L3 and L4 types and zone). */
 971     hash = hash_words((uint32_t *) &key->dst + 1,
 972                       (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
 973                       hash);
 974
 975     return hash;
 976 }
 977
 978 static void
 979 conn_key_reverse(struct conn_key *key)
 980 {
 981     struct ct_endpoint tmp;
 982     tmp = key->src;
 983     key->src = key->dst;
 984     key->dst = tmp;
 985 }
 986
 987 static void
 988 conn_key_lookup(struct conntrack_bucket *ctb,
 989                 struct conn_lookup_ctx *ctx,
 990                 long long now)
 991 {
 992     uint32_t hash = ctx->hash;
 993     struct conn *conn;
 994
 995     ctx->conn = NULL;
 996
 997     HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
 998         if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
 999                 && !conn_expired(conn, now)) {
1000             ctx->conn = conn;
1001             ctx->reply = false;
1002             break;
1003         }
1004         if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
1005                 && !conn_expired(conn, now)) {
1006             ctx->conn = conn;
1007             ctx->reply = true;
1008             break;
1009         }
1010     }
1011 }
1012
1013 static enum ct_update_res
1014 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
1015             struct dp_packet *pkt, bool reply, long long now)
1016 {
1017     return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
1018                                                       reply, now);
1019 }
1020
1021 static bool
1022 conn_expired(struct conn *conn, long long now)
1023 {
1024     return now >= conn->expiration;
1025 }
1026
1027 static bool
1028 valid_new(struct dp_packet *pkt, struct conn_key *key)
1029 {
1030     return l4_protos[key->nw_proto]->valid_new(pkt);
1031 }
1032
1033 static struct conn *
1034 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
1035          struct conn_key *key, long long now)
1036 {
1037     struct conn *newconn;
1038
1039     newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
1040
1041     if (newconn) {
1042         newconn->key = *key;
1043     }
1044
1045     return newconn;
1046 }
1047
/* Releases the memory of 'conn' with free().  Nothing else is done here;
 * in particular the connection is not unlinked from any container. */
static void
delete_conn(struct conn *conn)
{
    free(conn);
}