2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 Nicira, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
41 #include "conntrack-private.h"
42 #include "conntrack-tp.h"
45 #include "dp-packet.h"
/* Coverage counters: bumped when a segment bypasses the TCP sequence
 * window check, fails it, or carries an invalid TCP flag combination. */
COVERAGE_DEFINE(conntrack_tcp_seq_chk_bypass);
COVERAGE_DEFINE(conntrack_tcp_seq_chk_failed);
COVERAGE_DEFINE(conntrack_invalid_tcp_flags);
53 uint32_t seqlo
; /* Max sequence number sent */
54 uint32_t seqhi
; /* Max the other end ACKd + win */
55 uint16_t max_win
; /* largest window (pre scaling) */
56 uint8_t wscale
; /* window scaling factor */
57 enum ct_dpif_tcp_state state
;
62 struct tcp_peer peer
[2]; /* 'conn' lock protected. */
/* TCP sequence numbers are 32 bit integers operated
 * on with modular arithmetic. These macros can be
 * used to compare such integers. */
/* NOTE(review): INT_MOD_* comparators are presumably supplied by
 * "conntrack-private.h" (included above) — confirm if relocating. */
#define SEQ_LT(a,b) INT_MOD_LT(a, b)
#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b)
#define SEQ_GT(a,b) INT_MOD_GT(a, b)
#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b)
#define SEQ_MIN(a, b) INT_MOD_MIN(a, b)
#define SEQ_MAX(a, b) INT_MOD_MAX(a, b)
82 static struct conn_tcp
*
83 conn_tcp_cast(const struct conn
* conn
)
85 return CONTAINER_OF(conn
, struct conn_tcp
, up
);
88 /* pf does this in in pf_normalize_tcp(), and it is called only if scrub
89 * is enabled. We're not scrubbing, but this check seems reasonable. */
91 tcp_invalid_flags(uint16_t flags
)
94 if (flags
& TCP_SYN
) {
95 if (flags
& TCP_RST
|| flags
& TCP_FIN
) {
100 if (!(flags
& (TCP_ACK
|TCP_RST
))) {
105 if (!(flags
& TCP_ACK
)) {
106 /* These flags are only valid if ACK is set */
107 if ((flags
& TCP_FIN
) || (flags
& TCP_PSH
) || (flags
& TCP_URG
)) {
/* Largest window-scale shift we honor (the RFC 7323 maximum). */
#define TCP_MAX_WSCALE 14
/* Set in 'wscale' once a window-scale option has actually been seen. */
#define CT_WSCALE_FLAG 0x80
/* Set when the scale factor could not be learned (no SYN observed). */
#define CT_WSCALE_UNKNOWN 0x40
/* Low bits of 'wscale' hold the shift count itself. */
#define CT_WSCALE_MASK 0xf
121 tcp_get_wscale(const struct tcp_header
*tcp
)
123 int len
= TCP_OFFSET(tcp
->tcp_ctl
) * 4 - sizeof *tcp
;
124 const uint8_t *opt
= (const uint8_t *)(tcp
+ 1);
137 wscale
= MIN(opt
[2], TCP_MAX_WSCALE
);
138 wscale
|= CT_WSCALE_FLAG
;
154 tcp_bypass_seq_chk(struct conntrack
*ct
)
156 if (!conntrack_get_tcp_seq_chk(ct
)) {
157 COVERAGE_INC(conntrack_tcp_seq_chk_bypass
);
163 static enum ct_update_res
164 tcp_conn_update(struct conntrack
*ct
, struct conn
*conn_
,
165 struct dp_packet
*pkt
, bool reply
, long long now
)
167 struct conn_tcp
*conn
= conn_tcp_cast(conn_
);
168 struct tcp_header
*tcp
= dp_packet_l4(pkt
);
169 /* The peer that sent 'pkt' */
170 struct tcp_peer
*src
= &conn
->peer
[reply
? 1 : 0];
171 /* The peer that should receive 'pkt' */
172 struct tcp_peer
*dst
= &conn
->peer
[reply
? 0 : 1];
173 uint8_t sws
= 0, dws
= 0;
174 uint16_t tcp_flags
= TCP_FLAGS(tcp
->tcp_ctl
);
176 uint16_t win
= ntohs(tcp
->tcp_winsz
);
177 uint32_t ack
, end
, seq
, orig_seq
;
178 uint32_t p_len
= tcp_payload_length(pkt
);
180 if (tcp_invalid_flags(tcp_flags
)) {
181 COVERAGE_INC(conntrack_invalid_tcp_flags
);
182 return CT_UPDATE_INVALID
;
185 if ((tcp_flags
& (TCP_SYN
| TCP_ACK
)) == TCP_SYN
) {
186 if (dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
187 && src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
188 src
->state
= dst
->state
= CT_DPIF_TCPS_CLOSED
;
189 return CT_UPDATE_NEW
;
190 } else if (src
->state
<= CT_DPIF_TCPS_SYN_SENT
) {
191 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
192 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_FIRST_PACKET
, now
);
193 return CT_UPDATE_VALID_NEW
;
197 if (src
->wscale
& CT_WSCALE_FLAG
198 && dst
->wscale
& CT_WSCALE_FLAG
199 && !(tcp_flags
& TCP_SYN
)) {
201 sws
= src
->wscale
& CT_WSCALE_MASK
;
202 dws
= dst
->wscale
& CT_WSCALE_MASK
;
204 } else if (src
->wscale
& CT_WSCALE_UNKNOWN
205 && dst
->wscale
& CT_WSCALE_UNKNOWN
206 && !(tcp_flags
& TCP_SYN
)) {
208 sws
= TCP_MAX_WSCALE
;
209 dws
= TCP_MAX_WSCALE
;
213 * Sequence tracking algorithm from Guido van Rooij's paper:
214 * http://www.madison-gurkha.com/publications/tcp_filtering/
218 orig_seq
= seq
= ntohl(get_16aligned_be32(&tcp
->tcp_seq
));
219 bool check_ackskew
= true;
220 if (src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
221 /* First packet from this end. Set its state */
223 ack
= ntohl(get_16aligned_be32(&tcp
->tcp_ack
));
226 if (tcp_flags
& TCP_SYN
) {
228 if (dst
->wscale
& CT_WSCALE_FLAG
) {
229 src
->wscale
= tcp_get_wscale(tcp
);
230 if (src
->wscale
& CT_WSCALE_FLAG
) {
231 /* Remove scale factor from initial window */
232 sws
= src
->wscale
& CT_WSCALE_MASK
;
233 win
= DIV_ROUND_UP((uint32_t) win
, 1 << sws
);
234 dws
= dst
->wscale
& CT_WSCALE_MASK
;
236 /* fixup other window */
237 dst
->max_win
<<= dst
->wscale
& CT_WSCALE_MASK
;
238 /* in case of a retrans SYN|ACK */
243 if (tcp_flags
& TCP_FIN
) {
248 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
250 * May need to slide the window (seqhi may have been set by
251 * the crappy stack check or if we picked up the connection
252 * after establishment)
255 || SEQ_GEQ(end
+ MAX(1, dst
->max_win
<< dws
), src
->seqhi
)) {
256 src
->seqhi
= end
+ MAX(1, dst
->max_win
<< dws
);
257 /* We are either picking up a new connection or a connection which
258 * was already in place. We are more permissive in terms of
259 * ackskew checking in these cases.
261 check_ackskew
= false;
263 if (win
> src
->max_win
) {
268 ack
= ntohl(get_16aligned_be32(&tcp
->tcp_ack
));
270 if (tcp_flags
& TCP_SYN
) {
273 if (tcp_flags
& TCP_FIN
) {
278 if ((tcp_flags
& TCP_ACK
) == 0) {
279 /* Let it pass through the ack skew check */
282 && (tcp_flags
& (TCP_ACK
|TCP_RST
)) == (TCP_ACK
|TCP_RST
))
283 /* broken tcp stacks do not set ack */) {
284 /* Many stacks (ours included) will set the ACK number in an
285 * FIN|ACK if the SYN times out -- no sequence to ACK. */
290 /* Ease sequencing restrictions on no data packets */
295 int ackskew
= check_ackskew
? dst
->seqlo
- ack
: 0;
296 #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
297 if ((SEQ_GEQ(src
->seqhi
, end
)
298 /* Last octet inside other's window space */
299 && SEQ_GEQ(seq
, src
->seqlo
- (dst
->max_win
<< dws
))
300 /* Retrans: not more than one window back */
301 && (ackskew
>= -MAXACKWINDOW
)
302 /* Acking not more than one reassembled fragment backwards */
303 && (ackskew
<= (MAXACKWINDOW
<< sws
))
304 /* Acking not more than one window forward */
305 && ((tcp_flags
& TCP_RST
) == 0 || orig_seq
== src
->seqlo
306 || (orig_seq
== src
->seqlo
+ 1) || (orig_seq
+ 1 == src
->seqlo
)))
307 || tcp_bypass_seq_chk(ct
)) {
308 /* Require an exact/+1 sequence match on resets when possible */
310 /* update max window */
311 if (src
->max_win
< win
) {
314 /* synchronize sequencing */
315 if (SEQ_GT(end
, src
->seqlo
)) {
318 /* slide the window of what the other end can send */
319 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
320 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
324 if (tcp_flags
& TCP_SYN
&& src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
325 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
327 if (tcp_flags
& TCP_FIN
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
328 src
->state
= CT_DPIF_TCPS_CLOSING
;
330 if (tcp_flags
& TCP_ACK
) {
331 if (dst
->state
== CT_DPIF_TCPS_SYN_SENT
) {
332 dst
->state
= CT_DPIF_TCPS_ESTABLISHED
;
333 } else if (dst
->state
== CT_DPIF_TCPS_CLOSING
) {
334 dst
->state
= CT_DPIF_TCPS_FIN_WAIT_2
;
337 if (tcp_flags
& TCP_RST
) {
338 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
341 if (src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
342 && dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
343 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_CLOSED
, now
);
344 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
345 && dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
346 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_FIN_WAIT
, now
);
347 } else if (src
->state
< CT_DPIF_TCPS_ESTABLISHED
348 || dst
->state
< CT_DPIF_TCPS_ESTABLISHED
) {
349 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_OPENING
, now
);
350 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
351 || dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
352 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_CLOSING
, now
);
354 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_ESTABLISHED
, now
);
356 } else if ((dst
->state
< CT_DPIF_TCPS_SYN_SENT
357 || dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
358 || src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
)
359 && SEQ_GEQ(src
->seqhi
+ MAXACKWINDOW
, end
)
360 /* Within a window forward of the originating packet */
361 && SEQ_GEQ(seq
, src
->seqlo
- MAXACKWINDOW
)) {
362 /* Within a window backward of the originating packet */
365 * This currently handles three situations:
366 * 1) Stupid stacks will shotgun SYNs before their peer
368 * 2) When PF catches an already established stream (the
369 * firewall rebooted, the state table was flushed, routes
371 * 3) Packets get funky immediately after the connection
372 * closes (this should catch Solaris spurious ACK|FINs
373 * that web servers like to spew after a close)
375 * This must be a little more careful than the above code
376 * since packet floods will also be caught here. We don't
377 * update the TTL here to mitigate the damage of a packet
378 * flood and so the same code can handle awkward establishment
379 * and a loosened connection close.
380 * In the establishment case, a correct peer response will
381 * validate the connection, go through the normal state code
382 * and keep updating the state TTL.
385 /* update max window */
386 if (src
->max_win
< win
) {
389 /* synchronize sequencing */
390 if (SEQ_GT(end
, src
->seqlo
)) {
393 /* slide the window of what the other end can send */
394 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
395 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
399 * Cannot set dst->seqhi here since this could be a shotgunned
400 * SYN and not an already established connection.
403 if (tcp_flags
& TCP_FIN
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
404 src
->state
= CT_DPIF_TCPS_CLOSING
;
407 if (tcp_flags
& TCP_RST
) {
408 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
411 COVERAGE_INC(conntrack_tcp_seq_chk_failed
);
412 return CT_UPDATE_INVALID
;
415 return CT_UPDATE_VALID
;
419 tcp_valid_new(struct dp_packet
*pkt
)
421 struct tcp_header
*tcp
= dp_packet_l4(pkt
);
422 uint16_t tcp_flags
= TCP_FLAGS(tcp
->tcp_ctl
);
424 if (tcp_invalid_flags(tcp_flags
)) {
428 /* A syn+ack is not allowed to create a connection. We want to allow
429 * totally new connections (syn) or already established, not partially
431 if ((tcp_flags
& TCP_SYN
) && (tcp_flags
& TCP_ACK
)) {
439 tcp_new_conn(struct conntrack
*ct
, struct dp_packet
*pkt
, long long now
,
442 struct conn_tcp
* newconn
= NULL
;
443 struct tcp_header
*tcp
= dp_packet_l4(pkt
);
444 struct tcp_peer
*src
, *dst
;
445 uint16_t tcp_flags
= TCP_FLAGS(tcp
->tcp_ctl
);
447 newconn
= xzalloc(sizeof *newconn
);
449 src
= &newconn
->peer
[0];
450 dst
= &newconn
->peer
[1];
452 src
->seqlo
= ntohl(get_16aligned_be32(&tcp
->tcp_seq
));
453 src
->seqhi
= src
->seqlo
+ tcp_payload_length(pkt
) + 1;
455 if (tcp_flags
& TCP_SYN
) {
457 src
->wscale
= tcp_get_wscale(tcp
);
459 src
->wscale
= CT_WSCALE_UNKNOWN
;
460 dst
->wscale
= CT_WSCALE_UNKNOWN
;
462 src
->max_win
= MAX(ntohs(tcp
->tcp_winsz
), 1);
463 if (src
->wscale
& CT_WSCALE_MASK
) {
464 /* Remove scale factor from initial window */
465 uint8_t sws
= src
->wscale
& CT_WSCALE_MASK
;
466 src
->max_win
= DIV_ROUND_UP((uint32_t) src
->max_win
, 1 << sws
);
468 if (tcp_flags
& TCP_FIN
) {
473 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
474 dst
->state
= CT_DPIF_TCPS_CLOSED
;
476 newconn
->up
.tp_id
= tp_id
;
477 conn_init_expiration(ct
, &newconn
->up
, CT_TM_TCP_FIRST_PACKET
, now
);
483 tcp_peer_to_protoinfo_flags(const struct tcp_peer
*peer
)
487 if (peer
->wscale
& CT_WSCALE_FLAG
) {
488 res
|= CT_DPIF_TCPF_WINDOW_SCALE
;
491 if (peer
->wscale
& CT_WSCALE_UNKNOWN
) {
492 res
|= CT_DPIF_TCPF_BE_LIBERAL
;
499 tcp_conn_get_protoinfo(const struct conn
*conn_
,
500 struct ct_dpif_protoinfo
*protoinfo
)
502 const struct conn_tcp
*conn
= conn_tcp_cast(conn_
);
504 protoinfo
->proto
= IPPROTO_TCP
;
505 protoinfo
->tcp
.state_orig
= conn
->peer
[0].state
;
506 protoinfo
->tcp
.state_reply
= conn
->peer
[1].state
;
508 protoinfo
->tcp
.wscale_orig
= conn
->peer
[0].wscale
& CT_WSCALE_MASK
;
509 protoinfo
->tcp
.wscale_reply
= conn
->peer
[1].wscale
& CT_WSCALE_MASK
;
511 protoinfo
->tcp
.flags_orig
= tcp_peer_to_protoinfo_flags(&conn
->peer
[0]);
512 protoinfo
->tcp
.flags_reply
= tcp_peer_to_protoinfo_flags(&conn
->peer
[1]);
515 struct ct_l4_proto ct_proto_tcp
= {
516 .new_conn
= tcp_new_conn
,
517 .valid_new
= tcp_valid_new
,
518 .conn_update
= tcp_conn_update
,
519 .conn_get_protoinfo
= tcp_conn_get_protoinfo
,