2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 Nicira, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
41 #include "conntrack-private.h"
42 #include "conntrack-tp.h"
45 #include "dp-packet.h"
/* Coverage counters: bumped when a segment bypasses the TCP sequence
 * window check, fails it, or carries an invalid TCP flag combination. */
COVERAGE_DEFINE(conntrack_tcp_seq_chk_bypass);
COVERAGE_DEFINE(conntrack_tcp_seq_chk_failed);
COVERAGE_DEFINE(conntrack_invalid_tcp_flags);
53 uint32_t seqlo
; /* Max sequence number sent */
54 uint32_t seqhi
; /* Max the other end ACKd + win */
55 uint16_t max_win
; /* largest window (pre scaling) */
56 uint8_t wscale
; /* window scaling factor */
57 enum ct_dpif_tcp_state state
;
62 struct tcp_peer peer
[2]; /* 'conn' lock protected. */
/* TCP sequence numbers are 32 bit integers operated
 * on with modular arithmetic. These macros can be
 * used to compare such integers. */
/* NOTE(review): INT_MOD_* comparators are presumably supplied by
 * "conntrack-private.h" (included above) — confirm if relocating. */
#define SEQ_LT(a,b) INT_MOD_LT(a, b)
#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b)
#define SEQ_GT(a,b) INT_MOD_GT(a, b)
#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b)
#define SEQ_MIN(a, b) INT_MOD_MIN(a, b)
#define SEQ_MAX(a, b) INT_MOD_MAX(a, b)
82 static struct conn_tcp
*
83 conn_tcp_cast(const struct conn
* conn
)
85 return CONTAINER_OF(conn
, struct conn_tcp
, up
);
88 /* pf does this in in pf_normalize_tcp(), and it is called only if scrub
89 * is enabled. We're not scrubbing, but this check seems reasonable. */
91 tcp_invalid_flags(uint16_t flags
)
94 if (flags
& TCP_SYN
) {
95 if (flags
& TCP_RST
|| flags
& TCP_FIN
) {
100 if (!(flags
& (TCP_ACK
|TCP_RST
))) {
105 if (!(flags
& TCP_ACK
)) {
106 /* These flags are only valid if ACK is set */
107 if ((flags
& TCP_FIN
) || (flags
& TCP_PSH
) || (flags
& TCP_URG
)) {
/* Largest window-scale shift we honor (the RFC 7323 maximum). */
#define TCP_MAX_WSCALE 14
/* Set in 'wscale' once a window-scale option has actually been seen. */
#define CT_WSCALE_FLAG 0x80
/* Set when the scale factor could not be learned (no SYN observed). */
#define CT_WSCALE_UNKNOWN 0x40
/* Low bits of 'wscale' hold the shift count itself. */
#define CT_WSCALE_MASK 0xf
121 tcp_get_wscale(const struct tcp_header
*tcp
)
123 int len
= TCP_OFFSET(tcp
->tcp_ctl
) * 4 - sizeof *tcp
;
124 const uint8_t *opt
= (const uint8_t *)(tcp
+ 1);
137 wscale
= MIN(opt
[2], TCP_MAX_WSCALE
);
138 wscale
|= CT_WSCALE_FLAG
;
154 tcp_bypass_seq_chk(struct conntrack
*ct
)
156 if (!conntrack_get_tcp_seq_chk(ct
)) {
157 COVERAGE_INC(conntrack_tcp_seq_chk_bypass
);
163 static enum ct_update_res
164 tcp_conn_update(struct conntrack
*ct
, struct conn
*conn_
,
165 struct dp_packet
*pkt
, bool reply
, long long now
)
167 struct conn_tcp
*conn
= conn_tcp_cast(conn_
);
168 struct tcp_header
*tcp
= dp_packet_l4(pkt
);
169 /* The peer that sent 'pkt' */
170 struct tcp_peer
*src
= &conn
->peer
[reply
? 1 : 0];
171 /* The peer that should receive 'pkt' */
172 struct tcp_peer
*dst
= &conn
->peer
[reply
? 0 : 1];
173 uint8_t sws
= 0, dws
= 0;
174 uint16_t tcp_flags
= TCP_FLAGS(tcp
->tcp_ctl
);
176 uint16_t win
= ntohs(tcp
->tcp_winsz
);
177 uint32_t ack
, end
, seq
, orig_seq
;
178 uint32_t p_len
= tcp_payload_length(pkt
);
180 if (tcp_invalid_flags(tcp_flags
)) {
181 COVERAGE_INC(conntrack_invalid_tcp_flags
);
182 return CT_UPDATE_INVALID
;
185 if ((tcp_flags
& (TCP_SYN
| TCP_ACK
)) == TCP_SYN
) {
186 if (dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
187 && src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
188 src
->state
= dst
->state
= CT_DPIF_TCPS_CLOSED
;
189 return CT_UPDATE_NEW
;
190 } else if (src
->state
<= CT_DPIF_TCPS_SYN_SENT
) {
191 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
192 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_FIRST_PACKET
, now
);
193 return CT_UPDATE_VALID_NEW
;
197 if (src
->wscale
& CT_WSCALE_FLAG
198 && dst
->wscale
& CT_WSCALE_FLAG
199 && !(tcp_flags
& TCP_SYN
)) {
201 sws
= src
->wscale
& CT_WSCALE_MASK
;
202 dws
= dst
->wscale
& CT_WSCALE_MASK
;
204 } else if (src
->wscale
& CT_WSCALE_UNKNOWN
205 && dst
->wscale
& CT_WSCALE_UNKNOWN
206 && !(tcp_flags
& TCP_SYN
)) {
208 sws
= TCP_MAX_WSCALE
;
209 dws
= TCP_MAX_WSCALE
;
213 * Sequence tracking algorithm from Guido van Rooij's paper:
214 * http://www.madison-gurkha.com/publications/tcp_filtering/
218 orig_seq
= seq
= ntohl(get_16aligned_be32(&tcp
->tcp_seq
));
219 bool check_ackskew
= true;
220 if (src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
221 /* First packet from this end. Set its state */
223 ack
= ntohl(get_16aligned_be32(&tcp
->tcp_ack
));
226 if (tcp_flags
& TCP_SYN
) {
228 if (dst
->wscale
& CT_WSCALE_FLAG
) {
229 src
->wscale
= tcp_get_wscale(tcp
);
230 if (src
->wscale
& CT_WSCALE_FLAG
) {
231 /* Remove scale factor from initial window */
232 sws
= src
->wscale
& CT_WSCALE_MASK
;
233 win
= DIV_ROUND_UP((uint32_t) win
, 1 << sws
);
234 dws
= dst
->wscale
& CT_WSCALE_MASK
;
236 /* fixup other window */
237 dst
->max_win
<<= dst
->wscale
& CT_WSCALE_MASK
;
238 /* in case of a retrans SYN|ACK */
243 if (tcp_flags
& TCP_FIN
) {
248 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
250 * May need to slide the window (seqhi may have been set by
251 * the crappy stack check or if we picked up the connection
252 * after establishment)
255 || SEQ_GEQ(end
+ MAX(1, dst
->max_win
<< dws
), src
->seqhi
)) {
256 src
->seqhi
= end
+ MAX(1, dst
->max_win
<< dws
);
257 /* We are either picking up a new connection or a connection which
258 * was already in place. We are more permissive in terms of
259 * ackskew checking in these cases.
261 check_ackskew
= false;
263 if (win
> src
->max_win
) {
268 ack
= ntohl(get_16aligned_be32(&tcp
->tcp_ack
));
270 if (tcp_flags
& TCP_SYN
) {
273 if (tcp_flags
& TCP_FIN
) {
278 if ((tcp_flags
& TCP_ACK
) == 0) {
279 /* Let it pass through the ack skew check */
282 && (tcp_flags
& (TCP_ACK
|TCP_RST
)) == (TCP_ACK
|TCP_RST
))
283 /* broken tcp stacks do not set ack */) {
284 /* Many stacks (ours included) will set the ACK number in an
285 * FIN|ACK if the SYN times out -- no sequence to ACK. */
290 /* Ease sequencing restrictions on no data packets */
295 int ackskew
= check_ackskew
? dst
->seqlo
- ack
: 0;
296 #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
297 if ((SEQ_GEQ(src
->seqhi
, end
)
298 /* Last octet inside other's window space */
299 && SEQ_GEQ(seq
, src
->seqlo
- (dst
->max_win
<< dws
))
300 /* Retrans: not more than one window back */
301 && (ackskew
>= -MAXACKWINDOW
)
302 /* Acking not more than one reassembled fragment backwards */
303 && (ackskew
<= (MAXACKWINDOW
<< sws
))
304 /* Acking not more than one window forward */
305 && ((tcp_flags
& TCP_RST
) == 0 || orig_seq
== src
->seqlo
306 || (orig_seq
== src
->seqlo
+ 1) || (orig_seq
+ 1 == src
->seqlo
)))
307 || tcp_bypass_seq_chk(ct
)) {
308 /* Require an exact/+1 sequence match on resets when possible */
310 /* update max window */
311 if (src
->max_win
< win
) {
314 /* synchronize sequencing */
315 if (SEQ_GT(end
, src
->seqlo
)) {
318 /* slide the window of what the other end can send */
319 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
320 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
324 if (tcp_flags
& TCP_SYN
&& src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
325 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
327 if (tcp_flags
& TCP_FIN
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
328 src
->state
= CT_DPIF_TCPS_CLOSING
;
330 if (tcp_flags
& TCP_ACK
) {
331 if (dst
->state
== CT_DPIF_TCPS_SYN_SENT
) {
332 dst
->state
= CT_DPIF_TCPS_ESTABLISHED
;
333 } else if (dst
->state
== CT_DPIF_TCPS_CLOSING
) {
334 dst
->state
= CT_DPIF_TCPS_FIN_WAIT_2
;
337 if (tcp_flags
& TCP_RST
) {
338 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
341 if (src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
342 && dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
343 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_CLOSED
, now
);
344 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
345 && dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
346 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_FIN_WAIT
, now
);
347 } else if (src
->state
< CT_DPIF_TCPS_ESTABLISHED
348 || dst
->state
< CT_DPIF_TCPS_ESTABLISHED
) {
349 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_OPENING
, now
);
350 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
351 || dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
352 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_CLOSING
, now
);
354 conn_update_expiration(ct
, &conn
->up
, CT_TM_TCP_ESTABLISHED
, now
);
356 } else if ((dst
->state
< CT_DPIF_TCPS_SYN_SENT
357 || dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
358 || src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
)
359 && SEQ_GEQ(src
->seqhi
+ MAXACKWINDOW
, end
)
360 /* Within a window forward of the originating packet */
361 && SEQ_GEQ(seq
, src
->seqlo
- MAXACKWINDOW
)) {
362 /* Within a window backward of the originating packet */
365 * This currently handles three situations:
366 * 1) Stupid stacks will shotgun SYNs before their peer
368 * 2) When PF catches an already established stream (the
369 * firewall rebooted, the state table was flushed, routes
371 * 3) Packets get funky immediately after the connection
372 * closes (this should catch Solaris spurious ACK|FINs
373 * that web servers like to spew after a close)
375 * This must be a little more careful than the above code
376 * since packet floods will also be caught here. We don't
377 * update the TTL here to mitigate the damage of a packet
378 * flood and so the same code can handle awkward establishment
379 * and a loosened connection close.
380 * In the establishment case, a correct peer response will
381 * validate the connection, go through the normal state code
382 * and keep updating the state TTL.
385 /* update max window */
386 if (src
->max_win
< win
) {
389 /* synchronize sequencing */
390 if (SEQ_GT(end
, src
->seqlo
)) {
393 /* slide the window of what the other end can send */
394 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
395 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
399 * Cannot set dst->seqhi here since this could be a shotgunned
400 * SYN and not an already established connection.
403 if (tcp_flags
& TCP_FIN
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
404 src
->state
= CT_DPIF_TCPS_CLOSING
;
407 if (tcp_flags
& TCP_RST
) {
408 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
411 COVERAGE_INC(conntrack_tcp_seq_chk_failed
);
412 return CT_UPDATE_INVALID
;
415 return CT_UPDATE_VALID
;
419 tcp_valid_new(struct dp_packet
*pkt
)
421 struct tcp_header
*tcp
= dp_packet_l4(pkt
);
422 uint16_t tcp_flags
= TCP_FLAGS(tcp
->tcp_ctl
);
424 if (tcp_invalid_flags(tcp_flags
)) {
428 /* A syn+ack is not allowed to create a connection. We want to allow
429 * totally new connections (syn) or already established, not partially
431 if ((tcp_flags
& TCP_SYN
) && (tcp_flags
& TCP_ACK
)) {
439 tcp_new_conn(struct conntrack
*ct
, struct dp_packet
*pkt
, long long now
,
442 struct conn_tcp
* newconn
= NULL
;
443 struct tcp_header
*tcp
= dp_packet_l4(pkt
);
444 struct tcp_peer
*src
, *dst
;
445 uint16_t tcp_flags
= TCP_FLAGS(tcp
->tcp_ctl
);
447 newconn
= xzalloc(sizeof *newconn
);
449 src
= &newconn
->peer
[0];
450 dst
= &newconn
->peer
[1];
452 src
->seqlo
= ntohl(get_16aligned_be32(&tcp
->tcp_seq
));
453 src
->seqhi
= src
->seqlo
+ tcp_payload_length(pkt
) + 1;
455 if (tcp_flags
& TCP_SYN
) {
457 src
->wscale
= tcp_get_wscale(tcp
);
459 src
->wscale
= CT_WSCALE_UNKNOWN
;
460 dst
->wscale
= CT_WSCALE_UNKNOWN
;
462 src
->max_win
= MAX(ntohs(tcp
->tcp_winsz
), 1);
463 if (src
->wscale
& CT_WSCALE_MASK
) {
464 /* Remove scale factor from initial window */
465 uint8_t sws
= src
->wscale
& CT_WSCALE_MASK
;
466 src
->max_win
= DIV_ROUND_UP((uint32_t) src
->max_win
, 1 << sws
);
468 if (tcp_flags
& TCP_FIN
) {
473 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
474 dst
->state
= CT_DPIF_TCPS_CLOSED
;
476 newconn
->up
.tp_id
= tp_id
;
477 conn_init_expiration(ct
, &newconn
->up
, CT_TM_TCP_FIRST_PACKET
, now
);
483 tcp_peer_to_protoinfo_flags(const struct tcp_peer
*peer
)
487 if (peer
->wscale
& CT_WSCALE_FLAG
) {
488 res
|= CT_DPIF_TCPF_WINDOW_SCALE
;
491 if (peer
->wscale
& CT_WSCALE_UNKNOWN
) {
492 res
|= CT_DPIF_TCPF_BE_LIBERAL
;
499 tcp_conn_get_protoinfo(const struct conn
*conn_
,
500 struct ct_dpif_protoinfo
*protoinfo
)
502 const struct conn_tcp
*conn
= conn_tcp_cast(conn_
);
504 protoinfo
->proto
= IPPROTO_TCP
;
505 protoinfo
->tcp
.state_orig
= conn
->peer
[0].state
;
506 protoinfo
->tcp
.state_reply
= conn
->peer
[1].state
;
508 protoinfo
->tcp
.wscale_orig
= conn
->peer
[0].wscale
& CT_WSCALE_MASK
;
509 protoinfo
->tcp
.wscale_reply
= conn
->peer
[1].wscale
& CT_WSCALE_MASK
;
511 protoinfo
->tcp
.flags_orig
= tcp_peer_to_protoinfo_flags(&conn
->peer
[0]);
512 protoinfo
->tcp
.flags_reply
= tcp_peer_to_protoinfo_flags(&conn
->peer
[1]);
515 struct ct_l4_proto ct_proto_tcp
= {
516 .new_conn
= tcp_new_conn
,
517 .valid_new
= tcp_valid_new
,
518 .conn_update
= tcp_conn_update
,
519 .conn_get_protoinfo
= tcp_conn_get_protoinfo
,