2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 VMware, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
39 #include "Conntrack.h"
43 enum ct_dpif_tcp_state state
;
44 uint32_t seqlo
; /* Max sequence number sent */
45 uint32_t seqhi
; /* Max the other end ACKd + win */
46 uint16_t max_win
;/* largest window (pre scaling) */
47 uint8_t wscale
; /* window scaling factor */
51 struct OVS_CT_ENTRY up
;
52 struct tcp_peer peer
[2];
61 /* TCP sequence numbers are 32 bit integers operated
62 * on with modular arithmetic. These macros can be
63 * used to compare such integers. */
64 #define SEQ_LT(a,b) ((int)((a)-(b)) < 0)
65 #define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0)
66 #define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
67 #define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
69 #define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
70 #define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
82 #define CT_DPIF_TCP_FLAGS \
83 CT_DPIF_TCP_FLAG(WINDOW_SCALE) \
84 CT_DPIF_TCP_FLAG(SACK_PERM) \
85 CT_DPIF_TCP_FLAG(CLOSE_INIT) \
86 CT_DPIF_TCP_FLAG(BE_LIBERAL) \
87 CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED) \
88 CT_DPIF_TCP_FLAG(MAXACK_SET) \
90 enum ct_dpif_tcp_flags_count_ {
91 #define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_,
93 #undef CT_DPIF_TCP_FLAG
96 enum ct_dpif_tcp_flags
{
97 #define CT_DPIF_TCP_FLAG(FLAG) CT_DPIF_TCPF_##FLAG = (1 << \
100 #undef CT_DPIF_TCP_FLAG
104 #define CT_DPIF_TCP_STATES \
105 CT_DPIF_TCP_STATE(CLOSED) \
106 CT_DPIF_TCP_STATE(LISTEN) \
107 CT_DPIF_TCP_STATE(SYN_SENT) \
108 CT_DPIF_TCP_STATE(SYN_RECV) \
109 CT_DPIF_TCP_STATE(ESTABLISHED) \
110 CT_DPIF_TCP_STATE(CLOSE_WAIT) \
111 CT_DPIF_TCP_STATE(FIN_WAIT_1) \
112 CT_DPIF_TCP_STATE(CLOSING) \
113 CT_DPIF_TCP_STATE(LAST_ACK) \
114 CT_DPIF_TCP_STATE(FIN_WAIT_2) \
115 CT_DPIF_TCP_STATE(TIME_WAIT)
117 enum ct_dpif_tcp_state
{
118 #define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE,
120 #undef CT_DPIF_TCP_STATE
123 #define TCP_MAX_WSCALE 14
124 #define CT_WSCALE_FLAG 0x80
125 #define CT_WSCALE_UNKNOWN 0x40
126 #define CT_WSCALE_MASK 0xf
128 /* pf does this in pf_normalize_tcp(), and it is called only if scrub
129 * is enabled. We're not scrubbing, but this check seems reasonable. */
130 static __inline BOOLEAN
131 OvsCtInvalidTcpFlags(uint16_t flags
)
133 if (flags
& TCP_SYN
) {
134 if (flags
& TCP_RST
|| flags
& TCP_FIN
) {
139 if (!(flags
& (TCP_ACK
|TCP_RST
))) {
144 if (!(flags
& TCP_ACK
)) {
145 /* These flags are only valid if ACK is set */
146 if ((flags
& TCP_FIN
) || (flags
& TCP_PSH
) || (flags
& TCP_URG
)) {
154 static __inline
uint8_t
155 OvsTcpGetWscale(const TCPHdr
*tcp
)
157 int len
= tcp
->doff
* 4 - sizeof *tcp
;
158 const uint8_t *opt
= (const uint8_t *)(tcp
+ 1);
171 wscale
= MIN(opt
[2], TCP_MAX_WSCALE
);
172 wscale
|= CT_WSCALE_FLAG
;
187 static __inline
struct conn_tcp
*
188 OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY
* conn
)
190 return CONTAINER_OF(conn
, struct conn_tcp
, up
);
194 OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY
* conn_
,
196 PNET_BUFFER_LIST nbl
,
200 struct conn_tcp
*conn
= OvsCastConntrackEntryToTcpEntry(conn_
);
201 /* The peer that sent this packet */
202 struct tcp_peer
*src
= &conn
->peer
[reply
? 1 : 0];
203 /* The peer that should receive this packet */
204 struct tcp_peer
*dst
= &conn
->peer
[reply
? 0 : 1];
205 uint8_t sws
= 0, dws
= 0;
206 UINT16 tcp_flags
= ntohs(tcp
->flags
);
207 uint16_t win
= ntohs(tcp
->window
);
208 uint32_t ack
, end
, seq
, orig_seq
;
209 uint32_t p_len
= OvsGetTcpPayloadLength(nbl
);
212 if (OvsCtInvalidTcpFlags(tcp_flags
)) {
213 return CT_UPDATE_INVALID
;
216 if (((tcp_flags
& (TCP_SYN
|TCP_ACK
)) == TCP_SYN
)
217 && dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
218 && src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
219 src
->state
= dst
->state
= CT_DPIF_TCPS_CLOSED
;
220 return CT_UPDATE_NEW
;
223 if (src
->wscale
& CT_WSCALE_FLAG
224 && dst
->wscale
& CT_WSCALE_FLAG
225 && !(tcp_flags
& TCP_SYN
)) {
227 sws
= src
->wscale
& CT_WSCALE_MASK
;
228 dws
= dst
->wscale
& CT_WSCALE_MASK
;
230 } else if (src
->wscale
& CT_WSCALE_UNKNOWN
231 && dst
->wscale
& CT_WSCALE_UNKNOWN
232 && !(tcp_flags
& TCP_SYN
)) {
234 sws
= TCP_MAX_WSCALE
;
235 dws
= TCP_MAX_WSCALE
;
239 * Sequence tracking algorithm from Guido van Rooij's paper:
240 * http://www.madison-gurkha.com/publications/tcp_filtering/
244 orig_seq
= seq
= ntohl(tcp
->seq
);
245 if (src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
246 /* First packet from this end. Set its state */
248 ack
= ntohl(tcp
->ack_seq
);
251 if (tcp_flags
& TCP_SYN
) {
253 if (dst
->wscale
& CT_WSCALE_FLAG
) {
254 src
->wscale
= OvsTcpGetWscale(tcp
);
255 if (src
->wscale
& CT_WSCALE_FLAG
) {
256 /* Remove scale factor from initial window */
257 sws
= src
->wscale
& CT_WSCALE_MASK
;
258 win
= DIV_ROUND_UP((uint32_t) win
, 1 << sws
);
259 dws
= dst
->wscale
& CT_WSCALE_MASK
;
261 /* fixup other window */
262 dst
->max_win
<<= dst
->wscale
& CT_WSCALE_MASK
;
263 /* in case of a retrans SYN|ACK */
268 if (tcp_flags
& TCP_FIN
) {
273 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
275 * May need to slide the window (seqhi may have been set by
276 * the crappy stack check or if we picked up the connection
277 * after establishment)
279 if (src
->seqhi
== 1 ||
280 SEQ_GEQ(end
+ MAX(1, dst
->max_win
<< dws
), src
->seqhi
)) {
281 src
->seqhi
= end
+ MAX(1, dst
->max_win
<< dws
);
283 if (win
> src
->max_win
) {
288 ack
= ntohl(tcp
->ack_seq
);
290 if (tcp_flags
& TCP_SYN
) {
293 if (tcp_flags
& TCP_FIN
) {
298 if ((tcp_flags
& TCP_ACK
) == 0) {
299 /* Let it pass through the ack skew check */
302 && (tcp_flags
& (TCP_ACK
|TCP_RST
)) == (TCP_ACK
|TCP_RST
))
303 /* broken tcp stacks do not set ack */) {
304 /* Many stacks (ours included) will set the ACK number in an
305 * FIN|ACK if the SYN times out -- no sequence to ACK. */
310 /* Ease sequencing restrictions on no data packets */
315 ackskew
= dst
->seqlo
- ack
;
316 #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
317 if (SEQ_GEQ(src
->seqhi
, end
)
318 /* Last octet inside other's window space */
319 && SEQ_GEQ(seq
, src
->seqlo
- (dst
->max_win
<< dws
))
320 /* Retrans: not more than one window back */
321 && (ackskew
>= -MAXACKWINDOW
)
322 /* Acking not more than one reassembled fragment backwards */
323 && (ackskew
<= (MAXACKWINDOW
<< sws
))
324 /* Acking not more than one window forward */
325 && ((tcp_flags
& TCP_RST
) == 0 || orig_seq
== src
->seqlo
326 || (orig_seq
== src
->seqlo
+ 1)
327 || (orig_seq
+ 1 == src
->seqlo
))) {
328 /* Require an exact/+1 sequence match on resets when possible */
330 /* update max window */
331 if (src
->max_win
< win
) {
334 /* synchronize sequencing */
335 if (SEQ_GT(end
, src
->seqlo
)) {
338 /* slide the window of what the other end can send */
339 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
340 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
344 if (tcp_flags
& TCP_SYN
&& src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
345 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
347 if (tcp_flags
& TCP_FIN
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
348 src
->state
= CT_DPIF_TCPS_CLOSING
;
350 if (tcp_flags
& TCP_ACK
) {
351 if (dst
->state
== CT_DPIF_TCPS_SYN_SENT
) {
352 dst
->state
= CT_DPIF_TCPS_ESTABLISHED
;
353 } else if (dst
->state
== CT_DPIF_TCPS_CLOSING
) {
354 dst
->state
= CT_DPIF_TCPS_FIN_WAIT_2
;
357 if (tcp_flags
& TCP_RST
) {
358 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
361 if (src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
362 && dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
363 OvsConntrackUpdateExpiration(&conn
->up
, now
,
364 30 * CT_INTERVAL_SEC
);
365 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
366 && dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
367 OvsConntrackUpdateExpiration(&conn
->up
, now
,
368 45 * CT_INTERVAL_SEC
);
369 } else if (src
->state
< CT_DPIF_TCPS_ESTABLISHED
370 || dst
->state
< CT_DPIF_TCPS_ESTABLISHED
) {
371 OvsConntrackUpdateExpiration(&conn
->up
, now
,
372 30 * CT_INTERVAL_SEC
);
373 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
374 || dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
375 OvsConntrackUpdateExpiration(&conn
->up
, now
,
376 15 * 60 * CT_INTERVAL_SEC
);
378 OvsConntrackUpdateExpiration(&conn
->up
, now
,
379 24 * 60 * 60 * CT_INTERVAL_SEC
);
381 } else if ((dst
->state
< CT_DPIF_TCPS_SYN_SENT
382 || dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
383 || src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
)
384 && SEQ_GEQ(src
->seqhi
+ MAXACKWINDOW
, end
)
385 /* Within a window forward of the originating packet */
386 && SEQ_GEQ(seq
, src
->seqlo
- MAXACKWINDOW
)) {
387 /* Within a window backward of the originating packet */
390 * This currently handles three situations:
391 * 1) Stupid stacks will shotgun SYNs before their peer
393 * 2) When PF catches an already established stream (the
394 * firewall rebooted, the state table was flushed, routes
396 * 3) Packets get funky immediately after the connection
397 * closes (this should catch Solaris spurious ACK|FINs
398 * that web servers like to spew after a close)
400 * This must be a little more careful than the above code
401 * since packet floods will also be caught here. We don't
402 * update the TTL here to mitigate the damage of a packet
403 * flood and so the same code can handle awkward establishment
404 * and a loosened connection close.
405 * In the establishment case, a correct peer response will
406 * validate the connection, go through the normal state code
407 * and keep updating the state TTL.
410 /* update max window */
411 if (src
->max_win
< win
) {
414 /* synchronize sequencing */
415 if (SEQ_GT(end
, src
->seqlo
)) {
418 /* slide the window of what the other end can send */
419 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
420 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
424 * Cannot set dst->seqhi here since this could be a shotgunned
425 * SYN and not an already established connection.
428 if (tcp_flags
& TCP_FIN
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
429 src
->state
= CT_DPIF_TCPS_CLOSING
;
432 if (tcp_flags
& TCP_RST
) {
433 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
436 return CT_UPDATE_INVALID
;
439 return CT_UPDATE_VALID
;
443 OvsConntrackValidateTcpPacket(const TCPHdr
*tcp
)
449 UINT16 tcp_flags
= ntohs(tcp
->flags
);
451 if (OvsCtInvalidTcpFlags(tcp_flags
)) {
455 /* A syn+ack is not allowed to create a connection. We want to allow
456 * totally new connections (syn) or already established, not partially
458 if ((tcp_flags
& TCP_SYN
) && (tcp_flags
& TCP_ACK
)) {
466 OvsConntrackCreateTcpEntry(const TCPHdr
*tcp
,
467 PNET_BUFFER_LIST nbl
,
470 struct conn_tcp
* newconn
;
471 struct tcp_peer
*src
, *dst
;
473 newconn
= OvsAllocateMemoryWithTag(sizeof(struct conn_tcp
),
479 newconn
->up
= (OVS_CT_ENTRY
) {0};
480 src
= &newconn
->peer
[0];
481 dst
= &newconn
->peer
[1];
483 src
->seqlo
= ntohl(tcp
->seq
);
484 src
->seqhi
= src
->seqlo
+ OvsGetTcpPayloadLength(nbl
) + 1;
486 if (tcp
->flags
& TCP_SYN
) {
488 src
->wscale
= OvsTcpGetWscale(tcp
);
490 src
->wscale
= CT_WSCALE_UNKNOWN
;
491 dst
->wscale
= CT_WSCALE_UNKNOWN
;
493 src
->max_win
= MAX(ntohs(tcp
->window
), 1);
494 if (src
->wscale
& CT_WSCALE_MASK
) {
495 /* Remove scale factor from initial window */
496 uint8_t sws
= src
->wscale
& CT_WSCALE_MASK
;
497 src
->max_win
= DIV_ROUND_UP((uint32_t) src
->max_win
, 1 << sws
);
499 if (tcp
->flags
& TCP_FIN
) {
504 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
505 dst
->state
= CT_DPIF_TCPS_CLOSED
;
507 OvsConntrackUpdateExpiration(&newconn
->up
, now
, CT_ENTRY_TIMEOUT
);
512 static __inline
uint8_t
513 OvsCtTcpPeerToProtoInfoFlags(const struct tcp_peer
*peer
)
517 if (peer
->wscale
& CT_WSCALE_FLAG
) {
518 res
|= CT_DPIF_TCPF_WINDOW_SCALE
;
521 if (peer
->wscale
& CT_WSCALE_UNKNOWN
) {
522 res
|= CT_DPIF_TCPF_BE_LIBERAL
;
529 OvsCtMapTcpProtoInfoToNl(PNL_BUFFER nlBuf
, OVS_CT_ENTRY
*conn_
)
531 struct conn_tcp
*conn
= OvsCastConntrackEntryToTcpEntry(conn_
);
532 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
535 offset
= NlMsgStartNested(nlBuf
, CTA_PROTOINFO_TCP
);
537 return NDIS_STATUS_FAILURE
;
540 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTOINFO_TCP_STATE
,
541 conn
->peer
[0].state
)) {
542 status
= NDIS_STATUS_FAILURE
;
545 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL
,
546 (conn
->peer
[0].wscale
& CT_WSCALE_MASK
))) {
547 status
= NDIS_STATUS_FAILURE
;
550 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTOINFO_TCP_WSCALE_REPLY
,
551 (conn
->peer
[1].wscale
& CT_WSCALE_MASK
))) {
552 status
= NDIS_STATUS_FAILURE
;
555 if (!NlMsgPutTailU16(nlBuf
, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL
,
556 OvsCtTcpPeerToProtoInfoFlags(&conn
->peer
[0]))) {
557 status
= NDIS_STATUS_FAILURE
;
560 if (!NlMsgPutTailU16(nlBuf
, CTA_PROTOINFO_TCP_FLAGS_REPLY
,
561 OvsCtTcpPeerToProtoInfoFlags(&conn
->peer
[1]))) {
562 status
= NDIS_STATUS_FAILURE
;
567 NlMsgEndNested(nlBuf
, offset
);