]>
git.proxmox.com Git - mirror_ovs.git/blob - datapath-windows/ovsext/Conntrack-tcp.c
2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 VMware, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
39 #include "Conntrack.h"
43 enum ct_dpif_tcp_state state
;
44 uint32_t seqlo
; /* Max sequence number sent */
45 uint32_t seqhi
; /* Max the other end ACKd + win */
46 uint16_t max_win
;/* largest window (pre scaling) */
47 uint8_t wscale
; /* window scaling factor */
51 struct OVS_CT_ENTRY up
;
52 struct tcp_peer peer
[2];
61 /* TCP sequence numbers are 32 bit integers operated
62 * on with modular arithmetic. These macros can be
63 * used to compare such integers. */
/* Each comparison casts the difference (a)-(b) to int and tests its sign,
 * so the result reflects relative order rather than raw magnitude.
 * NOTE(review): presumably relies on 'int' being 32 bits wide so the cast
 * matches the 32-bit sequence space -- confirm for the target compiler. */
64 #define SEQ_LT(a,b) ((int)((a)-(b)) < 0)
65 #define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0)
66 #define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
67 #define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
/* SEQ_MIN/SEQ_MAX select one of the two arguments using SEQ_LT/SEQ_GT.
 * Both expand each argument twice (once in the comparison, once in the
 * result), so arguments must be free of side effects. */
69 #define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
70 #define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
82 #define CT_DPIF_TCP_FLAGS \
83 CT_DPIF_TCP_FLAG(WINDOW_SCALE) \
84 CT_DPIF_TCP_FLAG(SACK_PERM) \
85 CT_DPIF_TCP_FLAG(CLOSE_INIT) \
86 CT_DPIF_TCP_FLAG(BE_LIBERAL) \
87 CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED) \
88 CT_DPIF_TCP_FLAG(MAXACK_SET) \
90 enum ct_dpif_tcp_flags_count_ {
91 #define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_,
93 #undef CT_DPIF_TCP_FLAG
96 enum ct_dpif_tcp_flags
{
97 #define CT_DPIF_TCP_FLAG(FLAG) CT_DPIF_TCPF_##FLAG = (1 << \
100 #undef CT_DPIF_TCP_FLAG
104 #define CT_DPIF_TCP_STATES \
105 CT_DPIF_TCP_STATE(CLOSED) \
106 CT_DPIF_TCP_STATE(LISTEN) \
107 CT_DPIF_TCP_STATE(SYN_SENT) \
108 CT_DPIF_TCP_STATE(SYN_RECV) \
109 CT_DPIF_TCP_STATE(ESTABLISHED) \
110 CT_DPIF_TCP_STATE(CLOSE_WAIT) \
111 CT_DPIF_TCP_STATE(FIN_WAIT_1) \
112 CT_DPIF_TCP_STATE(CLOSING) \
113 CT_DPIF_TCP_STATE(LAST_ACK) \
114 CT_DPIF_TCP_STATE(FIN_WAIT_2) \
115 CT_DPIF_TCP_STATE(TIME_WAIT)
117 enum ct_dpif_tcp_state
{
118 #define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE,
120 #undef CT_DPIF_TCP_STATE
123 #define TCP_MAX_WSCALE 14
124 #define CT_WSCALE_FLAG 0x80
125 #define CT_WSCALE_UNKNOWN 0x40
126 #define CT_WSCALE_MASK 0xf
128 /* pf does this in pf_normalize_tcp(), and it is called only if scrub
129 * is enabled. We're not scrubbing, but this check seems reasonable. */
130 static __inline BOOLEAN
131 OvsConntrackValidateTcpFlags(const TCPHdr
*tcp
)
138 /* Here pf removes the fin flag. We simply mark the packet as
144 if (!(tcp
->ack
|| tcp
->rst
)) {
150 /* These flags are only valid if ACK is set */
151 if ((tcp
->fin
) || (tcp
->psh
) || (tcp
->urg
)) {
159 static __inline
uint8_t
160 OvsTcpGetWscale(const TCPHdr
*tcp
)
162 int len
= tcp
->doff
* 4 - sizeof *tcp
;
163 const uint8_t *opt
= (const uint8_t *)(tcp
+ 1);
168 if (*opt
== TCPOPT_EOL
) {
177 wscale
= MIN(opt
[2], TCP_MAX_WSCALE
);
178 wscale
|= CT_WSCALE_FLAG
;
193 static __inline
uint32_t
194 OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl
)
197 char *ipBuf
[sizeof(IPHdr
)];
199 curNb
= NET_BUFFER_LIST_FIRST_NB(nbl
);
200 ipHdr
= NdisGetDataBuffer(curNb
, sizeof *ipHdr
, (PVOID
) &ipBuf
,
202 TCPHdr
*tcp
= (TCPHdr
*)((PCHAR
)ipHdr
+ ipHdr
->ihl
* 4);
203 return (UINT16
)ntohs(ipHdr
->tot_len
)
209 OvsConntrackUpdateExpiration(struct conn_tcp
*conn
,
213 conn
->up
.expiration
= now
+ interval
;
216 static __inline
struct conn_tcp
*
217 OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY
* conn
)
219 return CONTAINER_OF(conn
, struct conn_tcp
, up
);
223 OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY
* conn_
,
225 PNET_BUFFER_LIST nbl
,
229 struct conn_tcp
*conn
= OvsCastConntrackEntryToTcpEntry(conn_
);
230 /* The peer that sent 'pkt' */
231 struct tcp_peer
*src
= &conn
->peer
[reply
? 1 : 0];
232 /* The peer that should receive 'pkt' */
233 struct tcp_peer
*dst
= &conn
->peer
[reply
? 0 : 1];
234 uint8_t sws
= 0, dws
= 0;
235 uint16_t win
= ntohs(tcp
->window
);
236 uint32_t ack
, end
, seq
, orig_seq
;
237 uint32_t p_len
= OvsGetTcpPayloadLength(nbl
);
240 if (OvsConntrackValidateTcpFlags(tcp
)) {
241 return CT_UPDATE_INVALID
;
244 if ((tcp
->syn
) && dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
&&
245 src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
246 src
->state
= dst
->state
= CT_DPIF_TCPS_CLOSED
;
247 return CT_UPDATE_NEW
;
250 if (src
->wscale
& CT_WSCALE_FLAG
251 && dst
->wscale
& CT_WSCALE_FLAG
254 sws
= src
->wscale
& CT_WSCALE_MASK
;
255 dws
= dst
->wscale
& CT_WSCALE_MASK
;
257 } else if (src
->wscale
& CT_WSCALE_UNKNOWN
258 && dst
->wscale
& CT_WSCALE_UNKNOWN
261 sws
= TCP_MAX_WSCALE
;
262 dws
= TCP_MAX_WSCALE
;
266 * Sequence tracking algorithm from Guido van Rooij's paper:
267 * http://www.madison-gurkha.com/publications/tcp_filtering/
271 orig_seq
= seq
= ntohl(tcp
->seq
);
272 if (src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
273 /* First packet from this end. Set its state */
275 ack
= ntohl(tcp
->ack
);
280 if (dst
->wscale
& CT_WSCALE_FLAG
) {
281 src
->wscale
= OvsTcpGetWscale(tcp
);
282 if (src
->wscale
& CT_WSCALE_FLAG
) {
283 /* Remove scale factor from initial window */
284 sws
= src
->wscale
& CT_WSCALE_MASK
;
285 win
= DIV_ROUND_UP((uint32_t) win
, 1 << sws
);
286 dws
= dst
->wscale
& CT_WSCALE_MASK
;
288 /* fixup other window */
289 dst
->max_win
<<= dst
->wscale
&
291 /* in case of a retrans SYN|ACK */
301 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
303 * May need to slide the window (seqhi may have been set by
304 * the crappy stack check or if we picked up the connection
305 * after establishment)
307 if (src
->seqhi
== 1 ||
308 SEQ_GEQ(end
+ MAX(1, dst
->max_win
<< dws
),
310 src
->seqhi
= end
+ MAX(1, dst
->max_win
<< dws
);
312 if (win
> src
->max_win
) {
317 ack
= ntohl(tcp
->ack
);
327 if ((tcp
->ack
) == 0) {
328 /* Let it pass through the ack skew check */
331 && (tcp
->ack
&& tcp
->rst
) == (TCP_ACK
|TCP_RST
))
332 /* broken tcp stacks do not set ack */) {
333 /* Many stacks (ours included) will set the ACK number in an
334 * FIN|ACK if the SYN times out -- no sequence to ACK. */
339 /* Ease sequencing restrictions on no data packets */
344 ackskew
= dst
->seqlo
- ack
;
345 #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
346 if (SEQ_GEQ(src
->seqhi
, end
)
347 /* Last octet inside other's window space */
348 && SEQ_GEQ(seq
, src
->seqlo
- (dst
->max_win
<< dws
))
349 /* Retrans: not more than one window back */
350 && (ackskew
>= -MAXACKWINDOW
)
351 /* Acking not more than one reassembled fragment backwards */
352 && (ackskew
<= (MAXACKWINDOW
<< sws
))
353 /* Acking not more than one window forward */
354 && ((tcp
->rst
) == 0 || orig_seq
== src
->seqlo
355 || (orig_seq
== src
->seqlo
+ 1)
356 || (orig_seq
+ 1 == src
->seqlo
))) {
357 /* Require an exact/+1 sequence match on resets when possible */
359 /* update max window */
360 if (src
->max_win
< win
) {
363 /* synchronize sequencing */
364 if (SEQ_GT(end
, src
->seqlo
)) {
367 /* slide the window of what the other end can send */
368 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
369 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
373 if (tcp
->syn
&& src
->state
< CT_DPIF_TCPS_SYN_SENT
) {
374 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
376 if (tcp
->fin
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
377 src
->state
= CT_DPIF_TCPS_CLOSING
;
380 if (dst
->state
== CT_DPIF_TCPS_SYN_SENT
) {
381 dst
->state
= CT_DPIF_TCPS_ESTABLISHED
;
382 } else if (dst
->state
== CT_DPIF_TCPS_CLOSING
) {
383 dst
->state
= CT_DPIF_TCPS_FIN_WAIT_2
;
387 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
390 if (src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
391 && dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
) {
392 OvsConntrackUpdateExpiration(conn
, now
, 30 * CT_INTERVAL_SEC
);
393 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
394 && dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
395 OvsConntrackUpdateExpiration(conn
, now
, 45 * CT_INTERVAL_SEC
);
396 } else if (src
->state
< CT_DPIF_TCPS_ESTABLISHED
397 || dst
->state
< CT_DPIF_TCPS_ESTABLISHED
) {
398 OvsConntrackUpdateExpiration(conn
, now
, 30 * CT_INTERVAL_SEC
);
399 } else if (src
->state
>= CT_DPIF_TCPS_CLOSING
400 || dst
->state
>= CT_DPIF_TCPS_CLOSING
) {
401 OvsConntrackUpdateExpiration(conn
, now
, 15 * 60 * CT_INTERVAL_SEC
);
403 OvsConntrackUpdateExpiration(conn
, now
, 24 * 60 * 60 * CT_INTERVAL_SEC
);
405 } else if ((dst
->state
< CT_DPIF_TCPS_SYN_SENT
406 || dst
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
407 || src
->state
>= CT_DPIF_TCPS_FIN_WAIT_2
)
408 && SEQ_GEQ(src
->seqhi
+ MAXACKWINDOW
, end
)
409 /* Within a window forward of the originating packet */
410 && SEQ_GEQ(seq
, src
->seqlo
- MAXACKWINDOW
)) {
411 /* Within a window backward of the originating packet */
414 * This currently handles three situations:
415 * 1) Stupid stacks will shotgun SYNs before their peer
417 * 2) When PF catches an already established stream (the
418 * firewall rebooted, the state table was flushed, routes
420 * 3) Packets get funky immediately after the connection
421 * closes (this should catch Solaris spurious ACK|FINs
422 * that web servers like to spew after a close)
424 * This must be a little more careful than the above code
425 * since packet floods will also be caught here. We don't
426 * update the TTL here to mitigate the damage of a packet
427 * flood and so the same code can handle awkward establishment
428 * and a loosened connection close.
429 * In the establishment case, a correct peer response will
430 * validate the connection, go through the normal state code
431 * and keep updating the state TTL.
434 /* update max window */
435 if (src
->max_win
< win
) {
438 /* synchronize sequencing */
439 if (SEQ_GT(end
, src
->seqlo
)) {
442 /* slide the window of what the other end can send */
443 if (SEQ_GEQ(ack
+ (win
<< sws
), dst
->seqhi
)) {
444 dst
->seqhi
= ack
+ MAX((win
<< sws
), 1);
448 * Cannot set dst->seqhi here since this could be a shotgunned
449 * SYN and not an already established connection.
452 if (tcp
->fin
&& src
->state
< CT_DPIF_TCPS_CLOSING
) {
453 src
->state
= CT_DPIF_TCPS_CLOSING
;
457 src
->state
= dst
->state
= CT_DPIF_TCPS_TIME_WAIT
;
460 return CT_UPDATE_INVALID
;
463 return CT_UPDATE_VALID
;
467 OvsConntrackValidateTcpPacket(const TCPHdr
*tcp
)
469 if (tcp
== NULL
|| OvsConntrackValidateTcpFlags(tcp
)) {
473 /* A syn+ack is not allowed to create a connection. We want to allow
474 * totally new connections (syn) or already established, not partially
476 if ((tcp
->syn
) && (tcp
->ack
)) {
484 OvsConntrackCreateTcpEntry(const TCPHdr
*tcp
,
485 PNET_BUFFER_LIST nbl
,
488 struct conn_tcp
* newconn
= NULL
;
489 struct tcp_peer
*src
, *dst
;
491 newconn
= OvsAllocateMemoryWithTag(sizeof(struct conn_tcp
),
493 newconn
->up
= (OVS_CT_ENTRY
) {0};
494 src
= &newconn
->peer
[0];
495 dst
= &newconn
->peer
[1];
497 src
->seqlo
= ntohl(tcp
->seq
);
498 src
->seqhi
= src
->seqlo
+ OvsGetTcpPayloadLength(nbl
) + 1;
502 src
->wscale
= OvsTcpGetWscale(tcp
);
504 src
->wscale
= CT_WSCALE_UNKNOWN
;
505 dst
->wscale
= CT_WSCALE_UNKNOWN
;
507 src
->max_win
= MAX(ntohs(tcp
->window
), 1);
508 if (src
->wscale
& CT_WSCALE_MASK
) {
509 /* Remove scale factor from initial window */
510 uint8_t sws
= src
->wscale
& CT_WSCALE_MASK
;
511 src
->max_win
= DIV_ROUND_UP((uint32_t) src
->max_win
,
519 src
->state
= CT_DPIF_TCPS_SYN_SENT
;
520 dst
->state
= CT_DPIF_TCPS_CLOSED
;
522 OvsConntrackUpdateExpiration(newconn
, now
, CT_ENTRY_TIMEOUT
);