/* Source: mirror_ovs.git, lib/conntrack-tcp.c
 * (commit: "conntrack: Fix conntrack new state", via git.proxmox.com). */
1 /*-
2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 Nicira, Inc.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 *
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35 *
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
37 */
38
39 #include <config.h>
40
41 #include "conntrack-private.h"
42 #include "coverage.h"
43 #include "ct-dpif.h"
44 #include "dp-packet.h"
45 #include "util.h"
46
COVERAGE_DEFINE(conntrack_tcp_seq_chk_bypass);
COVERAGE_DEFINE(conntrack_tcp_seq_chk_failed);
COVERAGE_DEFINE(conntrack_invalid_tcp_flags);

/* Per-direction sequence/window tracking state, one instance for each end
 * of a tracked TCP connection. */
struct tcp_peer {
    uint32_t seqlo;          /* Max sequence number sent */
    uint32_t seqhi;          /* Max the other end ACKd + win */
    uint16_t max_win;        /* largest window (pre scaling) */
    uint8_t wscale;          /* window scaling factor */
    enum ct_dpif_tcp_state state;
};

/* TCP-specific connection record: the generic 'conn' plus the two peers.
 * peer[0] is the originator, peer[1] the responder. */
struct conn_tcp {
    struct conn up;
    struct tcp_peer peer[2]; /* 'conn' lock protected. */
};

/* TCP option kinds we care about (RFC 793 / RFC 7323). */
enum {
    TCPOPT_EOL,
    TCPOPT_NOP,
    TCPOPT_WINDOW = 3,
};

/* TCP sequence numbers are 32 bit integers operated
 * on with modular arithmetic.  These macros can be
 * used to compare such integers. */
#define SEQ_LT(a,b)     INT_MOD_LT(a, b)
#define SEQ_LEQ(a,b)    INT_MOD_LEQ(a, b)
#define SEQ_GT(a,b)     INT_MOD_GT(a, b)
#define SEQ_GEQ(a,b)    INT_MOD_GEQ(a, b)

#define SEQ_MIN(a, b)   INT_MOD_MIN(a, b)
#define SEQ_MAX(a, b)   INT_MOD_MAX(a, b)
/* Downcasts the generic connection 'conn' to its enclosing conn_tcp. */
static struct conn_tcp*
conn_tcp_cast(const struct conn* conn)
{
    return CONTAINER_OF(conn, struct conn_tcp, up);
}
86
87 /* pf does this in in pf_normalize_tcp(), and it is called only if scrub
88 * is enabled. We're not scrubbing, but this check seems reasonable. */
89 static bool
90 tcp_invalid_flags(uint16_t flags)
91 {
92
93 if (flags & TCP_SYN) {
94 if (flags & TCP_RST || flags & TCP_FIN) {
95 return true;
96 }
97 } else {
98 /* Illegal packet */
99 if (!(flags & (TCP_ACK|TCP_RST))) {
100 return true;
101 }
102 }
103
104 if (!(flags & TCP_ACK)) {
105 /* These flags are only valid if ACK is set */
106 if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
107 return true;
108 }
109 }
110
111 return false;
112 }
113
#define TCP_MAX_WSCALE 14       /* Largest shift allowed by RFC 7323. */
#define CT_WSCALE_FLAG 0x80     /* Window-scale option was seen. */
#define CT_WSCALE_UNKNOWN 0x40  /* Connection picked up mid-stream. */
#define CT_WSCALE_MASK 0xf      /* Low bits hold the shift count. */

/* Walks the TCP options of 'tcp' and returns the window-scale shift count
 * (clamped to TCP_MAX_WSCALE) OR'd with CT_WSCALE_FLAG if the option is
 * present, or 0 if it is not. */
static uint8_t
tcp_get_wscale(const struct tcp_header *tcp)
{
    /* Option bytes remaining: data offset minus the fixed header size. */
    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
    const uint8_t *opt = (const uint8_t *)(tcp + 1);
    uint8_t wscale = 0;
    uint8_t optlen;

    /* The window-scale option is 3 bytes (kind, length, shift), so stop
     * once fewer than 3 bytes remain. */
    while (len >= 3) {
        switch (*opt) {
        case TCPOPT_EOL:
            return wscale;
        case TCPOPT_NOP:
            /* NOP is a single padding byte. */
            opt++;
            len--;
            break;
        case TCPOPT_WINDOW:
            wscale = MIN(opt[2], TCP_MAX_WSCALE);
            wscale |= CT_WSCALE_FLAG;
            /* fall through */
        default:
            optlen = opt[1];
            if (optlen < 2) {
                /* Guard against malformed options claiming length < 2,
                 * which would otherwise loop forever or move backwards. */
                optlen = 2;
            }
            len -= optlen;
            opt += optlen;
        }
    }

    return wscale;
}
151
/* Returns true (and bumps a coverage counter) when TCP sequence checking
 * has been administratively disabled on 'ct', meaning the caller should
 * accept the segment without window validation. */
static bool
tcp_bypass_seq_chk(struct conntrack *ct)
{
    if (!conntrack_get_tcp_seq_chk(ct)) {
        COVERAGE_INC(conntrack_tcp_seq_chk_bypass);
        return true;
    }
    return false;
}
161
/* Updates the tracked TCP state for connection 'conn_' given the new
 * segment 'pkt' sent at time 'now'; 'reply' selects which peer slot is the
 * sender.  Implements pf(4)-style window/sequence tracking (Guido van
 * Rooij's algorithm, see below).  Returns CT_UPDATE_VALID if the segment
 * fits the window, CT_UPDATE_INVALID if it must be rejected, or
 * CT_UPDATE_NEW if a bare SYN should (re)start the connection. */
static enum ct_update_res
tcp_conn_update(struct conntrack *ct, struct conn *conn_,
                struct dp_packet *pkt, bool reply, long long now)
{
    struct conn_tcp *conn = conn_tcp_cast(conn_);
    struct tcp_header *tcp = dp_packet_l4(pkt);
    /* The peer that sent 'pkt' */
    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
    /* The peer that should receive 'pkt' */
    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
    uint8_t sws = 0, dws = 0;
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

    uint16_t win = ntohs(tcp->tcp_winsz);
    uint32_t ack, end, seq, orig_seq;
    uint32_t p_len = tcp_payload_length(pkt);

    if (tcp_invalid_flags(tcp_flags)) {
        COVERAGE_INC(conntrack_invalid_tcp_flags);
        return CT_UPDATE_INVALID;
    }

    /* A bare SYN (no ACK) can restart tracking: either both ends were
     * already past FIN_WAIT_2 (connection reopen), or the originator is
     * still in/below SYN_SENT (e.g. a retransmitted SYN). */
    if ((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN) {
        if (dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            src->state = dst->state = CT_DPIF_TCPS_CLOSED;
            return CT_UPDATE_NEW;
        } else if (src->state <= CT_DPIF_TCPS_SYN_SENT) {
            src->state = CT_DPIF_TCPS_SYN_SENT;
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIRST_PACKET, now);
            return CT_UPDATE_NEW;
        }
    }

    /* Choose the effective window-scale shifts: use the negotiated values
     * when both SYNs carried the option, or the liberal maximum when the
     * connection was picked up mid-stream (both sides UNKNOWN). */
    if (src->wscale & CT_WSCALE_FLAG
        && dst->wscale & CT_WSCALE_FLAG
        && !(tcp_flags & TCP_SYN)) {

        sws = src->wscale & CT_WSCALE_MASK;
        dws = dst->wscale & CT_WSCALE_MASK;

    } else if (src->wscale & CT_WSCALE_UNKNOWN
               && dst->wscale & CT_WSCALE_UNKNOWN
               && !(tcp_flags & TCP_SYN)) {

        sws = TCP_MAX_WSCALE;
        dws = TCP_MAX_WSCALE;
    }

    /*
     * Sequence tracking algorithm from Guido van Rooij's paper:
     * http://www.madison-gurkha.com/publications/tcp_filtering/
     * tcp_filtering.ps
     */

    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
    bool check_ackskew = true;
    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
        /* First packet from this end. Set its state */

        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

        /* 'end' is one past the last sequence number used by this segment;
         * SYN and FIN each consume one sequence number. */
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
            if (dst->wscale & CT_WSCALE_FLAG) {
                src->wscale = tcp_get_wscale(tcp);
                if (src->wscale & CT_WSCALE_FLAG) {
                    /* Remove scale factor from initial window */
                    sws = src->wscale & CT_WSCALE_MASK;
                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
                    dws = dst->wscale & CT_WSCALE_MASK;
                } else {
                    /* fixup other window */
                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
                    /* in case of a retrans SYN|ACK */
                    dst->wscale = 0;
                }
            }
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }

        src->seqlo = seq;
        src->state = CT_DPIF_TCPS_SYN_SENT;
        /*
         * May need to slide the window (seqhi may have been set by
         * the crappy stack check or if we picked up the connection
         * after establishment)
         */
        if (src->seqhi == 1
            || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
            src->seqhi = end + MAX(1, dst->max_win << dws);
            /* We are either picking up a new connection or a connection which
             * was already in place.  We are more permissive in terms of
             * ackskew checking in these cases.
             */
            check_ackskew = false;
        }
        if (win > src->max_win) {
            src->max_win = win;
        }

    } else {
        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }
    }

    if ((tcp_flags & TCP_ACK) == 0) {
        /* Let it pass through the ack skew check */
        ack = dst->seqlo;
    } else if ((ack == 0
                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
               /* broken tcp stacks do not set ack */) {
        /* Many stacks (ours included) will set the ACK number in an
         * FIN|ACK if the SYN times out -- no sequence to ACK. */
        ack = dst->seqlo;
    }

    if (seq == end) {
        /* Ease sequencing restrictions on no data packets */
        seq = src->seqlo;
        end = seq;
    }

    int ackskew = check_ackskew ? dst->seqlo - ack : 0;
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
    if ((SEQ_GEQ(src->seqhi, end)
        /* Last octet inside other's window space */
        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
        /* Retrans: not more than one window back */
        && (ackskew >= -MAXACKWINDOW)
        /* Acking not more than one reassembled fragment backwards */
        && (ackskew <= (MAXACKWINDOW << sws))
        /* Acking not more than one window forward */
        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo)))
        || tcp_bypass_seq_chk(ct)) {
        /* Require an exact/+1 sequence match on resets when possible */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /* update states */
        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
            src->state = CT_DPIF_TCPS_SYN_SENT;
        }
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }
        if (tcp_flags & TCP_ACK) {
            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
                dst->state = CT_DPIF_TCPS_ESTABLISHED;
            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
            }
        }
        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }

        /* Refresh the expiration timer according to the connection phase:
         * fully closed, half closed, still opening, closing, or steady
         * established state. */
        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSED, now);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIN_WAIT, now);
        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_OPENING, now);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSING, now);
        } else {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_ESTABLISHED, now);
        }
    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
               /* Within a window forward of the originating packet */
               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
        /* Within a window backward of the originating packet */

        /*
         * This currently handles three situations:
         *  1) Stupid stacks will shotgun SYNs before their peer
         *     replies.
         *  2) When PF catches an already established stream (the
         *     firewall rebooted, the state table was flushed, routes
         *     changed...)
         *  3) Packets get funky immediately after the connection
         *     closes (this should catch Solaris spurious ACK|FINs
         *     that web servers like to spew after a close)
         *
         * This must be a little more careful than the above code
         * since packet floods will also be caught here. We don't
         * update the TTL here to mitigate the damage of a packet
         * flood and so the same code can handle awkward establishment
         * and a loosened connection close.
         * In the establishment case, a correct peer response will
         * validate the connection, go through the normal state code
         * and keep updating the state TTL.
         */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /*
         * Cannot set dst->seqhi here since this could be a shotgunned
         * SYN and not an already established connection.
         */

        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }

        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }
    } else {
        COVERAGE_INC(conntrack_tcp_seq_chk_failed);
        return CT_UPDATE_INVALID;
    }

    return CT_UPDATE_VALID;
}
416
417 static bool
418 tcp_valid_new(struct dp_packet *pkt)
419 {
420 struct tcp_header *tcp = dp_packet_l4(pkt);
421 uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
422
423 if (tcp_invalid_flags(tcp_flags)) {
424 return false;
425 }
426
427 /* A syn+ack is not allowed to create a connection. We want to allow
428 * totally new connections (syn) or already established, not partially
429 * open (syn+ack). */
430 if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
431 return false;
432 }
433
434 return true;
435 }
436
/* Allocates and initializes tracking state for a new TCP connection whose
 * first observed segment is 'pkt'.  peer[0] is the sender of 'pkt'
 * (originator), peer[1] the responder.  The caller owns the returned conn.
 * Mirrors the state creation logic of OpenBSD pf. */
static struct conn *
tcp_new_conn(struct conntrack *ct, struct dp_packet *pkt, long long now)
{
    struct conn_tcp* newconn = NULL;
    struct tcp_header *tcp = dp_packet_l4(pkt);
    struct tcp_peer *src, *dst;
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

    newconn = xzalloc(sizeof *newconn);

    src = &newconn->peer[0];
    dst = &newconn->peer[1];

    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;

    if (tcp_flags & TCP_SYN) {
        /* SYN consumes one sequence number. */
        src->seqhi++;
        src->wscale = tcp_get_wscale(tcp);
    } else {
        /* Connection picked up mid-stream: window scales were never seen,
         * so mark both directions unknown (handled liberally later). */
        src->wscale = CT_WSCALE_UNKNOWN;
        dst->wscale = CT_WSCALE_UNKNOWN;
    }
    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
    if (src->wscale & CT_WSCALE_MASK) {
        /* Remove scale factor from initial window */
        uint8_t sws = src->wscale & CT_WSCALE_MASK;
        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
    }
    if (tcp_flags & TCP_FIN) {
        /* FIN also consumes one sequence number. */
        src->seqhi++;
    }
    /* Nothing seen from the responder yet; seqhi == 1 is the "unset"
     * sentinel that tcp_conn_update() recognizes when sliding windows. */
    dst->seqhi = 1;
    dst->max_win = 1;
    src->state = CT_DPIF_TCPS_SYN_SENT;
    dst->state = CT_DPIF_TCPS_CLOSED;

    conn_init_expiration(ct, &newconn->up, CT_TM_TCP_FIRST_PACKET, now);

    return &newconn->up;
}
478
479 static uint8_t
480 tcp_peer_to_protoinfo_flags(const struct tcp_peer *peer)
481 {
482 uint8_t res = 0;
483
484 if (peer->wscale & CT_WSCALE_FLAG) {
485 res |= CT_DPIF_TCPF_WINDOW_SCALE;
486 }
487
488 if (peer->wscale & CT_WSCALE_UNKNOWN) {
489 res |= CT_DPIF_TCPF_BE_LIBERAL;
490 }
491
492 return res;
493 }
494
495 static void
496 tcp_conn_get_protoinfo(const struct conn *conn_,
497 struct ct_dpif_protoinfo *protoinfo)
498 {
499 const struct conn_tcp *conn = conn_tcp_cast(conn_);
500
501 protoinfo->proto = IPPROTO_TCP;
502 protoinfo->tcp.state_orig = conn->peer[0].state;
503 protoinfo->tcp.state_reply = conn->peer[1].state;
504
505 protoinfo->tcp.wscale_orig = conn->peer[0].wscale & CT_WSCALE_MASK;
506 protoinfo->tcp.wscale_reply = conn->peer[1].wscale & CT_WSCALE_MASK;
507
508 protoinfo->tcp.flags_orig = tcp_peer_to_protoinfo_flags(&conn->peer[0]);
509 protoinfo->tcp.flags_reply = tcp_peer_to_protoinfo_flags(&conn->peer[1]);
510 }
511
/* TCP L4 operations vtable registered with the conntrack core. */
struct ct_l4_proto ct_proto_tcp = {
    .new_conn = tcp_new_conn,
    .valid_new = tcp_valid_new,
    .conn_update = tcp_conn_update,
    .conn_get_protoinfo = tcp_conn_get_protoinfo,
};