/*-
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2015, 2016 Nicira, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
 */

#include <config.h>

#include "conntrack-private.h"
#include "conntrack-tp.h"
#include "coverage.h"
#include "ct-dpif.h"
#include "dp-packet.h"
#include "util.h"

COVERAGE_DEFINE(conntrack_tcp_seq_chk_bypass);
COVERAGE_DEFINE(conntrack_tcp_seq_chk_failed);
COVERAGE_DEFINE(conntrack_invalid_tcp_flags);

struct tcp_peer {
    uint32_t seqlo;              /* Max sequence number sent */
    uint32_t seqhi;              /* Max the other end ACKd + win */
    uint16_t max_win;            /* largest window (pre scaling) */
    uint8_t wscale;              /* window scaling factor */
    enum ct_dpif_tcp_state state;
};

struct conn_tcp {
    struct conn up;
    struct tcp_peer peer[2];     /* 'conn' lock protected. */
};

enum {
    TCPOPT_EOL,
    TCPOPT_NOP,
    TCPOPT_WINDOW = 3,
};

/* TCP sequence numbers are 32 bit integers operated
 * on with modular arithmetic.  These macros can be
 * used to compare such integers. */
#define SEQ_LT(a,b)     INT_MOD_LT(a, b)
#define SEQ_LEQ(a,b)    INT_MOD_LEQ(a, b)
#define SEQ_GT(a,b)     INT_MOD_GT(a, b)
#define SEQ_GEQ(a,b)    INT_MOD_GEQ(a, b)

#define SEQ_MIN(a, b)   INT_MOD_MIN(a, b)
#define SEQ_MAX(a, b)   INT_MOD_MAX(a, b)
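
/* Editor's note, not part of the original pf/OVS source: the INT_MOD_*
 * helpers (from "util.h") are assumed to compare the signed difference of
 * their arguments, the usual serial-number arithmetic, so the comparison
 * stays correct across the 2^32 wrap.  Illustrative sketch:
 *
 *     SEQ_GT(5, 0xfffffffa)      -- true: 5 is 11 "ahead" of 0xfffffffa
 *                                   once the wrap is taken into account
 *     SEQ_LT(0xfffffff0, 0x10)   -- true for the same reason
 */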

static struct conn_tcp*
conn_tcp_cast(const struct conn* conn)
{
    return CONTAINER_OF(conn, struct conn_tcp, up);
}

/* pf does this in pf_normalize_tcp(), and it is called only if scrub
 * is enabled.  We're not scrubbing, but this check seems reasonable. */
static bool
tcp_invalid_flags(uint16_t flags)
{

    if (flags & TCP_SYN) {
        if (flags & TCP_RST || flags & TCP_FIN) {
            return true;
        }
    } else {
        /* Illegal packet */
        if (!(flags & (TCP_ACK|TCP_RST))) {
            return true;
        }
    }

    if (!(flags & TCP_ACK)) {
        /* These flags are only valid if ACK is set */
        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
            return true;
        }
    }

    return false;
}
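
/* Editor's note, not part of the original source: a few examples of what
 * the check above rejects and accepts, read straight off the logic:
 *
 *     tcp_invalid_flags(TCP_SYN)              -> false (plain SYN is fine)
 *     tcp_invalid_flags(TCP_SYN | TCP_RST)    -> true  (SYN must not carry RST)
 *     tcp_invalid_flags(TCP_SYN | TCP_FIN)    -> true  (SYN must not carry FIN)
 *     tcp_invalid_flags(TCP_FIN)              -> true  (FIN requires ACK)
 *     tcp_invalid_flags(TCP_FIN | TCP_ACK)    -> false
 */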

#define TCP_MAX_WSCALE 14
#define CT_WSCALE_FLAG 0x80
#define CT_WSCALE_UNKNOWN 0x40
#define CT_WSCALE_MASK 0xf

static uint8_t
tcp_get_wscale(const struct tcp_header *tcp)
{
    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
    const uint8_t *opt = (const uint8_t *)(tcp + 1);
    uint8_t wscale = 0;
    uint8_t optlen;

    while (len >= 3) {
        switch (*opt) {
        case TCPOPT_EOL:
            return wscale;
        case TCPOPT_NOP:
            opt++;
            len--;
            break;
        case TCPOPT_WINDOW:
            wscale = MIN(opt[2], TCP_MAX_WSCALE);
            wscale |= CT_WSCALE_FLAG;
            /* fall through */
        default:
            optlen = opt[1];
            if (optlen < 2) {
                optlen = 2;
            }
            len -= optlen;
            opt += optlen;
        }
    }

    return wscale;
}
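
/* Editor's note, not part of the original source: a worked example of the
 * option walk above.  For a SYN whose option bytes are
 *
 *     0x01  0x03 0x03 0x07    (NOP, then Window Scale: kind 3, len 3,
 *                              shift count 7)
 *
 * the NOP is skipped, the Window Scale case sets wscale to
 * CT_WSCALE_FLAG | 7 (0x87), and the fall-through into 'default' advances
 * past the 3-byte option.  A SYN without the option returns 0, so
 * CT_WSCALE_FLAG stays clear and the peer is treated as not scaling. */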

static bool
tcp_bypass_seq_chk(struct conntrack *ct)
{
    if (!conntrack_get_tcp_seq_chk(ct)) {
        COVERAGE_INC(conntrack_tcp_seq_chk_bypass);
        return true;
    }
    return false;
}

static enum ct_update_res
tcp_conn_update(struct conntrack *ct, struct conn *conn_,
                struct dp_packet *pkt, bool reply, long long now)
{
    struct conn_tcp *conn = conn_tcp_cast(conn_);
    struct tcp_header *tcp = dp_packet_l4(pkt);
    /* The peer that sent 'pkt' */
    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
    /* The peer that should receive 'pkt' */
    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
    uint8_t sws = 0, dws = 0;
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

    uint16_t win = ntohs(tcp->tcp_winsz);
    uint32_t ack, end, seq, orig_seq;
    uint32_t p_len = tcp_payload_length(pkt);

    if (tcp_invalid_flags(tcp_flags)) {
        COVERAGE_INC(conntrack_invalid_tcp_flags);
        return CT_UPDATE_INVALID;
    }

    if ((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN) {
        if (dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            src->state = dst->state = CT_DPIF_TCPS_CLOSED;
            return CT_UPDATE_NEW;
        } else if (src->state <= CT_DPIF_TCPS_SYN_SENT) {
            src->state = CT_DPIF_TCPS_SYN_SENT;
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIRST_PACKET, now);
            return CT_UPDATE_VALID_NEW;
        }
    }

    if (src->wscale & CT_WSCALE_FLAG
        && dst->wscale & CT_WSCALE_FLAG
        && !(tcp_flags & TCP_SYN)) {

        sws = src->wscale & CT_WSCALE_MASK;
        dws = dst->wscale & CT_WSCALE_MASK;

    } else if (src->wscale & CT_WSCALE_UNKNOWN
               && dst->wscale & CT_WSCALE_UNKNOWN
               && !(tcp_flags & TCP_SYN)) {

        sws = TCP_MAX_WSCALE;
        dws = TCP_MAX_WSCALE;
    }

    /*
     * Sequence tracking algorithm from Guido van Rooij's paper:
     * http://www.madison-gurkha.com/publications/tcp_filtering/
     * tcp_filtering.ps
     */

    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
    bool check_ackskew = true;
    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
        /* First packet from this end. Set its state */

        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));

        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
            if (dst->wscale & CT_WSCALE_FLAG) {
                src->wscale = tcp_get_wscale(tcp);
                if (src->wscale & CT_WSCALE_FLAG) {
                    /* Remove scale factor from initial window */
                    sws = src->wscale & CT_WSCALE_MASK;
                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
                    dws = dst->wscale & CT_WSCALE_MASK;
                } else {
                    /* fixup other window */
                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
                    /* in case of a retrans SYN|ACK */
                    dst->wscale = 0;
                }
            }
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }

        src->seqlo = seq;
        src->state = CT_DPIF_TCPS_SYN_SENT;
        /*
         * May need to slide the window (seqhi may have been set by
         * the crappy stack check or if we picked up the connection
         * after establishment)
         */
        if (src->seqhi == 1
            || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
            src->seqhi = end + MAX(1, dst->max_win << dws);
            /* We are either picking up a new connection or a connection which
             * was already in place.  We are more permissive in terms of
             * ackskew checking in these cases.
             */
            check_ackskew = false;
        }
        if (win > src->max_win) {
            src->max_win = win;
        }

    } else {
        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }
    }

    if ((tcp_flags & TCP_ACK) == 0) {
        /* Let it pass through the ack skew check */
        ack = dst->seqlo;
    } else if ((ack == 0
                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
               /* broken tcp stacks do not set ack */) {
        /* Many stacks (ours included) will set the ACK number in an
         * FIN|ACK if the SYN times out -- no sequence to ACK. */
        ack = dst->seqlo;
    }

    if (seq == end) {
        /* Ease sequencing restrictions on no data packets */
        seq = src->seqlo;
        end = seq;
    }

    int ackskew = check_ackskew ? dst->seqlo - ack : 0;
#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
    if ((SEQ_GEQ(src->seqhi, end)
        /* Last octet inside other's window space */
        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
        /* Retrans: not more than one window back */
        && (ackskew >= -MAXACKWINDOW)
        /* Acking not more than one reassembled fragment backwards */
        && (ackskew <= (MAXACKWINDOW << sws))
        /* Acking not more than one window forward */
        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo)))
        || tcp_bypass_seq_chk(ct)) {
        /* Require an exact/+1 sequence match on resets when possible */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /* update states */
        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
            src->state = CT_DPIF_TCPS_SYN_SENT;
        }
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }
        if (tcp_flags & TCP_ACK) {
            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
                dst->state = CT_DPIF_TCPS_ESTABLISHED;
            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
            }
        }
        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }

        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSED, now);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIN_WAIT, now);
        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_OPENING, now);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSING, now);
        } else {
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_ESTABLISHED, now);
        }
    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
               /* Within a window forward of the originating packet */
               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
        /* Within a window backward of the originating packet */

        /*
         * This currently handles three situations:
         *  1) Stupid stacks will shotgun SYNs before their peer
         *     replies.
         *  2) When PF catches an already established stream (the
         *     firewall rebooted, the state table was flushed, routes
         *     changed...)
         *  3) Packets get funky immediately after the connection
         *     closes (this should catch Solaris spurious ACK|FINs
         *     that web servers like to spew after a close)
         *
         * This must be a little more careful than the above code
         * since packet floods will also be caught here. We don't
         * update the TTL here to mitigate the damage of a packet
         * flood and so the same code can handle awkward establishment
         * and a loosened connection close.
         * In the establishment case, a correct peer response will
         * validate the connection, go through the normal state code
         * and keep updating the state TTL.
         */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /*
         * Cannot set dst->seqhi here since this could be a shotgunned
         * SYN and not an already established connection.
         */

        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }

        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }
    } else {
        COVERAGE_INC(conntrack_tcp_seq_chk_failed);
        return CT_UPDATE_INVALID;
    }

    return CT_UPDATE_VALID;
}
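
/* Editor's note, not part of the original source: a worked example of the
 * acceptance check above, with made-up numbers.  Take an established
 * connection with sws = dws = 0, src->seqlo = 5000, src->seqhi = 6000,
 * dst->seqlo = 3000 and dst->max_win = 1000.  A packet carrying seq = 5500,
 * 100 bytes of payload and ack = 2900 gives end = 5600 <= src->seqhi,
 * seq >= src->seqlo - dst->max_win = 4000, and ackskew = 3000 - 2900 = 100,
 * well within [-MAXACKWINDOW, MAXACKWINDOW << sws].  The packet is accepted
 * and src->seqlo advances to 5600. */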

static bool
tcp_valid_new(struct dp_packet *pkt)
{
    struct tcp_header *tcp = dp_packet_l4(pkt);
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

    if (tcp_invalid_flags(tcp_flags)) {
        return false;
    }

    /* A syn+ack is not allowed to create a connection.  We want to allow
     * totally new connections (syn) or already established, not partially
     * open (syn+ack). */
    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
        return false;
    }

    return true;
}

static struct conn *
tcp_new_conn(struct conntrack *ct, struct dp_packet *pkt, long long now,
             uint32_t tp_id)
{
    struct conn_tcp* newconn = NULL;
    struct tcp_header *tcp = dp_packet_l4(pkt);
    struct tcp_peer *src, *dst;
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);

    newconn = xzalloc(sizeof *newconn);

    src = &newconn->peer[0];
    dst = &newconn->peer[1];

    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;

    if (tcp_flags & TCP_SYN) {
        src->seqhi++;
        src->wscale = tcp_get_wscale(tcp);
    } else {
        src->wscale = CT_WSCALE_UNKNOWN;
        dst->wscale = CT_WSCALE_UNKNOWN;
    }
    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
    if (src->wscale & CT_WSCALE_MASK) {
        /* Remove scale factor from initial window */
        uint8_t sws = src->wscale & CT_WSCALE_MASK;
        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
    }
    if (tcp_flags & TCP_FIN) {
        src->seqhi++;
    }
    dst->seqhi = 1;
    dst->max_win = 1;
    src->state = CT_DPIF_TCPS_SYN_SENT;
    dst->state = CT_DPIF_TCPS_CLOSED;

    newconn->up.tp_id = tp_id;
    conn_init_expiration(ct, &newconn->up, CT_TM_TCP_FIRST_PACKET, now);

    return &newconn->up;
}

static uint8_t
tcp_peer_to_protoinfo_flags(const struct tcp_peer *peer)
{
    uint8_t res = 0;

    if (peer->wscale & CT_WSCALE_FLAG) {
        res |= CT_DPIF_TCPF_WINDOW_SCALE;
    }

    if (peer->wscale & CT_WSCALE_UNKNOWN) {
        res |= CT_DPIF_TCPF_BE_LIBERAL;
    }

    return res;
}

static void
tcp_conn_get_protoinfo(const struct conn *conn_,
                       struct ct_dpif_protoinfo *protoinfo)
{
    const struct conn_tcp *conn = conn_tcp_cast(conn_);

    protoinfo->proto = IPPROTO_TCP;
    protoinfo->tcp.state_orig = conn->peer[0].state;
    protoinfo->tcp.state_reply = conn->peer[1].state;

    protoinfo->tcp.wscale_orig = conn->peer[0].wscale & CT_WSCALE_MASK;
    protoinfo->tcp.wscale_reply = conn->peer[1].wscale & CT_WSCALE_MASK;

    protoinfo->tcp.flags_orig = tcp_peer_to_protoinfo_flags(&conn->peer[0]);
    protoinfo->tcp.flags_reply = tcp_peer_to_protoinfo_flags(&conn->peer[1]);
}

struct ct_l4_proto ct_proto_tcp = {
    .new_conn = tcp_new_conn,
    .valid_new = tcp_valid_new,
    .conn_update = tcp_conn_update,
    .conn_get_protoinfo = tcp_conn_get_protoinfo,
};
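
/* Editor's note, not part of the original source: 'ct_proto_tcp' is the
 * per-protocol vtable consumed by the generic connection tracker (see
 * lib/conntrack.c), which is assumed to select it by IP protocol number:
 * valid_new()/new_conn() run when a packet may create a connection,
 * conn_update() runs for packets on an existing connection, and
 * conn_get_protoinfo() exports TCP state through the ct-dpif interface. */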