]> git.proxmox.com Git - ovs.git/blob - datapath-windows/ovsext/Conntrack-tcp.c
datapath-windows: Conntrack - Fix OvsGetTcpPayloadLength()
[ovs.git] / datapath-windows / ovsext / Conntrack-tcp.c
1 /*-
2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 VMware, Inc.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 *
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35 *
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
37 */
38
39 #include "Conntrack.h"
40 #include <stddef.h>
41
42 struct tcp_peer {
43 enum ct_dpif_tcp_state state;
44 uint32_t seqlo; /* Max sequence number sent */
45 uint32_t seqhi; /* Max the other end ACKd + win */
46 uint16_t max_win;/* largest window (pre scaling) */
47 uint8_t wscale; /* window scaling factor */
48 };
49
50 struct conn_tcp {
51 struct OVS_CT_ENTRY up;
52 struct tcp_peer peer[2];
53 };
54
/* TCP option kinds examined while scanning the options area. */
enum {
    TCPOPT_EOL = 0,     /* End of option list */
    TCPOPT_NOP = 1,     /* Single-byte padding option */
    TCPOPT_WINDOW = 3,  /* Window scale option */
};
60
/* TCP sequence numbers are 32-bit integers that wrap around, so they are
 * compared with modular arithmetic: the sign of the difference tells which
 * of the two values comes first. */
#define SEQ_LT(a, b)    ((int) ((a) - (b)) < 0)
#define SEQ_LEQ(a, b)   ((int) ((a) - (b)) <= 0)
#define SEQ_GT(a, b)    ((int) ((a) - (b)) > 0)
#define SEQ_GEQ(a, b)   ((int) ((a) - (b)) >= 0)

/* Earlier / later of two sequence numbers under the ordering above. */
#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))
#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))
71
/* TCP header flag bits, applied to the flags field after it has been
 * converted to host byte order. */
#define TCP_FIN     0x001
#define TCP_SYN     0x002
#define TCP_RST     0x004
#define TCP_PSH     0x008
#define TCP_ACK     0x010
#define TCP_URG     0x020
#define TCP_ECE     0x040
#define TCP_CWR     0x080
#define TCP_NS      0x100
81
/* X-macro list of the per-peer TCP flags.  It is expanded twice below with
 * different definitions of CT_DPIF_TCP_FLAG(). */
#define CT_DPIF_TCP_FLAGS                       \
    CT_DPIF_TCP_FLAG(WINDOW_SCALE)              \
    CT_DPIF_TCP_FLAG(SACK_PERM)                 \
    CT_DPIF_TCP_FLAG(CLOSE_INIT)                \
    CT_DPIF_TCP_FLAG(BE_LIBERAL)                \
    CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED)       \
    CT_DPIF_TCP_FLAG(MAXACK_SET)

/* First expansion: <FLAG>_COUNT_ is the position of each flag in the list,
 * starting at zero. */
enum ct_dpif_tcp_flags_count_ {
#define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_,
    CT_DPIF_TCP_FLAGS
#undef CT_DPIF_TCP_FLAG
};

/* Second expansion: CT_DPIF_TCPF_<FLAG> is the single-bit mask whose bit
 * index is the flag's position in the list. */
enum ct_dpif_tcp_flags {
#define CT_DPIF_TCP_FLAG(FLAG) \
    CT_DPIF_TCPF_##FLAG = (1 << FLAG##_COUNT_),
    CT_DPIF_TCP_FLAGS
#undef CT_DPIF_TCP_FLAG
};
102
103
/* X-macro list of the tracked TCP states.  The order is significant: the
 * tracking code compares states with '<' and '>='. */
#define CT_DPIF_TCP_STATES              \
    CT_DPIF_TCP_STATE(CLOSED)           \
    CT_DPIF_TCP_STATE(LISTEN)           \
    CT_DPIF_TCP_STATE(SYN_SENT)         \
    CT_DPIF_TCP_STATE(SYN_RECV)         \
    CT_DPIF_TCP_STATE(ESTABLISHED)      \
    CT_DPIF_TCP_STATE(CLOSE_WAIT)       \
    CT_DPIF_TCP_STATE(FIN_WAIT_1)       \
    CT_DPIF_TCP_STATE(CLOSING)          \
    CT_DPIF_TCP_STATE(LAST_ACK)         \
    CT_DPIF_TCP_STATE(FIN_WAIT_2)       \
    CT_DPIF_TCP_STATE(TIME_WAIT)

/* CT_DPIF_TCPS_<STATE>, numbered from zero in list order. */
enum ct_dpif_tcp_state {
#define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE,
    CT_DPIF_TCP_STATES
#undef CT_DPIF_TCP_STATE
};
122
/* Encoding of tcp_peer.wscale: the low bits hold the shift count and the
 * high bits are markers. */
#define TCP_MAX_WSCALE      14      /* Upper bound applied to the shift */
#define CT_WSCALE_FLAG      0x80    /* A window scale option was parsed */
#define CT_WSCALE_UNKNOWN   0x40    /* Entry created from a non-SYN packet */
#define CT_WSCALE_MASK      0xf     /* Bits that hold the shift count */
127
128 /* pf does this in in pf_normalize_tcp(), and it is called only if scrub
129 * is enabled. We're not scrubbing, but this check seems reasonable. */
130 static __inline BOOLEAN
131 OvsCtInvalidTcpFlags(uint16_t flags)
132 {
133 if (flags & TCP_SYN) {
134 if (flags & TCP_RST || flags & TCP_FIN) {
135 return TRUE;
136 }
137 } else {
138 /* Illegal packet */
139 if (!(flags & (TCP_ACK|TCP_RST))) {
140 return TRUE;
141 }
142 }
143
144 if (!(flags & TCP_ACK)) {
145 /* These flags are only valid if ACK is set */
146 if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
147 return TRUE;
148 }
149 }
150
151 return FALSE;
152 }
153
154 static __inline uint8_t
155 OvsTcpGetWscale(const TCPHdr *tcp)
156 {
157 int len = tcp->doff * 4 - sizeof *tcp;
158 const uint8_t *opt = (const uint8_t *)(tcp + 1);
159 uint8_t wscale = 0;
160 uint8_t optlen;
161
162 while (len >= 3) {
163 switch (*opt) {
164 case TCPOPT_EOL:
165 return wscale;
166 case TCPOPT_NOP:
167 opt++;
168 len--;
169 break;
170 case TCPOPT_WINDOW:
171 wscale = MIN(opt[2], TCP_MAX_WSCALE);
172 wscale |= CT_WSCALE_FLAG;
173 /* fall through */
174 default:
175 optlen = opt[1];
176 if (optlen < 2) {
177 optlen = 2;
178 }
179 len -= optlen;
180 opt += optlen;
181 }
182 }
183
184 return wscale;
185 }
186
187 static __inline struct conn_tcp*
188 OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn)
189 {
190 return CONTAINER_OF(conn, struct conn_tcp, up);
191 }
192
193 enum CT_UPDATE_RES
194 OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_,
195 const TCPHdr *tcp,
196 PNET_BUFFER_LIST nbl,
197 BOOLEAN reply,
198 UINT64 now)
199 {
200 struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
201 /* The peer that sent 'pkt' */
202 struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
203 /* The peer that should receive 'pkt' */
204 struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
205 uint8_t sws = 0, dws = 0;
206 UINT16 tcp_flags = ntohs(tcp->flags);
207 uint16_t win = ntohs(tcp->window);
208 uint32_t ack, end, seq, orig_seq;
209 uint32_t p_len = OvsGetTcpPayloadLength(nbl);
210 int ackskew;
211
212 if (OvsCtInvalidTcpFlags(tcp_flags)) {
213 return CT_UPDATE_INVALID;
214 }
215
216 if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN)
217 && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
218 && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
219 src->state = dst->state = CT_DPIF_TCPS_CLOSED;
220 return CT_UPDATE_NEW;
221 }
222
223 if (src->wscale & CT_WSCALE_FLAG
224 && dst->wscale & CT_WSCALE_FLAG
225 && !(tcp_flags & TCP_SYN)) {
226
227 sws = src->wscale & CT_WSCALE_MASK;
228 dws = dst->wscale & CT_WSCALE_MASK;
229
230 } else if (src->wscale & CT_WSCALE_UNKNOWN
231 && dst->wscale & CT_WSCALE_UNKNOWN
232 && !(tcp_flags & TCP_SYN)) {
233
234 sws = TCP_MAX_WSCALE;
235 dws = TCP_MAX_WSCALE;
236 }
237
238 /*
239 * Sequence tracking algorithm from Guido van Rooij's paper:
240 * http://www.madison-gurkha.com/publications/tcp_filtering/
241 * tcp_filtering.ps
242 */
243
244 orig_seq = seq = ntohl(tcp->seq);
245 if (src->state < CT_DPIF_TCPS_SYN_SENT) {
246 /* First packet from this end. Set its state */
247
248 ack = ntohl(tcp->ack_seq);
249
250 end = seq + p_len;
251 if (tcp_flags & TCP_SYN) {
252 end++;
253 if (dst->wscale & CT_WSCALE_FLAG) {
254 src->wscale = OvsTcpGetWscale(tcp);
255 if (src->wscale & CT_WSCALE_FLAG) {
256 /* Remove scale factor from initial window */
257 sws = src->wscale & CT_WSCALE_MASK;
258 win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
259 dws = dst->wscale & CT_WSCALE_MASK;
260 } else {
261 /* fixup other window */
262 dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
263 /* in case of a retrans SYN|ACK */
264 dst->wscale = 0;
265 }
266 }
267 }
268 if (tcp_flags & TCP_FIN) {
269 end++;
270 }
271
272 src->seqlo = seq;
273 src->state = CT_DPIF_TCPS_SYN_SENT;
274 /*
275 * May need to slide the window (seqhi may have been set by
276 * the crappy stack check or if we picked up the connection
277 * after establishment)
278 */
279 if (src->seqhi == 1 ||
280 SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
281 src->seqhi = end + MAX(1, dst->max_win << dws);
282 }
283 if (win > src->max_win) {
284 src->max_win = win;
285 }
286
287 } else {
288 ack = ntohl(tcp->ack_seq);
289 end = seq + p_len;
290 if (tcp_flags & TCP_SYN) {
291 end++;
292 }
293 if (tcp_flags & TCP_FIN) {
294 end++;
295 }
296 }
297
298 if ((tcp_flags & TCP_ACK) == 0) {
299 /* Let it pass through the ack skew check */
300 ack = dst->seqlo;
301 } else if ((ack == 0
302 && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
303 /* broken tcp stacks do not set ack */) {
304 /* Many stacks (ours included) will set the ACK number in an
305 * FIN|ACK if the SYN times out -- no sequence to ACK. */
306 ack = dst->seqlo;
307 }
308
309 if (seq == end) {
310 /* Ease sequencing restrictions on no data packets */
311 seq = src->seqlo;
312 end = seq;
313 }
314
315 ackskew = dst->seqlo - ack;
316 #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
317 if (SEQ_GEQ(src->seqhi, end)
318 /* Last octet inside other's window space */
319 && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
320 /* Retrans: not more than one window back */
321 && (ackskew >= -MAXACKWINDOW)
322 /* Acking not more than one reassembled fragment backwards */
323 && (ackskew <= (MAXACKWINDOW << sws))
324 /* Acking not more than one window forward */
325 && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
326 || (orig_seq == src->seqlo + 1)
327 || (orig_seq + 1 == src->seqlo))) {
328 /* Require an exact/+1 sequence match on resets when possible */
329
330 /* update max window */
331 if (src->max_win < win) {
332 src->max_win = win;
333 }
334 /* synchronize sequencing */
335 if (SEQ_GT(end, src->seqlo)) {
336 src->seqlo = end;
337 }
338 /* slide the window of what the other end can send */
339 if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
340 dst->seqhi = ack + MAX((win << sws), 1);
341 }
342
343 /* update states */
344 if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
345 src->state = CT_DPIF_TCPS_SYN_SENT;
346 }
347 if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
348 src->state = CT_DPIF_TCPS_CLOSING;
349 }
350 if (tcp_flags & TCP_ACK) {
351 if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
352 dst->state = CT_DPIF_TCPS_ESTABLISHED;
353 } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
354 dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
355 }
356 }
357 if (tcp_flags & TCP_RST) {
358 src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
359 }
360
361 if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
362 && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
363 OvsConntrackUpdateExpiration(&conn->up, now,
364 30 * CT_INTERVAL_SEC);
365 } else if (src->state >= CT_DPIF_TCPS_CLOSING
366 && dst->state >= CT_DPIF_TCPS_CLOSING) {
367 OvsConntrackUpdateExpiration(&conn->up, now,
368 45 * CT_INTERVAL_SEC);
369 } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
370 || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
371 OvsConntrackUpdateExpiration(&conn->up, now,
372 30 * CT_INTERVAL_SEC);
373 } else if (src->state >= CT_DPIF_TCPS_CLOSING
374 || dst->state >= CT_DPIF_TCPS_CLOSING) {
375 OvsConntrackUpdateExpiration(&conn->up, now,
376 15 * 60 * CT_INTERVAL_SEC);
377 } else {
378 OvsConntrackUpdateExpiration(&conn->up, now,
379 24 * 60 * 60 * CT_INTERVAL_SEC);
380 }
381 } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
382 || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
383 || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
384 && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
385 /* Within a window forward of the originating packet */
386 && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
387 /* Within a window backward of the originating packet */
388
389 /*
390 * This currently handles three situations:
391 * 1) Stupid stacks will shotgun SYNs before their peer
392 * replies.
393 * 2) When PF catches an already established stream (the
394 * firewall rebooted, the state table was flushed, routes
395 * changed...)
396 * 3) Packets get funky immediately after the connection
397 * closes (this should catch Solaris spurious ACK|FINs
398 * that web servers like to spew after a close)
399 *
400 * This must be a little more careful than the above code
401 * since packet floods will also be caught here. We don't
402 * update the TTL here to mitigate the damage of a packet
403 * flood and so the same code can handle awkward establishment
404 * and a loosened connection close.
405 * In the establishment case, a correct peer response will
406 * validate the connection, go through the normal state code
407 * and keep updating the state TTL.
408 */
409
410 /* update max window */
411 if (src->max_win < win) {
412 src->max_win = win;
413 }
414 /* synchronize sequencing */
415 if (SEQ_GT(end, src->seqlo)) {
416 src->seqlo = end;
417 }
418 /* slide the window of what the other end can send */
419 if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
420 dst->seqhi = ack + MAX((win << sws), 1);
421 }
422
423 /*
424 * Cannot set dst->seqhi here since this could be a shotgunned
425 * SYN and not an already established connection.
426 */
427
428 if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
429 src->state = CT_DPIF_TCPS_CLOSING;
430 }
431
432 if (tcp_flags & TCP_RST) {
433 src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
434 }
435 } else {
436 return CT_UPDATE_INVALID;
437 }
438
439 return CT_UPDATE_VALID;
440 }
441
442 BOOLEAN
443 OvsConntrackValidateTcpPacket(const TCPHdr *tcp)
444 {
445 if (!tcp) {
446 return FALSE;
447 }
448
449 UINT16 tcp_flags = ntohs(tcp->flags);
450
451 if (OvsCtInvalidTcpFlags(tcp_flags)) {
452 return FALSE;
453 }
454
455 /* A syn+ack is not allowed to create a connection. We want to allow
456 * totally new connections (syn) or already established, not partially
457 * open (syn+ack). */
458 if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
459 return FALSE;
460 }
461
462 return TRUE;
463 }
464
465 OVS_CT_ENTRY *
466 OvsConntrackCreateTcpEntry(const TCPHdr *tcp,
467 PNET_BUFFER_LIST nbl,
468 UINT64 now)
469 {
470 struct conn_tcp* newconn;
471 struct tcp_peer *src, *dst;
472
473 newconn = OvsAllocateMemoryWithTag(sizeof(struct conn_tcp),
474 OVS_CT_POOL_TAG);
475 if (!newconn) {
476 return NULL;
477 }
478
479 newconn->up = (OVS_CT_ENTRY) {0};
480 src = &newconn->peer[0];
481 dst = &newconn->peer[1];
482
483 src->seqlo = ntohl(tcp->seq);
484 src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1;
485
486 if (tcp->flags & TCP_SYN) {
487 src->seqhi++;
488 src->wscale = OvsTcpGetWscale(tcp);
489 } else {
490 src->wscale = CT_WSCALE_UNKNOWN;
491 dst->wscale = CT_WSCALE_UNKNOWN;
492 }
493 src->max_win = MAX(ntohs(tcp->window), 1);
494 if (src->wscale & CT_WSCALE_MASK) {
495 /* Remove scale factor from initial window */
496 uint8_t sws = src->wscale & CT_WSCALE_MASK;
497 src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
498 }
499 if (tcp->flags & TCP_FIN) {
500 src->seqhi++;
501 }
502 dst->seqhi = 1;
503 dst->max_win = 1;
504 src->state = CT_DPIF_TCPS_SYN_SENT;
505 dst->state = CT_DPIF_TCPS_CLOSED;
506
507 OvsConntrackUpdateExpiration(&newconn->up, now, CT_ENTRY_TIMEOUT);
508
509 return &newconn->up;
510 }
511
512 static __inline uint8_t
513 OvsCtTcpPeerToProtoInfoFlags(const struct tcp_peer *peer)
514 {
515 uint8_t res = 0;
516
517 if (peer->wscale & CT_WSCALE_FLAG) {
518 res |= CT_DPIF_TCPF_WINDOW_SCALE;
519 }
520
521 if (peer->wscale & CT_WSCALE_UNKNOWN) {
522 res |= CT_DPIF_TCPF_BE_LIBERAL;
523 }
524
525 return res;
526 }
527
528 NDIS_STATUS
529 OvsCtMapTcpProtoInfoToNl(PNL_BUFFER nlBuf, OVS_CT_ENTRY *conn_)
530 {
531 struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
532 NDIS_STATUS status = NDIS_STATUS_SUCCESS;
533 UINT32 offset = 0;
534
535 offset = NlMsgStartNested(nlBuf, CTA_PROTOINFO_TCP);
536 if (!offset) {
537 return NDIS_STATUS_FAILURE;
538 }
539
540 if (!NlMsgPutTailU8(nlBuf, CTA_PROTOINFO_TCP_STATE,
541 conn->peer[0].state)) {
542 status = NDIS_STATUS_FAILURE;
543 goto done;
544 }
545 if (!NlMsgPutTailU8(nlBuf, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
546 (conn->peer[0].wscale & CT_WSCALE_MASK))) {
547 status = NDIS_STATUS_FAILURE;
548 goto done;
549 }
550 if (!NlMsgPutTailU8(nlBuf, CTA_PROTOINFO_TCP_WSCALE_REPLY,
551 (conn->peer[1].wscale & CT_WSCALE_MASK))) {
552 status = NDIS_STATUS_FAILURE;
553 goto done;
554 }
555 if (!NlMsgPutTailU16(nlBuf, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
556 OvsCtTcpPeerToProtoInfoFlags(&conn->peer[0]))) {
557 status = NDIS_STATUS_FAILURE;
558 goto done;
559 }
560 if (!NlMsgPutTailU16(nlBuf, CTA_PROTOINFO_TCP_FLAGS_REPLY,
561 OvsCtTcpPeerToProtoInfoFlags(&conn->peer[1]))) {
562 status = NDIS_STATUS_FAILURE;
563 goto done;
564 }
565
566 done:
567 NlMsgEndNested(nlBuf, offset);
568 return status;
569 }