]> git.proxmox.com Git - mirror_ovs.git/blob - datapath-windows/ovsext/Conntrack-tcp.c
datapath-windows: Add Geneve support
[mirror_ovs.git] / datapath-windows / ovsext / Conntrack-tcp.c
1 /*-
2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
4 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5 * Copyright (c) 2015, 2016 VMware, Inc.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 *
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35 *
36 * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
37 */
38
39 #include "Conntrack.h"
40 #include <stddef.h>
41
42 struct tcp_peer {
43 enum ct_dpif_tcp_state state;
44 uint32_t seqlo; /* Max sequence number sent */
45 uint32_t seqhi; /* Max the other end ACKd + win */
46 uint16_t max_win;/* largest window (pre scaling) */
47 uint8_t wscale; /* window scaling factor */
48 };
49
50 struct conn_tcp {
51 struct OVS_CT_ENTRY up;
52 struct tcp_peer peer[2];
53 };
54
/* TCP option kinds that the option walker needs to recognise. */
enum {
    TCPOPT_EOL = 0,     /* End of option list. */
    TCPOPT_NOP = 1,     /* Single-byte padding option. */
    TCPOPT_WINDOW = 3,  /* Window scale option. */
};
60
/* TCP sequence numbers are 32-bit integers that wrap around, so they have
 * to be compared with modular arithmetic: the sign of the difference tells
 * which one comes first.  Arguments may be evaluated more than once by
 * SEQ_MIN() and SEQ_MAX(). */
#define SEQ_LT(a, b)    ((int)((a) - (b)) < 0)
#define SEQ_GT(a, b)    ((int)((a) - (b)) > 0)
#define SEQ_LEQ(a, b)   (!SEQ_GT(a, b))
#define SEQ_GEQ(a, b)   (!SEQ_LT(a, b))

#define SEQ_MIN(a, b)   (SEQ_LT(a, b) ? (a) : (b))
#define SEQ_MAX(a, b)   (SEQ_GT(a, b) ? (a) : (b))
71
/* TCP control bits, one bit per flag, lowest bit first. */
#define TCP_FIN (1 << 0)
#define TCP_SYN (1 << 1)
#define TCP_RST (1 << 2)
#define TCP_PSH (1 << 3)
#define TCP_ACK (1 << 4)
#define TCP_URG (1 << 5)
#define TCP_ECE (1 << 6)
#define TCP_CWR (1 << 7)
#define TCP_NS  (1 << 8)
81
/* X-macro list of the per-peer TCP conntrack flags.  Each user defines
 * CT_DPIF_TCP_FLAG(FLAG) before expanding the list and undefines it
 * afterwards. */
#define CT_DPIF_TCP_FLAGS                       \
    CT_DPIF_TCP_FLAG(WINDOW_SCALE)              \
    CT_DPIF_TCP_FLAG(SACK_PERM)                 \
    CT_DPIF_TCP_FLAG(CLOSE_INIT)                \
    CT_DPIF_TCP_FLAG(BE_LIBERAL)                \
    CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED)       \
    CT_DPIF_TCP_FLAG(MAXACK_SET)

/* Position of every flag in the list (<FLAG>_COUNT_ = 0, 1, 2, ...); used
 * below as the bit index of the flag. */
enum ct_dpif_tcp_flags_count_ {
#define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_,
    CT_DPIF_TCP_FLAGS
#undef CT_DPIF_TCP_FLAG
};

/* Bit mask of every flag: CT_DPIF_TCPF_<FLAG> = 1 << <FLAG>_COUNT_. */
enum ct_dpif_tcp_flags {
#define CT_DPIF_TCP_FLAG(FLAG) CT_DPIF_TCPF_##FLAG = (1 << FLAG##_COUNT_),
    CT_DPIF_TCP_FLAGS
#undef CT_DPIF_TCP_FLAG
};
102
103
/* X-macro list of the TCP tracking states.  The order is significant: the
 * tracking code compares states with '<' and '>=', so a state listed later
 * counts as "further along" than one listed earlier. */
#define CT_DPIF_TCP_STATES                  \
    CT_DPIF_TCP_STATE(CLOSED)               \
    CT_DPIF_TCP_STATE(LISTEN)               \
    CT_DPIF_TCP_STATE(SYN_SENT)             \
    CT_DPIF_TCP_STATE(SYN_RECV)             \
    CT_DPIF_TCP_STATE(ESTABLISHED)          \
    CT_DPIF_TCP_STATE(CLOSE_WAIT)           \
    CT_DPIF_TCP_STATE(FIN_WAIT_1)           \
    CT_DPIF_TCP_STATE(CLOSING)              \
    CT_DPIF_TCP_STATE(LAST_ACK)             \
    CT_DPIF_TCP_STATE(FIN_WAIT_2)           \
    CT_DPIF_TCP_STATE(TIME_WAIT)

/* CT_DPIF_TCPS_<STATE> constants, numbered from 0 in list order. */
enum ct_dpif_tcp_state {
#define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE,
    CT_DPIF_TCP_STATES
#undef CT_DPIF_TCP_STATE
};
122
/* Largest window scale shift the tracker will apply. */
#define TCP_MAX_WSCALE      14
/* Bits stored in tcp_peer.wscale next to the shift count: */
#define CT_WSCALE_FLAG      0x80    /* A window scale option was seen. */
#define CT_WSCALE_UNKNOWN   0x40    /* Scaling unknown (entry not created
                                     * from a SYN). */
#define CT_WSCALE_MASK      0x0f    /* The shift count itself. */
127
128 /* pf does this in in pf_normalize_tcp(), and it is called only if scrub
129 * is enabled. We're not scrubbing, but this check seems reasonable. */
130 static __inline BOOLEAN
131 OvsConntrackValidateTcpFlags(const TCPHdr *tcp)
132 {
133 if (tcp->syn) {
134 if (tcp->rst) {
135 return TRUE;
136 }
137 if (tcp->fin) {
138 /* Here pf removes the fin flag. We simply mark the packet as
139 * invalid */
140 return TRUE;
141 }
142 } else {
143 /* Illegal packet */
144 if (!(tcp->ack || tcp->rst)) {
145 return TRUE;
146 }
147 }
148
149 if (!(tcp->ack)) {
150 /* These flags are only valid if ACK is set */
151 if ((tcp->fin) || (tcp->psh) || (tcp->urg)) {
152 return TRUE;
153 }
154 }
155
156 return FALSE;
157 }
158
159 static __inline uint8_t
160 OvsTcpGetWscale(const TCPHdr *tcp)
161 {
162 int len = tcp->doff * 4 - sizeof *tcp;
163 const uint8_t *opt = (const uint8_t *)(tcp + 1);
164 uint8_t wscale = 0;
165 uint8_t optlen;
166
167 while (len >= 3) {
168 if (*opt == TCPOPT_EOL) {
169 break;
170 }
171 switch (*opt) {
172 case TCPOPT_NOP:
173 opt++;
174 len--;
175 break;
176 case TCPOPT_WINDOW:
177 wscale = MIN(opt[2], TCP_MAX_WSCALE);
178 wscale |= CT_WSCALE_FLAG;
179 /* fall through */
180 default:
181 optlen = opt[1];
182 if (optlen < 2) {
183 optlen = 2;
184 }
185 len -= optlen;
186 opt += optlen;
187 }
188 }
189
190 return wscale;
191 }
192
193 static __inline uint32_t
194 OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl)
195 {
196 IPHdr *ipHdr;
197 char *ipBuf[sizeof(IPHdr)];
198 PNET_BUFFER curNb;
199 curNb = NET_BUFFER_LIST_FIRST_NB(nbl);
200 ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf,
201 1 /*no align*/, 0);
202 TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
203 return (UINT16)ntohs(ipHdr->tot_len)
204 - (ipHdr->ihl * 4)
205 - (sizeof * tcp);
206 }
207
208 static __inline void
209 OvsConntrackUpdateExpiration(struct conn_tcp *conn,
210 long long now,
211 long long interval)
212 {
213 conn->up.expiration = now + interval;
214 }
215
216 static __inline struct conn_tcp*
217 OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn)
218 {
219 return CONTAINER_OF(conn, struct conn_tcp, up);
220 }
221
222 enum CT_UPDATE_RES
223 OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_,
224 const TCPHdr *tcp,
225 PNET_BUFFER_LIST nbl,
226 BOOLEAN reply,
227 UINT64 now)
228 {
229 struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
230 /* The peer that sent 'pkt' */
231 struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
232 /* The peer that should receive 'pkt' */
233 struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
234 uint8_t sws = 0, dws = 0;
235 uint16_t win = ntohs(tcp->window);
236 uint32_t ack, end, seq, orig_seq;
237 uint32_t p_len = OvsGetTcpPayloadLength(nbl);
238 int ackskew;
239
240 if (OvsConntrackValidateTcpFlags(tcp)) {
241 return CT_UPDATE_INVALID;
242 }
243
244 if ((tcp->syn) && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&
245 src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
246 src->state = dst->state = CT_DPIF_TCPS_CLOSED;
247 return CT_UPDATE_NEW;
248 }
249
250 if (src->wscale & CT_WSCALE_FLAG
251 && dst->wscale & CT_WSCALE_FLAG
252 && !(tcp->syn)) {
253
254 sws = src->wscale & CT_WSCALE_MASK;
255 dws = dst->wscale & CT_WSCALE_MASK;
256
257 } else if (src->wscale & CT_WSCALE_UNKNOWN
258 && dst->wscale & CT_WSCALE_UNKNOWN
259 && !(tcp->syn)) {
260
261 sws = TCP_MAX_WSCALE;
262 dws = TCP_MAX_WSCALE;
263 }
264
265 /*
266 * Sequence tracking algorithm from Guido van Rooij's paper:
267 * http://www.madison-gurkha.com/publications/tcp_filtering/
268 * tcp_filtering.ps
269 */
270
271 orig_seq = seq = ntohl(tcp->seq);
272 if (src->state < CT_DPIF_TCPS_SYN_SENT) {
273 /* First packet from this end. Set its state */
274
275 ack = ntohl(tcp->ack);
276
277 end = seq + p_len;
278 if (tcp->syn) {
279 end++;
280 if (dst->wscale & CT_WSCALE_FLAG) {
281 src->wscale = OvsTcpGetWscale(tcp);
282 if (src->wscale & CT_WSCALE_FLAG) {
283 /* Remove scale factor from initial window */
284 sws = src->wscale & CT_WSCALE_MASK;
285 win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
286 dws = dst->wscale & CT_WSCALE_MASK;
287 } else {
288 /* fixup other window */
289 dst->max_win <<= dst->wscale &
290 CT_WSCALE_MASK;
291 /* in case of a retrans SYN|ACK */
292 dst->wscale = 0;
293 }
294 }
295 }
296 if (tcp->fin) {
297 end++;
298 }
299
300 src->seqlo = seq;
301 src->state = CT_DPIF_TCPS_SYN_SENT;
302 /*
303 * May need to slide the window (seqhi may have been set by
304 * the crappy stack check or if we picked up the connection
305 * after establishment)
306 */
307 if (src->seqhi == 1 ||
308 SEQ_GEQ(end + MAX(1, dst->max_win << dws),
309 src->seqhi)) {
310 src->seqhi = end + MAX(1, dst->max_win << dws);
311 }
312 if (win > src->max_win) {
313 src->max_win = win;
314 }
315
316 } else {
317 ack = ntohl(tcp->ack);
318 end = seq + p_len;
319 if (tcp->syn) {
320 end++;
321 }
322 if (tcp->fin) {
323 end++;
324 }
325 }
326
327 if ((tcp->ack) == 0) {
328 /* Let it pass through the ack skew check */
329 ack = dst->seqlo;
330 } else if ((ack == 0
331 && (tcp->ack && tcp->rst) == (TCP_ACK|TCP_RST))
332 /* broken tcp stacks do not set ack */) {
333 /* Many stacks (ours included) will set the ACK number in an
334 * FIN|ACK if the SYN times out -- no sequence to ACK. */
335 ack = dst->seqlo;
336 }
337
338 if (seq == end) {
339 /* Ease sequencing restrictions on no data packets */
340 seq = src->seqlo;
341 end = seq;
342 }
343
344 ackskew = dst->seqlo - ack;
345 #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
346 if (SEQ_GEQ(src->seqhi, end)
347 /* Last octet inside other's window space */
348 && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
349 /* Retrans: not more than one window back */
350 && (ackskew >= -MAXACKWINDOW)
351 /* Acking not more than one reassembled fragment backwards */
352 && (ackskew <= (MAXACKWINDOW << sws))
353 /* Acking not more than one window forward */
354 && ((tcp->rst) == 0 || orig_seq == src->seqlo
355 || (orig_seq == src->seqlo + 1)
356 || (orig_seq + 1 == src->seqlo))) {
357 /* Require an exact/+1 sequence match on resets when possible */
358
359 /* update max window */
360 if (src->max_win < win) {
361 src->max_win = win;
362 }
363 /* synchronize sequencing */
364 if (SEQ_GT(end, src->seqlo)) {
365 src->seqlo = end;
366 }
367 /* slide the window of what the other end can send */
368 if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
369 dst->seqhi = ack + MAX((win << sws), 1);
370 }
371
372 /* update states */
373 if (tcp->syn && src->state < CT_DPIF_TCPS_SYN_SENT) {
374 src->state = CT_DPIF_TCPS_SYN_SENT;
375 }
376 if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) {
377 src->state = CT_DPIF_TCPS_CLOSING;
378 }
379 if (tcp->ack) {
380 if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
381 dst->state = CT_DPIF_TCPS_ESTABLISHED;
382 } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
383 dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
384 }
385 }
386 if (tcp->rst) {
387 src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
388 }
389
390 if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
391 && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
392 OvsConntrackUpdateExpiration(conn, now, 30 * CT_INTERVAL_SEC);
393 } else if (src->state >= CT_DPIF_TCPS_CLOSING
394 && dst->state >= CT_DPIF_TCPS_CLOSING) {
395 OvsConntrackUpdateExpiration(conn, now, 45 * CT_INTERVAL_SEC);
396 } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
397 || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
398 OvsConntrackUpdateExpiration(conn, now, 30 * CT_INTERVAL_SEC);
399 } else if (src->state >= CT_DPIF_TCPS_CLOSING
400 || dst->state >= CT_DPIF_TCPS_CLOSING) {
401 OvsConntrackUpdateExpiration(conn, now, 15 * 60 * CT_INTERVAL_SEC);
402 } else {
403 OvsConntrackUpdateExpiration(conn, now, 24 * 60 * 60 * CT_INTERVAL_SEC);
404 }
405 } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
406 || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
407 || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
408 && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
409 /* Within a window forward of the originating packet */
410 && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
411 /* Within a window backward of the originating packet */
412
413 /*
414 * This currently handles three situations:
415 * 1) Stupid stacks will shotgun SYNs before their peer
416 * replies.
417 * 2) When PF catches an already established stream (the
418 * firewall rebooted, the state table was flushed, routes
419 * changed...)
420 * 3) Packets get funky immediately after the connection
421 * closes (this should catch Solaris spurious ACK|FINs
422 * that web servers like to spew after a close)
423 *
424 * This must be a little more careful than the above code
425 * since packet floods will also be caught here. We don't
426 * update the TTL here to mitigate the damage of a packet
427 * flood and so the same code can handle awkward establishment
428 * and a loosened connection close.
429 * In the establishment case, a correct peer response will
430 * validate the connection, go through the normal state code
431 * and keep updating the state TTL.
432 */
433
434 /* update max window */
435 if (src->max_win < win) {
436 src->max_win = win;
437 }
438 /* synchronize sequencing */
439 if (SEQ_GT(end, src->seqlo)) {
440 src->seqlo = end;
441 }
442 /* slide the window of what the other end can send */
443 if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
444 dst->seqhi = ack + MAX((win << sws), 1);
445 }
446
447 /*
448 * Cannot set dst->seqhi here since this could be a shotgunned
449 * SYN and not an already established connection.
450 */
451
452 if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) {
453 src->state = CT_DPIF_TCPS_CLOSING;
454 }
455
456 if (tcp->rst) {
457 src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
458 }
459 } else {
460 return CT_UPDATE_INVALID;
461 }
462
463 return CT_UPDATE_VALID;
464 }
465
466 BOOLEAN
467 OvsConntrackValidateTcpPacket(const TCPHdr *tcp)
468 {
469 if (tcp == NULL || OvsConntrackValidateTcpFlags(tcp)) {
470 return FALSE;
471 }
472
473 /* A syn+ack is not allowed to create a connection. We want to allow
474 * totally new connections (syn) or already established, not partially
475 * open (syn+ack). */
476 if ((tcp->syn) && (tcp->ack)) {
477 return FALSE;
478 }
479
480 return TRUE;
481 }
482
483 OVS_CT_ENTRY *
484 OvsConntrackCreateTcpEntry(const TCPHdr *tcp,
485 PNET_BUFFER_LIST nbl,
486 UINT64 now)
487 {
488 struct conn_tcp* newconn = NULL;
489 struct tcp_peer *src, *dst;
490
491 newconn = OvsAllocateMemoryWithTag(sizeof(struct conn_tcp),
492 OVS_CT_POOL_TAG);
493 newconn->up = (OVS_CT_ENTRY) {0};
494 src = &newconn->peer[0];
495 dst = &newconn->peer[1];
496
497 src->seqlo = ntohl(tcp->seq);
498 src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1;
499
500 if (tcp->syn) {
501 src->seqhi++;
502 src->wscale = OvsTcpGetWscale(tcp);
503 } else {
504 src->wscale = CT_WSCALE_UNKNOWN;
505 dst->wscale = CT_WSCALE_UNKNOWN;
506 }
507 src->max_win = MAX(ntohs(tcp->window), 1);
508 if (src->wscale & CT_WSCALE_MASK) {
509 /* Remove scale factor from initial window */
510 uint8_t sws = src->wscale & CT_WSCALE_MASK;
511 src->max_win = DIV_ROUND_UP((uint32_t) src->max_win,
512 1 << sws);
513 }
514 if (tcp->fin) {
515 src->seqhi++;
516 }
517 dst->seqhi = 1;
518 dst->max_win = 1;
519 src->state = CT_DPIF_TCPS_SYN_SENT;
520 dst->state = CT_DPIF_TCPS_CLOSED;
521
522 OvsConntrackUpdateExpiration(newconn, now, CT_ENTRY_TIMEOUT);
523
524 return &newconn->up;
525 }